[llvm] cad9de7 - [AMDGPU] gfx940 MAI hazard recognizer

Stanislav Mekhanoshin via llvm-commits llvm-commits at lists.llvm.org
Thu Mar 24 13:00:07 PDT 2022


Author: Stanislav Mekhanoshin
Date: 2022-03-24T12:59:52-07:00
New Revision: cad9de71d7119e2b8ece14c8243a77bb0ef59a16

URL: https://github.com/llvm/llvm-project/commit/cad9de71d7119e2b8ece14c8243a77bb0ef59a16
DIFF: https://github.com/llvm/llvm-project/commit/cad9de71d7119e2b8ece14c8243a77bb0ef59a16.diff

LOG: [AMDGPU] gfx940 MAI hazard recognizer

Differential Revision: https://reviews.llvm.org/D122263

Added: 
    llvm/test/CodeGen/AMDGPU/mai-hazards-gfx940.mir

Modified: 
    llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
    llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
    llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
    llvm/lib/Target/AMDGPU/VOP3PInstructions.td
    llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.ll

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
index 69ec39f4fdfa8..b7c52fec9a546 100644
--- a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
@@ -133,7 +133,10 @@ static bool isXDL(const GCNSubtarget &ST, const MachineInstr &MI) {
       Opcode == AMDGPU::V_ACCVGPR_READ_B32_e64)
     return false;
 
-  return true;
+  if (!ST.hasGFX940Insts())
+    return true;
+
+  return AMDGPU::getMAIIsGFX940XDL(Opcode);
 }
 
 static bool isSendMsgTraceDataOrGDS(const SIInstrInfo &TII,
@@ -1494,6 +1497,13 @@ int GCNHazardRecognizer::checkMAIHazards90A(MachineInstr *MI) {
   for (const MachineOperand &Use : MI->explicit_uses()) {
     const int LegacyVALUNotDotWritesVGPRWaitStates = 2;
     const int SMFMA4x4WritesVGPROverlappedSMFMASrcCWaitStates = 2;
+    const int GFX940_XDL2PassWritesVGPROverlappedSMFMASrcCWaitStates = 3;
+    const int GFX940_XDL4PassWritesVGPROverlappedSMFMASrcCWaitStates = 5;
+    const int GFX940_SMFMA4PassWritesVGPROverlappedSMFMASrcCWaitStates = 4;
+    const int GFX940_XDL8PassWritesVGPROverlappedSMFMASrcCWaitStates = 9;
+    const int GFX940_SMFMA8PassWritesVGPROverlappedSMFMASrcCWaitStates = 8;
+    const int GFX940_XDL16PassWritesVGPROverlappedSMFMASrcCWaitStates = 17;
+    const int GFX940_SMFMA16PassWritesVGPROverlappedSMFMASrcCWaitStates = 16;
     const int SMFMA16x16WritesVGPROverlappedSMFMASrcCWaitStates = 8;
     const int SMFMA32x32WritesVGPROverlappedSMFMASrcCWaitStates = 16;
     const int SMFMA4x4WritesVGPROverlappedDMFMASrcCWaitStates = 3;
@@ -1504,9 +1514,18 @@ int GCNHazardRecognizer::checkMAIHazards90A(MachineInstr *MI) {
     const int SMFMA4x4WritesVGPROverlappedSrcABWaitStates = 5;
     const int SMFMA16x16WritesVGPROverlappedSrcABWaitStates = 11;
     const int SMFMA32x32WritesVGPROverlappedSrcABWaitStates = 19;
+    const int GFX940_SMFMA2PassWritesVGPROverlappedSrcABWaitStates = 4;
+    const int GFX940_SMFMA4PassWritesVGPROverlappedSrcABWaitStates = 6;
+    const int GFX940_SMFMA8PassWritesVGPROverlappedSrcABWaitStates = 10;
+    const int GFX940_SMFMA16PassWritesVGPROverlappedSrcABWaitStates = 18;
+    const int GFX940_XDL2PassWritesVGPROverlappedSrcABWaitStates = 5;
+    const int GFX940_XDL4PassWritesVGPROverlappedSrcABWaitStates = 7;
+    const int GFX940_XDL8PassWritesVGPROverlappedSrcABWaitStates = 11;
+    const int GFX940_XDL16PassWritesVGPROverlappedSrcABWaitStates = 19;
     const int DMFMA4x4WritesVGPROverlappedMFMASrcABWaitStates = 6;
     const int DMFMA16x16WritesVGPROverlappedMFMASrcABWaitStates = 11;
     const int DMFMA4x4WritesVGPRFullSrcCWaitStates = 4;
+    const int GFX940_SMFMA4x4WritesVGPRFullSrcCWaitStates = 2;
     const int MaxWaitStates = 19;
 
     if (!Use.isReg())
@@ -1538,7 +1557,7 @@ int GCNHazardRecognizer::checkMAIHazards90A(MachineInstr *MI) {
     unsigned Opc1 = MI1->getOpcode();
     int NeedWaitStates = 0;
     if (OpNo == SrcCIdx) {
-      if (!isDGEMM(Opc) && isDGEMM(Opc1)) {
+      if (!isDGEMM(Opc) && (!ST.hasGFX940Insts() && isDGEMM(Opc1))) {
         NeedWaitStates = 0;
       } else if (FullReg) {
         if ((Opc == AMDGPU::V_MFMA_F64_4X4X4F64_e64 ||
@@ -1546,6 +1565,9 @@ int GCNHazardRecognizer::checkMAIHazards90A(MachineInstr *MI) {
             (Opc1 == AMDGPU::V_MFMA_F64_4X4X4F64_e64 ||
              Opc1 == AMDGPU::V_MFMA_F64_4X4X4F64_vgprcd_e64))
           NeedWaitStates = DMFMA4x4WritesVGPRFullSrcCWaitStates;
+        else if (ST.hasGFX940Insts() &&
+                 TSchedModel.computeInstrLatency(MI1) == 2)
+          NeedWaitStates = GFX940_SMFMA4x4WritesVGPRFullSrcCWaitStates;
       } else {
         switch (Opc1) {
         case AMDGPU::V_MFMA_F64_16X16X4F64_e64:
@@ -1561,22 +1583,42 @@ int GCNHazardRecognizer::checkMAIHazards90A(MachineInstr *MI) {
             NeedWaitStates = DMFMA4x4WritesVGPROverlappedSrcCWaitStates;
           break;
         default:
+          if (ST.hasGFX940Insts() && isXDL(ST, *MI) && !isXDL(ST, *MI1))
+            break;
           switch (TSchedModel.computeInstrLatency(MI1)) {
           case 2:
-            NeedWaitStates = isDGEMM(Opc)
-              ? SMFMA4x4WritesVGPROverlappedDMFMASrcCWaitStates
-              : SMFMA4x4WritesVGPROverlappedSMFMASrcCWaitStates;
+            NeedWaitStates = ST.hasGFX940Insts()
+              ? isXDL(ST, *MI1)
+                ? GFX940_XDL2PassWritesVGPROverlappedSMFMASrcCWaitStates
+                : SMFMA4x4WritesVGPROverlappedSMFMASrcCWaitStates
+              : isDGEMM(Opc)
+                ? SMFMA4x4WritesVGPROverlappedDMFMASrcCWaitStates
+                : SMFMA4x4WritesVGPROverlappedSMFMASrcCWaitStates;
+            break;
+          case 4:
+            assert(ST.hasGFX940Insts());
+            NeedWaitStates = isXDL(ST, *MI1)
+              ? GFX940_XDL4PassWritesVGPROverlappedSMFMASrcCWaitStates
+              : GFX940_SMFMA4PassWritesVGPROverlappedSMFMASrcCWaitStates;
             break;
           case 8:
-            NeedWaitStates = isDGEMM(Opc)
-              ? SMFMA16x16WritesVGPROverlappedDMFMASrcCWaitStates
-              : SMFMA16x16WritesVGPROverlappedSMFMASrcCWaitStates;
+            NeedWaitStates = ST.hasGFX940Insts()
+              ? isXDL(ST, *MI1)
+                ? GFX940_XDL8PassWritesVGPROverlappedSMFMASrcCWaitStates
+                : GFX940_SMFMA8PassWritesVGPROverlappedSMFMASrcCWaitStates
+              : isDGEMM(Opc)
+                ? SMFMA16x16WritesVGPROverlappedDMFMASrcCWaitStates
+                : SMFMA16x16WritesVGPROverlappedSMFMASrcCWaitStates;
             break;
           case 16: LLVM_FALLTHROUGH;
           default:
-            NeedWaitStates = isDGEMM(Opc)
-              ? SMFMA32x32WritesVGPROverlappedDMFMASrcCWaitStates
-              : SMFMA32x32WritesVGPROverlappedSMFMASrcCWaitStates;
+            NeedWaitStates = ST.hasGFX940Insts()
+              ? isXDL(ST, *MI1)
+                ? GFX940_XDL16PassWritesVGPROverlappedSMFMASrcCWaitStates
+                : GFX940_SMFMA16PassWritesVGPROverlappedSMFMASrcCWaitStates
+              : isDGEMM(Opc)
+                ? SMFMA32x32WritesVGPROverlappedDMFMASrcCWaitStates
+                : SMFMA32x32WritesVGPROverlappedSMFMASrcCWaitStates;
           }
         }
       }
@@ -1595,14 +1637,32 @@ int GCNHazardRecognizer::checkMAIHazards90A(MachineInstr *MI) {
       default:
         switch (TSchedModel.computeInstrLatency(MI1)) {
         case 2:
-          NeedWaitStates = SMFMA4x4WritesVGPROverlappedSrcABWaitStates;
+          NeedWaitStates = ST.hasGFX940Insts()
+            ? isXDL(ST, *MI1)
+              ? GFX940_XDL2PassWritesVGPROverlappedSrcABWaitStates
+              : GFX940_SMFMA2PassWritesVGPROverlappedSrcABWaitStates
+            : SMFMA4x4WritesVGPROverlappedSrcABWaitStates;
+          break;
+        case 4:
+          assert(ST.hasGFX940Insts());
+          NeedWaitStates = isXDL(ST, *MI1)
+            ? GFX940_XDL4PassWritesVGPROverlappedSrcABWaitStates
+            : GFX940_SMFMA4PassWritesVGPROverlappedSrcABWaitStates;
           break;
         case 8:
-          NeedWaitStates = SMFMA16x16WritesVGPROverlappedSrcABWaitStates;
+          NeedWaitStates = ST.hasGFX940Insts()
+            ? isXDL(ST, *MI1)
+              ? GFX940_XDL8PassWritesVGPROverlappedSrcABWaitStates
+              : GFX940_SMFMA8PassWritesVGPROverlappedSrcABWaitStates
+            : SMFMA16x16WritesVGPROverlappedSrcABWaitStates;
           break;
         case 16: LLVM_FALLTHROUGH;
         default:
-          NeedWaitStates = SMFMA32x32WritesVGPROverlappedSrcABWaitStates;
+          NeedWaitStates = ST.hasGFX940Insts()
+            ? isXDL(ST, *MI1)
+              ? GFX940_XDL16PassWritesVGPROverlappedSrcABWaitStates
+              : GFX940_SMFMA16PassWritesVGPROverlappedSrcABWaitStates
+            : SMFMA32x32WritesVGPROverlappedSrcABWaitStates;
         }
       }
     }
@@ -1717,6 +1777,14 @@ int GCNHazardRecognizer::checkMAIVALUHazards(MachineInstr *MI) {
     const int SMFMA4x4WriteVgprVALUMemExpReadWaitStates = 5;
     const int SMFMA16x16WriteVgprVALUMemExpReadWaitStates = 11;
     const int SMFMA32x32WriteVgprVALUMemExpReadWaitStates = 19;
+    const int GFX940_SMFMA2PassWriteVgprVALUMemExpReadWaitStates = 4;
+    const int GFX940_SMFMA4PassWriteVgprVALUMemExpReadWaitStates = 6;
+    const int GFX940_SMFMA8PassWriteVgprVALUMemExpReadWaitStates = 10;
+    const int GFX940_SMFMA16PassWriteVgprVALUMemExpReadWaitStates = 18;
+    const int GFX940_XDL2PassWriteVgprVALUMemExpReadWaitStates = 5;
+    const int GFX940_XDL4PassWriteVgprVALUMemExpReadWaitStates = 7;
+    const int GFX940_XDL8PassWriteVgprVALUMemExpReadWaitStates = 11;
+    const int GFX940_XDL16PassWriteVgprVALUMemExpReadWaitStates = 19;
     const int DMFMA4x4WriteVgprMemExpReadWaitStates = 9;
     const int DMFMA16x16WriteVgprMemExpReadWaitStates = 18;
     const int DMFMA4x4WriteVgprVALUReadWaitStates = 6;
@@ -1756,16 +1824,30 @@ int GCNHazardRecognizer::checkMAIVALUHazards(MachineInstr *MI) {
       int NeedWaitStates = MaxWaitStates;
       switch (HazardDefLatency) {
       case 2:
-        NeedWaitStates = SMFMA4x4WriteVgprVALUMemExpReadWaitStates;
+        NeedWaitStates =
+          ST.hasGFX940Insts()
+            ? isXDL(ST, *MFMA)
+              ? GFX940_XDL2PassWriteVgprVALUMemExpReadWaitStates
+              : GFX940_SMFMA2PassWriteVgprVALUMemExpReadWaitStates
+            : SMFMA4x4WriteVgprVALUMemExpReadWaitStates;
         break;
       case 4:
         assert(isDGEMM(MFMA->getOpcode()) || ST.hasGFX940Insts());
         NeedWaitStates =
-            IsMemOrExport ? DMFMA4x4WriteVgprMemExpReadWaitStates
-                          : DMFMA4x4WriteVgprVALUReadWaitStates;
+          isDGEMM(MFMA->getOpcode())
+            ? IsMemOrExport ? DMFMA4x4WriteVgprMemExpReadWaitStates
+                            : DMFMA4x4WriteVgprVALUReadWaitStates
+            : isXDL(ST, *MFMA)
+              ? GFX940_XDL4PassWriteVgprVALUMemExpReadWaitStates
+              : GFX940_SMFMA4PassWriteVgprVALUMemExpReadWaitStates;
         break;
       case 8:
-        NeedWaitStates = SMFMA16x16WriteVgprVALUMemExpReadWaitStates;
+        NeedWaitStates =
+          ST.hasGFX940Insts()
+            ? isXDL(ST, *MFMA)
+              ? GFX940_XDL8PassWriteVgprVALUMemExpReadWaitStates
+              : GFX940_SMFMA8PassWriteVgprVALUMemExpReadWaitStates
+            : SMFMA16x16WriteVgprVALUMemExpReadWaitStates;
         break;
       case 16: LLVM_FALLTHROUGH;
       default:
@@ -1773,7 +1855,11 @@ int GCNHazardRecognizer::checkMAIVALUHazards(MachineInstr *MI) {
           isDGEMM(MFMA->getOpcode())
             ? IsMemOrExport ? DMFMA16x16WriteVgprMemExpReadWaitStates
                             : DMFMA16x16WriteVgprVALUReadWaitStates
-            : SMFMA32x32WriteVgprVALUMemExpReadWaitStates;
+            : ST.hasGFX940Insts()
+              ? isXDL(ST, *MFMA)
+                ? GFX940_XDL16PassWriteVgprVALUMemExpReadWaitStates
+                : GFX940_SMFMA16PassWriteVgprVALUMemExpReadWaitStates
+              : SMFMA32x32WriteVgprVALUMemExpReadWaitStates;
         break;
       }
 
@@ -1803,7 +1889,16 @@ int GCNHazardRecognizer::checkMAIVALUHazards(MachineInstr *MI) {
     const int SMFMA4x4WriteVgprVALUWawWaitStates = 5;
     const int SMFMA16x16WriteVgprVALUWawWaitStates = 11;
     const int SMFMA32x32WriteVgprVALUWawWaitStates = 19;
+    const int GFX940_SMFMA2PassWriteVgprVALUWawWaitStates = 4;
+    const int GFX940_SMFMA4PassWriteVgprVALUWawWaitStates = 6;
+    const int GFX940_SMFMA8PassWriteVgprVALUWawWaitStates = 10;
+    const int GFX940_SMFMA16PassWriteVgprVALUWawWaitStates = 18;
+    const int GFX940_XDL2PassWriteVgprVALUWawWaitStates = 5;
+    const int GFX940_XDL4PassWriteVgprVALUWawWaitStates = 7;
+    const int GFX940_XDL8PassWriteVgprVALUWawWaitStates = 11;
+    const int GFX940_XDL16PassWriteVgprVALUWawWaitStates = 19;
     const int SMFMA4x4ReadVgprVALUWarWaitStates = 1;
+    const int GFX940_XDL4PassReadVgprVALUWarWaitStates = 3;
     const int SMFMA16x16ReadVgprVALUWarWaitStates = 7;
     const int SMFMA32x32ReadVgprVALUWarWaitStates = 15;
     const int DMFMA4x4WriteVgprVALUWriteWaitStates = 6;
@@ -1828,19 +1923,35 @@ int GCNHazardRecognizer::checkMAIVALUHazards(MachineInstr *MI) {
       int NeedWaitStates = MaxWaitStates;
       switch (TSchedModel.computeInstrLatency(MFMA)) {
       case 2:
-        NeedWaitStates = SMFMA4x4WriteVgprVALUWawWaitStates;
+        NeedWaitStates = ST.hasGFX940Insts()
+          ? isXDL(ST, *MFMA)
+            ? GFX940_XDL2PassWriteVgprVALUWawWaitStates
+            : GFX940_SMFMA2PassWriteVgprVALUWawWaitStates
+          : SMFMA4x4WriteVgprVALUWawWaitStates;
         break;
       case 4:
-        assert(isDGEMM(MFMA->getOpcode()));
-        NeedWaitStates = DMFMA4x4WriteVgprVALUWriteWaitStates;
+        assert(isDGEMM(MFMA->getOpcode()) || ST.hasGFX940Insts());
+        NeedWaitStates = isDGEMM(MFMA->getOpcode())
+            ? DMFMA4x4WriteVgprVALUWriteWaitStates
+            : isXDL(ST, *MFMA)
+              ? GFX940_XDL4PassWriteVgprVALUWawWaitStates
+              : GFX940_SMFMA4PassWriteVgprVALUWawWaitStates;
         break;
       case 8:
-        NeedWaitStates = SMFMA16x16WriteVgprVALUWawWaitStates;
+        NeedWaitStates = ST.hasGFX940Insts()
+          ? isXDL(ST, *MFMA)
+            ? GFX940_XDL8PassWriteVgprVALUWawWaitStates
+            : GFX940_SMFMA8PassWriteVgprVALUWawWaitStates
+          : SMFMA16x16WriteVgprVALUWawWaitStates;
         break;
       case 16: LLVM_FALLTHROUGH;
       default:
         NeedWaitStates = isDGEMM(MFMA->getOpcode())
                    ? DMFMA16x16WriteVgprVALUWriteWaitStates
+                   : ST.hasGFX940Insts()
+                     ? isXDL(ST, *MFMA)
+                       ? GFX940_XDL16PassWriteVgprVALUWawWaitStates
+                       : GFX940_SMFMA16PassWriteVgprVALUWawWaitStates
                    : SMFMA32x32WriteVgprVALUWawWaitStates;
         break;
       }
@@ -1858,6 +1969,9 @@ int GCNHazardRecognizer::checkMAIVALUHazards(MachineInstr *MI) {
           !MI.readsRegister(Reg, &TRI))
         return false;
 
+      if (ST.hasGFX940Insts() && !isXDL(ST, MI))
+        return false;
+
       const MachineOperand *SrcC =
           TII.getNamedOperand(MI, AMDGPU::OpName::src2);
       assert(SrcC);
@@ -1879,6 +1993,9 @@ int GCNHazardRecognizer::checkMAIVALUHazards(MachineInstr *MI) {
     switch (HazardDefLatency) {
     case 2:  NeedWaitStates = SMFMA4x4ReadVgprVALUWarWaitStates;
              break;
+    case 4:  assert(ST.hasGFX940Insts());
+             NeedWaitStates = GFX940_XDL4PassReadVgprVALUWarWaitStates;
+             break;
     case 8:  NeedWaitStates = SMFMA16x16ReadVgprVALUWarWaitStates;
              break;
     case 16: LLVM_FALLTHROUGH;

diff  --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
index b2dc9e1528662..0095b8f5dd9db 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
@@ -164,6 +164,7 @@ unsigned getHostcallImplicitArgPosition() {
 #define GET_MIMGBiasMappingTable_IMPL
 #define GET_MIMGOffsetMappingTable_IMPL
 #define GET_MIMGG16MappingTable_IMPL
+#define GET_MAIInstInfoTable_IMPL
 #include "AMDGPUGenSearchableTables.inc"
 
 int getMIMGOpcode(unsigned BaseOpcode, unsigned MIMGEncoding,
@@ -342,6 +343,11 @@ bool getVOP3IsSingle(unsigned Opc) {
   return Info ? Info->IsSingle : false;
 }
 
+bool getMAIIsGFX940XDL(unsigned Opc) {
+  const MAIInstInfo *Info = getMAIInstInfoHelper(Opc);
+  return Info ? Info->is_gfx940_xdl : false;
+}
+
 // Wrapper for Tablegen'd function.  enum Subtarget is not defined in any
 // header files, so we need to wrap it in a function that takes unsigned
 // instead.

diff  --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
index b4d359eae9e30..e020c9cc9733f 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
@@ -68,12 +68,18 @@ struct GcnBufferFormatInfo {
   unsigned DataFormat;
 };
 
+struct MAIInstInfo {
+  uint16_t Opcode;
+  bool is_gfx940_xdl;
+};
+
 #define GET_MIMGBaseOpcode_DECL
 #define GET_MIMGDim_DECL
 #define GET_MIMGEncoding_DECL
 #define GET_MIMGLZMapping_DECL
 #define GET_MIMGMIPMapping_DECL
 #define GET_MIMGBiASMapping_DECL
+#define GET_MAIInstInfoTable_DECL
 #include "AMDGPUGenSearchableTables.inc"
 
 namespace IsaInfo {
@@ -444,6 +450,9 @@ bool getVOP2IsSingle(unsigned Opc);
 LLVM_READONLY
 bool getVOP3IsSingle(unsigned Opc);
 
+LLVM_READONLY
+bool getMAIIsGFX940XDL(unsigned Opc);
+
 LLVM_READONLY
 const GcnBufferFormatInfo *getGcnBufferFormatInfo(uint8_t BitsPerComp,
                                                   uint8_t NumComponents,

diff  --git a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
index f17f56d7c3bf4..66e02966e34f5 100644
--- a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
@@ -484,50 +484,59 @@ let isAsCheapAsAMove = 1, isReMaterializable = 1 in {
   } // End isMoveImm = 1
 } // End isAsCheapAsAMove = 1, isReMaterializable = 1
 
+class MAIInst<string OpName, VOPProfile P, SDPatternOperator node>
+  : VOP3InstBase<OpName, P, node> {
+  Instruction Opcode = !cast<Instruction>(NAME);
+  bit is_gfx940_xdl = 0;
+}
+
 multiclass MAIInst<string OpName, string P, SDPatternOperator node,
                    bit NoDstOverlap = !cast<VOPProfileMAI>("VOPProfileMAI_" # P).NoDstOverlap> {
   let isConvergent = 1, mayRaiseFPException = 0, ReadsModeReg = 1 in {
     // FP32 denorm mode is respected, rounding mode is not. Exceptions are not supported.
     let Constraints = !if(NoDstOverlap, "@earlyclobber $vdst", "") in {
-      defm "" : VOP3Inst<OpName, !cast<VOPProfileMAI>("VOPProfileMAI_" # P),
+      def _e64 : MAIInst<OpName, !cast<VOPProfileMAI>("VOPProfileMAI_" # P),
                          !if(NoDstOverlap, null_frag, AgprMAIFrag<node>)>,
-                MFMATable<0, NAME # "_e64">;
+                 MFMATable<0, NAME # "_e64">;
 
       let SubtargetPredicate = isGFX90APlus, Mnemonic = OpName in
-      defm _vgprcd : VOP3Inst<OpName # "_vgprcd", !cast<VOPProfileMAI>("VOPProfileMAI_" # P # "_VCD"),
-                              !if(NoDstOverlap, null_frag, VgprMAIFrag<node>)>,
-                     MFMATable<0, NAME # "_vgprcd_e64">;
+      def _vgprcd_e64 : MAIInst<OpName # "_vgprcd", !cast<VOPProfileMAI>("VOPProfileMAI_" # P # "_VCD"),
+                                !if(NoDstOverlap, null_frag, VgprMAIFrag<node>)>,
+                        MFMATable<0, NAME # "_vgprcd_e64">;
     }
 
     foreach _ = BoolToList<NoDstOverlap>.ret in {
       let Constraints = !if(NoDstOverlap, "$vdst = $src2", ""),
           isConvertibleToThreeAddress = NoDstOverlap,
           Mnemonic = OpName in {
-        defm "_mac" : VOP3Inst<OpName # "_mac", !cast<VOPProfileMAI>("VOPProfileMAI_" # P), AgprMAIFrag<node>>,
-                      MFMATable<1, NAME # "_e64">;
+        def "_mac_e64" : MAIInst<OpName # "_mac", !cast<VOPProfileMAI>("VOPProfileMAI_" # P), AgprMAIFrag<node>>,
+                         MFMATable<1, NAME # "_e64">;
 
         let SubtargetPredicate = isGFX90APlus in
-        defm _mac_vgprcd : VOP3Inst<OpName # "_mac_vgprcd", !cast<VOPProfileMAI>("VOPProfileMAI_" # P # "_VCD"),
-                                    VgprMAIFrag<node>>,
-                           MFMATable<1, NAME # "_vgprcd_e64">;
+        def _mac_vgprcd_e64 : MAIInst<OpName # "_mac_vgprcd", !cast<VOPProfileMAI>("VOPProfileMAI_" # P # "_VCD"),
+                                      VgprMAIFrag<node>>,
+                              MFMATable<1, NAME # "_vgprcd_e64">;
       }
     }
   } // End isConvergent = 1, mayRaiseFPException = 0, ReadsModeReg = 1
 }
 
 defm V_MFMA_F32_4X4X1F32    : MAIInst<"v_mfma_f32_4x4x1f32",    "F32_F32_X4",    int_amdgcn_mfma_f32_4x4x1f32>;
-defm V_MFMA_F32_4X4X4F16    : MAIInst<"v_mfma_f32_4x4x4f16",    "F32_V4F16_X4",  int_amdgcn_mfma_f32_4x4x4f16>;
-defm V_MFMA_I32_4X4X4I8     : MAIInst<"v_mfma_i32_4x4x4i8",     "I32_I32_X4",    int_amdgcn_mfma_i32_4x4x4i8>;
 defm V_MFMA_F32_16X16X1F32  : MAIInst<"v_mfma_f32_16x16x1f32",  "F32_F32_X16",   int_amdgcn_mfma_f32_16x16x1f32>;
 defm V_MFMA_F32_16X16X4F32  : MAIInst<"v_mfma_f32_16x16x4f32",  "F32_F32_X4",    int_amdgcn_mfma_f32_16x16x4f32>;
+defm V_MFMA_F32_32X32X1F32  : MAIInst<"v_mfma_f32_32x32x1f32",  "F32_F32_X32",   int_amdgcn_mfma_f32_32x32x1f32>;
+defm V_MFMA_F32_32X32X2F32  : MAIInst<"v_mfma_f32_32x32x2f32",  "F32_F32_X16",   int_amdgcn_mfma_f32_32x32x2f32>;
+
+let is_gfx940_xdl = 1 in {
+defm V_MFMA_F32_4X4X4F16    : MAIInst<"v_mfma_f32_4x4x4f16",    "F32_V4F16_X4",  int_amdgcn_mfma_f32_4x4x4f16>;
+defm V_MFMA_I32_4X4X4I8     : MAIInst<"v_mfma_i32_4x4x4i8",     "I32_I32_X4",    int_amdgcn_mfma_i32_4x4x4i8>;
 defm V_MFMA_F32_16X16X4F16  : MAIInst<"v_mfma_f32_16x16x4f16",  "F32_V4F16_X16", int_amdgcn_mfma_f32_16x16x4f16>;
 defm V_MFMA_F32_16X16X16F16 : MAIInst<"v_mfma_f32_16x16x16f16", "F32_V4F16_X4",  int_amdgcn_mfma_f32_16x16x16f16>;
 defm V_MFMA_I32_16X16X4I8   : MAIInst<"v_mfma_i32_16x16x4i8",   "I32_I32_X16",   int_amdgcn_mfma_i32_16x16x4i8>;
-defm V_MFMA_F32_32X32X1F32  : MAIInst<"v_mfma_f32_32x32x1f32",  "F32_F32_X32",   int_amdgcn_mfma_f32_32x32x1f32>;
-defm V_MFMA_F32_32X32X2F32  : MAIInst<"v_mfma_f32_32x32x2f32",  "F32_F32_X16",   int_amdgcn_mfma_f32_32x32x2f32>;
 defm V_MFMA_F32_32X32X4F16  : MAIInst<"v_mfma_f32_32x32x4f16",  "F32_V4F16_X32", int_amdgcn_mfma_f32_32x32x4f16>;
 defm V_MFMA_F32_32X32X8F16  : MAIInst<"v_mfma_f32_32x32x8f16",  "F32_V4F16_X16", int_amdgcn_mfma_f32_32x32x8f16>;
 defm V_MFMA_I32_32X32X4I8   : MAIInst<"v_mfma_i32_32x32x4i8",   "I32_I32_X32",   int_amdgcn_mfma_i32_32x32x4i8>;
+}
 
 let Predicates = [isGFX908orGFX90A] in {
 defm V_MFMA_I32_16X16X16I8  : MAIInst<"v_mfma_i32_16x16x16i8",  "I32_I32_X4",    int_amdgcn_mfma_i32_16x16x16i8>;
@@ -542,27 +551,29 @@ defm V_MFMA_F32_32X32X4BF16 : MAIInst<"v_mfma_f32_32x32x4bf16", "F32_V2I16_X16",
 } // End SubtargetPredicate = HasMAIInsts
 
 let Predicates = [isGFX90APlus] in {
+  let is_gfx940_xdl = 1 in {
   defm V_MFMA_F32_32X32X4BF16_1K  : MAIInst<"v_mfma_f32_32x32x4bf16_1k",  "F32_V4I16_X32",  int_amdgcn_mfma_f32_32x32x4bf16_1k>;
   defm V_MFMA_F32_16X16X4BF16_1K  : MAIInst<"v_mfma_f32_16x16x4bf16_1k",  "F32_V4I16_X16",  int_amdgcn_mfma_f32_16x16x4bf16_1k>;
   defm V_MFMA_F32_4X4X4BF16_1K    : MAIInst<"v_mfma_f32_4x4x4bf16_1k",    "F32_V4I16_X4",   int_amdgcn_mfma_f32_4x4x4bf16_1k>;
   defm V_MFMA_F32_32X32X8BF16_1K  : MAIInst<"v_mfma_f32_32x32x8bf16_1k",  "F32_V4I16_X16",  int_amdgcn_mfma_f32_32x32x8bf16_1k>;
   defm V_MFMA_F32_16X16X16BF16_1K : MAIInst<"v_mfma_f32_16x16x16bf16_1k", "F32_V4I16_X4",   int_amdgcn_mfma_f32_16x16x16bf16_1k>;
+  }
 
   defm V_MFMA_F64_16X16X4F64      : MAIInst<"v_mfma_f64_16x16x4f64",      "F64_16X16X4F64", int_amdgcn_mfma_f64_16x16x4f64>;
   defm V_MFMA_F64_4X4X4F64        : MAIInst<"v_mfma_f64_4x4x4f64",        "F64_4X4X4F64",   int_amdgcn_mfma_f64_4x4x4f64>;
 } // End Predicates = [isGFX90APlus]
 
-let Predicates = [isGFX940Plus] in {
+let Predicates = [isGFX940Plus], is_gfx940_xdl = 1 in {
   defm V_MFMA_I32_32X32X16I8       : MAIInst<"v_mfma_i32_32x32x16i8",       "I32_I64_X32",    int_amdgcn_mfma_i32_32x32x16_i8>;
   defm V_MFMA_I32_16X16X32I8       : MAIInst<"v_mfma_i32_16x16x32i8",       "I32_I64_X16",    int_amdgcn_mfma_i32_16x16x32_i8>;
   defm V_MFMA_F32_16X16X8XF32      : MAIInst<"v_mfma_f32_16x16x8xf32",      "F32_V2F32_X16",  int_amdgcn_mfma_f32_16x16x8_xf32>;
   defm V_MFMA_F32_32X32X4XF32      : MAIInst<"v_mfma_f32_32x32x4xf32",      "F32_V2F32_X32",  int_amdgcn_mfma_f32_32x32x4_xf32>;
-} // End Predicates = [isGFX940Plus]
+} // End Predicates = [isGFX940Plus], is_gfx940_xdl = 1
 
 multiclass SMFMACInst<string OpName, string P, SDPatternOperator node> {
   let Constraints = "$vdst = $src2", DisableEncoding = "$src2",
-      isConvergent = 1, mayRaiseFPException = 0, ReadsModeReg = 1 in {
-    defm NAME : VOP3Inst<OpName, !cast<VOPProfileSMFMAC>("VOPProfileSMFMAC_" # P), node>;
+      isConvergent = 1, mayRaiseFPException = 0, ReadsModeReg = 1, is_gfx940_xdl = 1 in {
+    def _e64 : MAIInst<OpName, !cast<VOPProfileSMFMAC>("VOPProfileSMFMAC_" # P), node>;
   }
 }
 
@@ -575,6 +586,17 @@ defm V_SMFMAC_I32_16X16X64_I8      : SMFMACInst<"v_smfmac_i32_16x16x64_i8",
 defm V_SMFMAC_I32_32X32X32_I8      : SMFMACInst<"v_smfmac_i32_32x32x32_i8",      "I32_32X32X32_I8",  int_amdgcn_smfmac_i32_32x32x32_i8>;
 }
 
+def MAIInstInfoTable : GenericTable {
+  let FilterClass = "MAIInst";
+  let CppTypeName = "MAIInstInfo";
+  let Fields = [
+    "Opcode", "is_gfx940_xdl"
+  ];
+
+  let PrimaryKey = ["Opcode"];
+  let PrimaryKeyName = "getMAIInstInfoHelper";
+}
+
 let SubtargetPredicate = HasPackedFP32Ops, isCommutable = 1 in {
   defm V_PK_FMA_F32 : VOP3PInst<"v_pk_fma_f32", VOP3_Profile<VOP_V2F32_V2F32_V2F32_V2F32, VOP3_PACKED>, any_fma>;
   defm V_PK_MUL_F32 : VOP3PInst<"v_pk_mul_f32", VOP3_Profile<VOP_V2F32_V2F32_V2F32, VOP3_PACKED>, any_fmul>;

diff  --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.ll
index a943e5f414551..71cc85a24aa35 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.ll
@@ -390,6 +390,7 @@ bb:
 ; GFX908_A:      v_mfma_f32_4x4x1f32 [[MAI1:a\[[0-9]+:[0-9]+\]]], v{{[0-9]+}}, v{{[0-9]+}}, a[{{[0-9]+:[0-9]+}}]
 ; GFX908_A-NEXT: v_mfma_f32_4x4x1f32 a[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, v{{[0-9]+}}, [[MAI1]]
 ; GFX940:        v_mfma_f32_4x4x1_16b_f32 [[MAI1:a\[[0-9]+:[0-9]+\]]], v{{[0-9]+}}, v{{[0-9]+}}, a[{{[0-9]+:[0-9]+}}]
+; GFX940-NEXT:   s_nop 1
 ; GFX940-NEXT:   v_mfma_f32_4x4x1_16b_f32 a[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, v{{[0-9]+}}, [[MAI1]]
 define amdgpu_kernel void @test_mfma_f32_4x4x1f32_forward_acc(<4 x float> addrspace(1)* %arg) #0 {
 bb:

diff  --git a/llvm/test/CodeGen/AMDGPU/mai-hazards-gfx940.mir b/llvm/test/CodeGen/AMDGPU/mai-hazards-gfx940.mir
new file mode 100644
index 0000000000000..d335cf73f510a
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/mai-hazards-gfx940.mir
@@ -0,0 +1,2018 @@
+# RUN: llc -march=amdgcn -mcpu=gfx940 -verify-machineinstrs -run-pass post-RA-hazard-rec %s -o - | FileCheck -check-prefix=GCN %s
+
+# GCN-LABEL: name: valu_write_vgpr_sgemm_mfma_read
+# GCN:      V_MOV_B32
+# GCN:      V_MOV_B32
+# GCN-NEXT: S_NOP 1
+# GCN-NEXT: V_MFMA
+name:            valu_write_vgpr_sgemm_mfma_read
+body:             |
+  bb.0:
+    $vgpr0 = V_MOV_B32_e32 1, implicit $exec
+    $vgpr1 = V_MOV_B32_e32 1, implicit $exec
+    $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec
+...
+# GCN-LABEL: name: valu_write_agpr_sgemm_mfma_read
+# GCN:      V_ACCVGPR_WRITE_B32_e64
+# GCN-NEXT: S_NOP 1
+# GCN-NEXT: V_MFMA
+name:            valu_write_agpr_sgemm_mfma_read
+body:             |
+  bb.0:
+    $vgpr0 = IMPLICIT_DEF
+    $agpr4 = V_ACCVGPR_WRITE_B32_e64 1, implicit $exec
+    $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 $agpr4, $vgpr0, $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec
+...
+# GCN-LABEL: name: valu_write_vgpr_dgemm_mfma_read
+# GCN:      V_MOV_B32
+# GCN:      V_MOV_B32
+# GCN-NEXT: S_NOP 1
+# GCN-NEXT: V_MFMA
+name:            valu_write_vgpr_dgemm_mfma_read
+body:             |
+  bb.0:
+    $vgpr0 = V_MOV_B32_e32 1, implicit $exec
+    $vgpr1 = V_MOV_B32_e32 1, implicit $exec
+    $vgpr2_vgpr3 = V_MFMA_F64_4X4X4F64_vgprcd_e64 $vgpr0_vgpr1, $vgpr0_vgpr1, $vgpr0_vgpr1, 0, 0, 0, implicit $mode, implicit $exec
+...
+# GCN-LABEL: name: valu_write_vgpr_smfmac_read
+# GCN:      V_MOV_B32
+# GCN-NEXT: S_NOP 1
+# GCN-NEXT: V_SMFMAC
+name:            valu_write_vgpr_smfmac_read
+body:             |
+  bb.0:
+    $vgpr32 = V_MOV_B32_e32 1, implicit $exec
+    $agpr0_agpr1_agpr2_agpr3 = V_SMFMAC_F32_16X16X32_F16_e64 $vgpr0_vgpr1, $vgpr2_vgpr3_vgpr4_vgpr5, $vgpr32, 0, 0, $agpr0_agpr1_agpr2_agpr3, implicit $mode, implicit $exec
+...
+# GCN-LABEL: name: accmov_write_agpr_sgemm_mfma_read
+# GCN:      V_ACCVGPR_MOV_B32
+# GCN-NEXT: S_NOP 1
+# GCN-NEXT: V_MFMA
+name:            accmov_write_agpr_sgemm_mfma_read
+body:             |
+  bb.0:
+    $vgpr0 = IMPLICIT_DEF
+    $agpr4 = V_ACCVGPR_MOV_B32 $agpr5, implicit $exec
+    $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 $agpr4, $vgpr0, $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec
+...
+# GCN-LABEL: name: nonxdl_sgemm4x4_mfma_write_agpr_mfma_read_same_agpr_as_srcc
+# GCN:      V_MFMA
+# GCN-NEXT: S_NOP 1
+# GCN-NEXT: V_MFMA
+name:            nonxdl_sgemm4x4_mfma_write_agpr_mfma_read_same_agpr_as_srcc
+body:             |
+  bb.0:
+    $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec
+    $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec
+...
+# GCN-LABEL: name: nonxdl_sgemm4x4_mfma_write_vgpr_mfma_read_same_vgpr_as_srcc
+# GCN:      V_MFMA
+# GCN-NEXT: S_NOP 1
+# GCN-NEXT: V_MFMA
+name:            nonxdl_sgemm4x4_mfma_write_vgpr_mfma_read_same_vgpr_as_srcc
+body:             |
+  bb.0:
+    $vgpr0_vgpr1_vgpr2_vgpr3 = V_MFMA_F32_4X4X1F32_vgprcd_e64 $vgpr4, $vgpr5, $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, 0, implicit $mode, implicit $exec
+    $vgpr0_vgpr1_vgpr2_vgpr3 = V_MFMA_F32_4X4X1F32_vgprcd_e64 $vgpr4, $vgpr5, $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, 0, implicit $mode, implicit $exec
+...
+# GCN-LABEL: name: nonxdl_sgemm16x16_mfma_write_vgpr_mfma_read_same_vgpr_as_srcc
+# GCN:      V_MFMA
+# GCN-NEXT: V_MFMA
+name:            nonxdl_sgemm16x16_mfma_write_vgpr_mfma_read_same_vgpr_as_srcc
+body:             |
+  bb.0:
+    $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = V_MFMA_F32_16X16X1F32_vgprcd_e64 $vgpr26, $vgpr27, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 0, 0, 0, implicit $mode, implicit $exec
+    $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = V_MFMA_F32_16X16X1F32_vgprcd_e64 $vgpr26, $vgpr27, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 0, 0, 0, implicit $mode, implicit $exec
+...
+# GCN-LABEL: name: nonxdl_sgemm4x4_mfma_write_agpr_smfmac_read_same_agpr_as_srcc
+# GCN:      V_MFMA
+# GCN-NEXT: S_NOP 1
+# GCN-NEXT: V_SMFMAC
+name:            nonxdl_sgemm4x4_mfma_write_agpr_smfmac_read_same_agpr_as_srcc
+body:             |
+  bb.0:
+    $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 $vgpr4, $vgpr5, $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec
+    $agpr0_agpr1_agpr2_agpr3 = V_SMFMAC_F32_16X16X32_F16_e64 $vgpr0_vgpr1, $vgpr2_vgpr3_vgpr4_vgpr5, $vgpr32, 0, 0, $agpr0_agpr1_agpr2_agpr3, implicit $mode, implicit $exec
+...
+# GCN-LABEL: name: dgemm16x16_mfma_write_vgpr_mfma_read_same_vgpr_as_srcc
+# GCN:      V_MFMA
+# GCN-NEXT: V_MFMA
+name:            dgemm16x16_mfma_write_vgpr_mfma_read_same_vgpr_as_srcc
+body:             |
+  bb.0:
+    $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_MFMA_F64_16X16X4F64_vgprcd_e64 $vgpr10_vgpr11, $vgpr10_vgpr11, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, 0, implicit $mode, implicit $exec
+    $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_MFMA_F64_16X16X4F64_vgprcd_e64 $vgpr10_vgpr11, $vgpr10_vgpr11, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, 0, implicit $mode, implicit $exec
+...
+# GCN-LABEL: name: dgemm4x4_mfma_write_vgpr_mfma_read_same_vgpr_as_srcc
+# GCN:      V_MFMA
+# GCN-NEXT: S_NOP 3
+# GCN-NEXT: V_MFMA
+name:            dgemm4x4_mfma_write_vgpr_mfma_read_same_vgpr_as_srcc
+body:             |
+  bb.0:
+    $vgpr2_vgpr3 = V_MFMA_F64_4X4X4F64_vgprcd_e64 $vgpr0_vgpr1, $vgpr0_vgpr1, $vgpr2_vgpr3, 0, 0, 0, implicit $mode, implicit $exec
+    $vgpr2_vgpr3 = V_MFMA_F64_4X4X4F64_vgprcd_e64 $vgpr0_vgpr1, $vgpr0_vgpr1, $vgpr2_vgpr3, 0, 0, 0, implicit $mode, implicit $exec
+...
+# GCN-LABEL: name: xdl_sgemm4x4_mfma_write_vgpr_mfma_read_same_vgpr_as_srcc
+# GCN:      V_MFMA
+# GCN-NEXT: S_NOP 1
+# GCN-NEXT: V_MFMA
+name:            xdl_sgemm4x4_mfma_write_vgpr_mfma_read_same_vgpr_as_srcc
+body:             |
+  bb.0:
+    $vgpr0_vgpr1_vgpr2_vgpr3 = V_MFMA_I32_4X4X4I8_vgprcd_e64 $vgpr4, $vgpr5, $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, 0, implicit $mode, implicit $exec
+    $vgpr0_vgpr1_vgpr2_vgpr3 = V_MFMA_I32_4X4X4I8_vgprcd_e64 $vgpr4, $vgpr5, $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, 0, implicit $mode, implicit $exec
+...
+# GCN-LABEL: name: xdl_sgemm16x16_mfma_write_vgpr_mfma_read_same_vgpr_as_srcc
+# GCN:      V_MFMA
+# GCN-NEXT: V_MFMA
+name:            xdl_sgemm16x16_mfma_write_vgpr_mfma_read_same_vgpr_as_srcc
+body:             |
+  bb.0:
+    $agpr0_agpr1_agpr2_agpr3 = V_MFMA_I32_16X16X32I8_e64 $vgpr0_vgpr1, $vgpr2_vgpr3, $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec
+    $agpr0_agpr1_agpr2_agpr3 = V_MFMA_I32_16X16X32I8_e64 $vgpr0_vgpr1, $vgpr2_vgpr3, $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec
+...
+# GCN-LABEL: name: smfmac16x16_mfma_write_vgpr_mfma_read_same_vgpr_as_srcc
+# GCN:      V_SMFMAC
+# GCN-NEXT: V_SMFMAC
+name:            smfmac16x16_mfma_write_vgpr_mfma_read_same_vgpr_as_srcc
+body:             |
+  bb.0:
+    $agpr0_agpr1_agpr2_agpr3 = V_SMFMAC_F32_16X16X32_F16_e64 $vgpr0_vgpr1, $vgpr2_vgpr3_vgpr4_vgpr5, $vgpr32, 0, 0, $agpr0_agpr1_agpr2_agpr3, implicit $mode, implicit $exec
+    $agpr0_agpr1_agpr2_agpr3 = V_SMFMAC_F32_16X16X32_F16_e64 $vgpr0_vgpr1, $vgpr2_vgpr3_vgpr4_vgpr5, $vgpr32, 0, 0, $agpr0_agpr1_agpr2_agpr3, implicit $mode, implicit $exec
+...
+# GCN-LABEL: name: sgemm4x4_mfma_write_agpr_mfma_read_overlap
+# GCN:      V_MFMA
+# GCN-NEXT: S_NOP 2
+# GCN-NEXT: V_MFMA
+name:            sgemm4x4_mfma_write_agpr_mfma_read_overlap
+body:             |
+  bb.0:
+    $agpr2_agpr3_agpr4_agpr5 = V_MFMA_I32_4X4X4I8_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec
+    $agpr0_agpr1_agpr2_agpr3 = V_MFMA_I32_4X4X4I8_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec
+...
+# GCN-LABEL: name: sgemm4x4_mfma_write_vgpr_mfma_read_overlap
+# GCN:      V_MFMA
+# GCN-NEXT: S_NOP 2
+# GCN-NEXT: V_MFMA
+name:            sgemm4x4_mfma_write_vgpr_mfma_read_overlap
+body:             |
+  bb.0:
+    $vgpr2_vgpr3_vgpr4_vgpr5 = V_MFMA_I32_4X4X4I8_vgprcd_e64 $vgpr6, $vgpr7, $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, 0, implicit $mode, implicit $exec
+    $vgpr0_vgpr1_vgpr2_vgpr3 = V_MFMA_F32_4X4X1F32_vgprcd_e64 $vgpr6, $vgpr7, $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, 0, implicit $mode, implicit $exec
+...
+# GCN-LABEL: name: sgemm4x4_mfma_write_agpr_smfmac_read_overlap
+# GCN:      V_MFMA
+# GCN-NEXT: S_NOP 2
+# GCN-NEXT: V_SMFMAC
+name:            sgemm4x4_mfma_write_agpr_smfmac_read_overlap
+body:             |
+  bb.0:
+    $agpr2_agpr3_agpr4_agpr5 = V_MFMA_I32_4X4X4I8_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec
+    $agpr0_agpr1_agpr2_agpr3 = V_SMFMAC_F32_16X16X32_F16_e64 $vgpr0_vgpr1, $vgpr2_vgpr3_vgpr4_vgpr5, $vgpr32, 0, 0, $agpr0_agpr1_agpr2_agpr3, implicit $mode, implicit $exec
+...
+# GCN-LABEL: name: xdl_sgemm16x16_mfma_write_agpr_mfma_read_overlap
+# GCN:      V_MFMA
+# GCN-NEXT: S_NOP 7
+# GCN-NEXT: S_NOP 0
+# GCN-NEXT: V_MFMA
+name:            xdl_sgemm16x16_mfma_write_agpr_mfma_read_overlap
+body:             |
+  bb.0:
+    $agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17 = V_MFMA_I32_16X16X4I8_e64 $vgpr26, $vgpr27, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec
+    $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_16X16X1F32_e64 $vgpr26, $vgpr27, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec
+...
+# GCN-LABEL: name: xdl_sgemm16x16_mfma_write_vgpr_mfma_read_overlap
+# GCN:      V_MFMA
+# GCN-NEXT: S_NOP 7
+# GCN-NEXT: S_NOP 0
+# GCN-NEXT: V_MFMA
+name:            xdl_sgemm16x16_mfma_write_vgpr_mfma_read_overlap
+body:             |
+  bb.0:
+    $vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17 = V_MFMA_I32_16X16X4I8_vgprcd_e64 $vgpr26, $vgpr27, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 0, 0, 0, implicit $mode, implicit $exec
+    $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = V_MFMA_I32_16X16X4I8_vgprcd_e64 $vgpr26, $vgpr27, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 0, 0, 0, implicit $mode, implicit $exec
+...
+# GCN-LABEL: name: nonxdl_sgemm16x16_mfma_write_agpr_xdl_mfma_read_overlap
+# GCN:      V_MFMA
+# GCN-NEXT: V_MFMA
+name:            nonxdl_sgemm16x16_mfma_write_agpr_xdl_mfma_read_overlap
+body:             |
+  bb.0:
+    $agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17 = V_MFMA_F32_16X16X1F32_e64 $vgpr26, $vgpr27, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec
+    $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_I32_16X16X4I8_e64 $vgpr26, $vgpr27, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec
+...
+# GCN-LABEL: name: nonxdl_sgemm16x16_mfma_write_agpr_nonxdl_mfma_read_overlap
+# GCN:      V_MFMA
+# GCN-NEXT: S_NOP 7
+# GCN-NEXT: V_MFMA
+name:            nonxdl_sgemm16x16_mfma_write_agpr_nonxdl_mfma_read_overlap
+body:             |
+  bb.0:
+    $agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17 = V_MFMA_F32_16X16X1F32_e64 $vgpr26, $vgpr27, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec
+    $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_16X16X1F32_e64 $vgpr26, $vgpr27, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec
+...
+# GCN-LABEL: name: xdl_sgemm16x16_mfma_write_agpr_smfmac_read_overlap
+# GCN:      V_MFMA
+# GCN-NEXT: S_NOP 7
+# GCN-NEXT: S_NOP 0
+# GCN-NEXT: V_SMFMAC
+name:            xdl_sgemm16x16_mfma_write_agpr_smfmac_read_overlap
+body:             |
+  bb.0:
+    $agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17 = V_MFMA_I32_16X16X4I8_e64 $vgpr26, $vgpr27, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec
+    $agpr0_agpr1_agpr2_agpr3 = V_SMFMAC_F32_16X16X32_F16_e64 $vgpr0_vgpr1, $vgpr2_vgpr3_vgpr4_vgpr5, $vgpr32, 0, 0, $agpr0_agpr1_agpr2_agpr3, implicit $mode, implicit $exec
+...
+# GCN-LABEL: name: xdl_sgemm32x32_mfma_write_agpr_mfma_read_overlap
+# GCN:      V_MFMA
+# GCN-NEXT: S_NOP 7
+# GCN-NEXT: S_NOP 7
+# GCN-NEXT: S_NOP 0
+# GCN-NEXT: V_MFMA
+name:            xdl_sgemm32x32_mfma_write_agpr_mfma_read_overlap
+body:             |
+  bb.0:
+    $agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31_agpr32_agpr33 = V_MFMA_F32_32X32X4F16_e64 $vgpr26_vgpr27, $vgpr28_vgpr29, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31, 0, 0, 0, implicit $mode, implicit $exec
+    $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X2F32_e64 $vgpr26, $vgpr27, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec
+...
+# GCN-LABEL: name: xdl_sgemm32x32_mfma_write_vgpr_mfma_read_overlap
+# GCN:      V_MFMA
+# GCN-NEXT: S_NOP 7
+# GCN-NEXT: S_NOP 7
+# GCN-NEXT: S_NOP 0
+# GCN-NEXT: V_MFMA
+name:            xdl_sgemm32x32_mfma_write_vgpr_mfma_read_overlap
+body:             |
+  bb.0:
+    $vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33 = V_MFMA_F32_32X32X4F16_vgprcd_e64 $vgpr126_vgpr127, $vgpr128_vgpr129, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, 0, 0, 0, implicit $mode, implicit $exec
+    $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = V_MFMA_F32_32X32X2F32_vgprcd_e64 $vgpr126, $vgpr127, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 0, 0, 0, implicit $mode, implicit $exec
+...
+# GCN-LABEL: name: nonxdl_sgemm32x32_mfma_write_agpr_xdl_mfma_read_overlap
+# GCN:      V_MFMA
+# GCN-NEXT: V_MFMA
+name:            nonxdl_sgemm32x32_mfma_write_agpr_xdl_mfma_read_overlap
+body:             |
+  bb.0:
+    $agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31_agpr32_agpr33 = V_MFMA_F32_32X32X1F32_e64 $vgpr26, $vgpr28, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31, 0, 0, 0, implicit $mode, implicit $exec
+    $agpr0_agpr1_agpr2_agpr3 = V_MFMA_I32_4X4X4I8_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec
+...
+# GCN-LABEL: name: nonxdl_sgemm32x32_mfma_write_agpr_nonxdl_mfma_read_overlap
+# GCN:      V_MFMA
+# GCN-NEXT: S_NOP 7
+# GCN-NEXT: S_NOP 7
+# GCN-NEXT: V_MFMA
+name:            nonxdl_sgemm32x32_mfma_write_agpr_nonxdl_mfma_read_overlap
+body:             |
+  bb.0:
+    $agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31_agpr32_agpr33 = V_MFMA_F32_32X32X1F32_e64 $vgpr26, $vgpr28, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31, 0, 0, 0, implicit $mode, implicit $exec
+    $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X2F32_e64 $vgpr26, $vgpr27, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec
+...
+# GCN-LABEL: name: xdl_sgemm32x32_mfma_write_agpr_smfmac_read_overlap
+# GCN:      V_MFMA
+# GCN-NEXT: S_NOP 7
+# GCN-NEXT: S_NOP 7
+# GCN-NEXT: S_NOP 0
+# GCN-NEXT: V_SMFMAC
+name:            xdl_sgemm32x32_mfma_write_agpr_smfmac_read_overlap
+body:             |
+  bb.0:
+    $agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31_agpr32_agpr33 = V_MFMA_F32_32X32X4F16_e64 $vgpr26_vgpr27, $vgpr28_vgpr29, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31, 0, 0, 0, implicit $mode, implicit $exec
+    $agpr0_agpr1_agpr2_agpr3 = V_SMFMAC_F32_16X16X32_F16_e64 $vgpr0_vgpr1, $vgpr2_vgpr3_vgpr4_vgpr5, $vgpr32, 0, 0, $agpr0_agpr1_agpr2_agpr3, implicit $mode, implicit $exec
+...
+# GCN-LABEL: name: dgemm16x16_mfma_write_vgpr_mfma_read_overlap
+# GCN:      V_MFMA
+# GCN-NEXT: S_NOP 7
+# GCN-NEXT: S_NOP 0
+# GCN-NEXT: V_MFMA
+name:            dgemm16x16_mfma_write_vgpr_mfma_read_overlap
+body:             |
+  bb.0:
+    $vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9 = V_MFMA_F64_16X16X4F64_vgprcd_e64 $vgpr10_vgpr11, $vgpr10_vgpr11, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, 0, implicit $mode, implicit $exec
+    $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_MFMA_F64_16X16X4F64_vgprcd_e64 $vgpr10_vgpr11, $vgpr10_vgpr11, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, 0, implicit $mode, implicit $exec
+...
+# GCN-LABEL: name: dgemm4x4_mfma_write_vgpr_mfma_read_overlap
+# GCN:      V_MFMA
+# GCN-NEXT: S_NOP 3
+# GCN-NEXT: V_MFMA
+name:            dgemm4x4_mfma_write_vgpr_mfma_read_overlap
+body:             |
+  bb.0:
+    $vgpr2_vgpr3 = V_MFMA_F64_4X4X4F64_vgprcd_e64 $vgpr0_vgpr1, $vgpr0_vgpr1, $vgpr0_vgpr1, 0, 0, 0, implicit $mode, implicit $exec
+    $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_MFMA_F64_16X16X4F64_vgprcd_e64 $vgpr10_vgpr11, $vgpr10_vgpr11, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, 0, implicit $mode, implicit $exec
+...
+# GCN-LABEL: name: dgemm16x16_mfma_write_vgpr_sgemm_mfma_read_overlap
+# GCN:      V_MFMA
+# GCN-NEXT: S_NOP 7
+# GCN-NEXT: S_NOP 0
+# GCN-NEXT: V_MFMA
+name:            dgemm16x16_mfma_write_vgpr_sgemm_mfma_read_overlap
+body:             |
+  bb.0:
+    $vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9 = V_MFMA_F64_16X16X4F64_vgprcd_e64 $vgpr10_vgpr11, $vgpr10_vgpr11, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, 0, implicit $mode, implicit $exec
+    $vgpr0_vgpr1_vgpr2_vgpr3 = V_MFMA_F32_4X4X1F32_vgprcd_e64 $vgpr10, $vgpr11, $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, 0, implicit $mode, implicit $exec
+...
+# GCN-LABEL: name: dgemm4x4_mfma_write_vgpr_sgemm_mfma_read_overlap
+# GCN:      V_MFMA
+# GCN-NEXT: S_NOP 3
+# GCN-NEXT: V_MFMA
+name:            dgemm4x4_mfma_write_vgpr_sgemm_mfma_read_overlap
+body:             |
+  bb.0:
+    $vgpr2_vgpr3 = V_MFMA_F64_4X4X4F64_vgprcd_e64 $vgpr0_vgpr1, $vgpr0_vgpr1, $vgpr0_vgpr1, 0, 0, 0, implicit $mode, implicit $exec
+    $vgpr0_vgpr1_vgpr2_vgpr3 = V_MFMA_F32_4X4X1F32_vgprcd_e64 $vgpr10, $vgpr11, $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, 0, implicit $mode, implicit $exec
+...
+# GCN-LABEL: name: sgemm4x4_mfma_write_vgpr_dgemm_mfma_read_overlap
+# GCN:      V_MFMA
+# GCN-NEXT: S_NOP 2
+# GCN-NEXT: V_MFMA
+name:            sgemm4x4_mfma_write_vgpr_dgemm_mfma_read_overlap
+body:             |
+  bb.0:
+    $vgpr0_vgpr1_vgpr2_vgpr3 = V_MFMA_I32_4X4X4I8_vgprcd_e64 $vgpr10, $vgpr11, $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, 0, implicit $mode, implicit $exec
+    $vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9 = V_MFMA_F64_16X16X4F64_vgprcd_e64 $vgpr10_vgpr11, $vgpr10_vgpr11, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, 0, implicit $mode, implicit $exec
+...
+# GCN-LABEL: name: xdl_sgemm16x16_mfma_write_vgpr_dgemm_mfma_read_overlap
+# GCN:      V_MFMA
+# GCN-NEXT: S_NOP 7
+# GCN-NEXT: S_NOP 0
+# GCN-NEXT: V_MFMA
+name:            xdl_sgemm16x16_mfma_write_vgpr_dgemm_mfma_read_overlap
+body:             |
+  bb.0:
+    $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = V_MFMA_I32_16X16X4I8_vgprcd_e64 $vgpr26, $vgpr27, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 0, 0, 0, implicit $mode, implicit $exec
+    $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_MFMA_F64_16X16X4F64_vgprcd_e64 $vgpr20_vgpr21, $vgpr20_vgpr21, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, 0, implicit $mode, implicit $exec
+...
+# GCN-LABEL: name: xdl_sgemm32x32_mfma_write_vgpr_dgemm_mfma_read_overlap
+# GCN:      V_MFMA
+# GCN-NEXT: S_NOP 7
+# GCN-NEXT: S_NOP 7
+# GCN-NEXT: S_NOP 0
+# GCN-NEXT: V_MFMA
+name:            xdl_sgemm32x32_mfma_write_vgpr_dgemm_mfma_read_overlap
+body:             |
+  bb.0:
+    $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = V_MFMA_F32_32X32X4F16_vgprcd_e64 $vgpr126_vgpr127, $vgpr128_vgpr129, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, 0, 0, 0, implicit $mode, implicit $exec
+    $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_MFMA_F64_16X16X4F64_vgprcd_e64 $vgpr120_vgpr121, $vgpr120_vgpr121, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, 0, implicit $mode, implicit $exec
+...
+# GCN-LABEL: name: xdl_sgemm16x16_mfma_write_agpr_mfma_read_partial
+# GCN:      V_MFMA
+# GCN-NEXT: S_NOP 7
+# GCN-NEXT: S_NOP 0
+# GCN-NEXT: V_MFMA
+name:            xdl_sgemm16x16_mfma_write_agpr_mfma_read_partial
+body:             |
+  bb.0:
+    $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_I32_16X16X4I8_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec
+    $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec
+...
+# GCN-LABEL: name: xdl_sgemm16x16_mfma_write_vgpr_mfma_read_partial
+# GCN:      V_MFMA
+# GCN-NEXT: S_NOP 7
+# GCN-NEXT: S_NOP 0
+# GCN-NEXT: V_MFMA
+name:            xdl_sgemm16x16_mfma_write_vgpr_mfma_read_partial
+body:             |
+  bb.0:
+    $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = V_MFMA_I32_16X16X4I8_vgprcd_e64 $vgpr16, $vgpr17, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 0, 0, 0, implicit $mode, implicit $exec
+    $vgpr0_vgpr1_vgpr2_vgpr3 = V_MFMA_F32_4X4X1F32_vgprcd_e64 $vgpr16, $vgpr17, $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, 0, implicit $mode, implicit $exec
+...
+# GCN-LABEL: name: xdl_sgemm4x4_mfma_write_agpr_mfma_srca_read_overlap
+# GCN:      V_MFMA
+# GCN-NEXT: S_NOP 4
+# GCN-NEXT: V_MFMA
+name:            xdl_sgemm4x4_mfma_write_agpr_mfma_srca_read_overlap
+body:             |
+  bb.0:
+    $agpr0_agpr1_agpr2_agpr3 = V_MFMA_I32_4X4X4I8_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec
+    $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 $agpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec
+...
+# GCN-LABEL: name: xdl_sgemm16x16_mfma_write_agpr_mfma_srca_read_overlap
+# GCN:      V_MFMA
+# GCN-NEXT: S_NOP 7
+# GCN-NEXT: S_NOP 2
+# GCN-NEXT: V_MFMA
+name:            xdl_sgemm16x16_mfma_write_agpr_mfma_srca_read_overlap
+body:             |
+  bb.0:
+    $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_I32_16X16X4I8_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec
+    $agpr20_agpr21_agpr22_agpr23 = V_MFMA_F32_4X4X1F32_e64 $agpr1, $vgpr0, $agpr20_agpr21_agpr22_agpr23, 0, 0, 0, implicit $mode, implicit $exec
+...
+# GCN-LABEL: name: nonxdl_sgemm16x16_mfma_write_agpr_mfma_srca_read_overlap
+# GCN:      V_MFMA
+# GCN-NEXT: S_NOP 7
+# GCN-NEXT: S_NOP 1
+# GCN-NEXT: V_MFMA
+name:            nonxdl_sgemm16x16_mfma_write_agpr_mfma_srca_read_overlap
+body:             |
+  bb.0:
+    $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_16X16X1F32_e64 $vgpr26, $vgpr27, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec
+    $agpr20_agpr21_agpr22_agpr23 = V_MFMA_F32_4X4X1F32_e64 $agpr1, $vgpr0, $agpr20_agpr21_agpr22_agpr23, 0, 0, 0, implicit $mode, implicit $exec
+...
+# GCN-LABEL: name: smfmac32x32_write_agpr_mfma_srca_read_overlap
+# GCN:      V_SMFMAC
+# GCN-NEXT: S_NOP 7
+# GCN-NEXT: S_NOP 2
+# GCN-NEXT: V_MFMA
+name:            smfmac32x32_write_agpr_mfma_srca_read_overlap
+body:             |
+  bb.0:
+    $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_SMFMAC_I32_32X32X32_I8_e64 $vgpr0_vgpr1, $vgpr2_vgpr3_vgpr4_vgpr5, $vgpr32, 0, 0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, implicit $mode, implicit $exec
+    $agpr20_agpr21_agpr22_agpr23 = V_MFMA_F32_4X4X1F32_e64 $agpr1, $vgpr0, $agpr20_agpr21_agpr22_agpr23, 0, 0, 0, implicit $mode, implicit $exec
+...
+# GCN-LABEL: name: smfmac32x32_write_agpr_smfmac_srcc_read_overlap
+# GCN:      V_SMFMAC
+# GCN-NEXT: S_NOP 7
+# GCN-NEXT: S_NOP 2
+# GCN-NEXT: V_SMFMAC
+name:            smfmac32x32_write_agpr_smfmac_srcc_read_overlap
+body:             |
+  bb.0:
+    $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = V_SMFMAC_I32_32X32X32_I8_e64 $agpr0_agpr1, $agpr2_agpr3_agpr4_agpr5, $vgpr2, 0, 0, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, implicit $mode, implicit $exec
+    $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = V_SMFMAC_I32_32X32X32_I8_e64 $agpr0_agpr1, $agpr2_agpr3_agpr4_agpr5, $vgpr2, 0, 0, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, implicit $mode, implicit $exec
+...
+# GCN-LABEL: name: xdl_sgemm32x32_mfma_write_agpr_mfma_srca_read_overlap
+# GCN:      V_MFMA
+# GCN-NEXT: S_NOP 7
+# GCN-NEXT: S_NOP 7
+# GCN-NEXT: S_NOP 2
+# GCN-NEXT: V_MFMA
+name:            xdl_sgemm32x32_mfma_write_agpr_mfma_srca_read_overlap
+body:             |
+  bb.0:
+    $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 = V_MFMA_F32_32X32X4F16_e64 $vgpr26_vgpr27, $vgpr28_vgpr29, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31, 0, 0, 0, implicit $mode, implicit $exec
+    $agpr20_agpr21_agpr22_agpr23 = V_MFMA_F32_4X4X1F32_e64 $agpr1, $vgpr0, $agpr20_agpr21_agpr22_agpr23, 0, 0, 0, implicit $mode, implicit $exec
+...
+# GCN-LABEL: name: nonxdl_sgemm32x32_mfma_write_agpr_mfma_srca_read_overlap
+# GCN:      V_MFMA
+# GCN-NEXT: S_NOP 7
+# GCN-NEXT: S_NOP 7
+# GCN-NEXT: S_NOP 1
+# GCN-NEXT: V_MFMA
+name:            nonxdl_sgemm32x32_mfma_write_agpr_mfma_srca_read_overlap
+body:             |
+  bb.0:
+    $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 = V_MFMA_F32_32X32X1F32_e64 $vgpr26, $vgpr28, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31, 0, 0, 0, implicit $mode, implicit $exec
+    $agpr120_agpr121_agpr122_agpr123 = V_MFMA_F32_4X4X1F32_e64 $agpr1, $vgpr0, $agpr120_agpr121_agpr122_agpr123, 0, 0, 0, implicit $mode, implicit $exec
+...
+# GCN-LABEL: name: xdl_sgemm4x4_mfma_write_vgpr_mfma_srca_read_overlap
+# GCN:      V_MFMA
+# GCN-NEXT: S_NOP 4
+# GCN-NEXT: V_MFMA
+name:            xdl_sgemm4x4_mfma_write_vgpr_mfma_srca_read_overlap
+body:             |
+  bb.0:
+    $vgpr0_vgpr1_vgpr2_vgpr3 = V_MFMA_I32_4X4X4I8_vgprcd_e64 $vgpr1, $vgpr0, $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, 0, implicit $mode, implicit $exec
+    $agpr0_agpr1_agpr2_agpr3 = V_MFMA_I32_4X4X4I8_e64 $vgpr0, $agpr0, $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec
+...
+# GCN-LABEL: name: xdl_sgemm4x4_mfma_write_vgpr_dmfma4x4_srca_read_overlap
+# GCN:      V_MFMA
+# GCN-NEXT: S_NOP 4
+# GCN-NEXT: V_MFMA
+name:            xdl_sgemm4x4_mfma_write_vgpr_dmfma4x4_srca_read_overlap
+body:             |
+  bb.0:
+    $vgpr0_vgpr1_vgpr2_vgpr3 = V_MFMA_I32_4X4X4I8_vgprcd_e64 $vgpr1, $vgpr0, $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, 0, implicit $mode, implicit $exec
+    $vgpr2_vgpr3 = V_MFMA_F64_4X4X4F64_vgprcd_e64 $vgpr0_vgpr1, $vgpr4_vgpr5, $vgpr4_vgpr5, 0, 0, 0, implicit $mode, implicit $exec
+...
+# GCN-LABEL: name: xdl_sgemm4x4_mfma_write_vgpr_dmfma16x16_srca_read_overlap
+# GCN:      V_MFMA
+# GCN-NEXT: S_NOP 4
+# GCN-NEXT: V_MFMA
+name:            xdl_sgemm4x4_mfma_write_vgpr_dmfma16x16_srca_read_overlap
+body:             |
+  bb.0:
+    $vgpr0_vgpr1_vgpr2_vgpr3 = V_MFMA_I32_4X4X4I8_vgprcd_e64 $vgpr1, $vgpr0, $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, 0, implicit $mode, implicit $exec
+    $vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17 = V_MFMA_F64_16X16X4F64_vgprcd_e64 $vgpr0_vgpr1, $vgpr10_vgpr11, $vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17, 0, 0, 0, implicit $mode, implicit $exec ; 16x16 DGEMM reads the XDL 4x4 result ($vgpr0_vgpr1) as srcA
+...
+# GCN-LABEL: name: xdl_sgemm4x4_mfma_write_vgpr_smfmac_srca_read_overlap
+# GCN:      V_MFMA
+# GCN-NEXT: S_NOP 4
+# GCN-NEXT: V_SMFMAC
+name:            xdl_sgemm4x4_mfma_write_vgpr_smfmac_srca_read_overlap
+body:             |
+  bb.0:
+    $vgpr0_vgpr1_vgpr2_vgpr3 = V_MFMA_I32_4X4X4I8_vgprcd_e64 $vgpr1, $vgpr0, $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, 0, implicit $mode, implicit $exec
+    $agpr0_agpr1_agpr2_agpr3 = V_SMFMAC_F32_16X16X32_F16_e64 $vgpr0_vgpr1, $vgpr12_vgpr13_vgpr14_vgpr15, $vgpr32, 0, 0, $agpr0_agpr1_agpr2_agpr3, implicit $mode, implicit $exec
+...
+# GCN-LABEL: name: dgemm4x4_mfma_write_vgpr_mfma_srca_read_overlap
+# GCN:      V_MFMA
+# GCN-NEXT: S_NOP 5
+# GCN-NEXT: V_MFMA
+name:            dgemm4x4_mfma_write_vgpr_mfma_srca_read_overlap
+body:             |
+  bb.0:
+    $vgpr2_vgpr3 = V_MFMA_F64_4X4X4F64_vgprcd_e64 $vgpr0_vgpr1, $vgpr0_vgpr1, $vgpr0_vgpr1, 0, 0, 0, implicit $mode, implicit $exec
+    $vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17 = V_MFMA_F64_16X16X4F64_vgprcd_e64 $vgpr2_vgpr3, $vgpr10_vgpr11, $vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17, 0, 0, 0, implicit $mode, implicit $exec
+...
+# GCN-LABEL: name: dgemm16x16_mfma_write_vgpr_mfma_srca_read_overlap
+# GCN:      V_MFMA
+# GCN-NEXT: S_NOP 7
+# GCN-NEXT: S_NOP 2
+# GCN-NEXT: V_MFMA
+name:            dgemm16x16_mfma_write_vgpr_mfma_srca_read_overlap
+body:             |
+  bb.0:
+    $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_MFMA_F64_16X16X4F64_vgprcd_e64 $vgpr10_vgpr11, $vgpr10_vgpr11, $vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17, 0, 0, 0, implicit $mode, implicit $exec
+    $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_MFMA_F64_16X16X4F64_vgprcd_e64 $vgpr0_vgpr1, $vgpr10_vgpr11, $vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17, 0, 0, 0, implicit $mode, implicit $exec
+...
+# GCN-LABEL: name: dgemm4x4_mfma_write_vgpr_sgemm_mfma_srca_read_overlap
+# GCN:      V_MFMA
+# GCN-NEXT: S_NOP 5
+# GCN-NEXT: V_MFMA
+name:            dgemm4x4_mfma_write_vgpr_sgemm_mfma_srca_read_overlap
+body:             |
+  bb.0:
+    $vgpr4_vgpr5 = V_MFMA_F64_4X4X4F64_vgprcd_e64 $vgpr0_vgpr1, $vgpr0_vgpr1, $vgpr0_vgpr1, 0, 0, 0, implicit $mode, implicit $exec
+    $vgpr0_vgpr1_vgpr2_vgpr3 = V_MFMA_F32_4X4X1F32_vgprcd_e64 $vgpr4, $vgpr0, $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, 0, implicit $mode, implicit $exec
+...
+# GCN-LABEL: name: dgemm16x16_mfma_write_vgpr_sgemm_mfma_srca_read_overlap
+# GCN:      V_MFMA
+# GCN-NEXT: S_NOP 7
+# GCN-NEXT: S_NOP 2
+# GCN-NEXT: V_MFMA
+name:            dgemm16x16_mfma_write_vgpr_sgemm_mfma_srca_read_overlap
+body:             |
+  bb.0:
+    $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_MFMA_F64_16X16X4F64_vgprcd_e64 $vgpr10_vgpr11, $vgpr10_vgpr11, $vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17, 0, 0, 0, implicit $mode, implicit $exec
+    $vgpr0_vgpr1_vgpr2_vgpr3 = V_MFMA_F32_4X4X1F32_vgprcd_e64 $vgpr4, $vgpr0, $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, 0, implicit $mode, implicit $exec
+...
+# GCN-LABEL: name: xdl_sgemm4x4_mfma_write_vgpr_dgemm_mfma_srca_read_overlap
+# GCN:      V_MFMA
+# GCN-NEXT: S_NOP 4
+# GCN-NEXT: V_MFMA
+name:            xdl_sgemm4x4_mfma_write_vgpr_dgemm_mfma_srca_read_overlap
+body:             |
+  bb.0:
+    $vgpr0_vgpr1_vgpr2_vgpr3 = V_MFMA_I32_4X4X4I8_vgprcd_e64 $vgpr4, $vgpr0, $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, 0, implicit $mode, implicit $exec
+    $vgpr4_vgpr5 = V_MFMA_F64_4X4X4F64_vgprcd_e64 $vgpr0_vgpr1, $vgpr4_vgpr5, $vgpr4_vgpr5, 0, 0, 0, implicit $mode, implicit $exec
+...
+# GCN-LABEL: name: xdl_sgemm4x4_mfma_write_agpr_mfma_srcb_read_overlap
+# GCN:      V_MFMA
+# GCN-NEXT: S_NOP 4
+# GCN-NEXT: V_MFMA
+name:            xdl_sgemm4x4_mfma_write_agpr_mfma_srcb_read_overlap
+body:             |
+  bb.0:
+    $agpr0_agpr1_agpr2_agpr3 = V_MFMA_I32_4X4X4I8_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec
+    $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $agpr0, $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec
+...
+# GCN-LABEL: name: xdl_sgemm4x4_mfma_write_vgpr_mfma_srcb_read_overlap
+# GCN:      V_MFMA
+# GCN-NEXT: S_NOP 4
+# GCN-NEXT: V_MFMA
+name:            xdl_sgemm4x4_mfma_write_vgpr_mfma_srcb_read_overlap
+body:             |
+  bb.0:
+    $vgpr0_vgpr1_vgpr2_vgpr3 = V_MFMA_I32_4X4X4I8_vgprcd_e64 $vgpr1, $vgpr0, $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, 0, implicit $mode, implicit $exec
+    $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 $agpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec
+...
+# GCN-LABEL: name: dgemm4x4_mfma_write_vgpr_mfma_srcb_read_overlap
+# GCN:      V_MFMA
+# GCN-NEXT: S_NOP 5
+# GCN-NEXT: V_MFMA
+name:            dgemm4x4_mfma_write_vgpr_mfma_srcb_read_overlap
+body:             |
+  bb.0:
+    $vgpr2_vgpr3 = V_MFMA_F64_4X4X4F64_vgprcd_e64 $vgpr0_vgpr1, $vgpr0_vgpr1, $vgpr0_vgpr1, 0, 0, 0, implicit $mode, implicit $exec
+    $vgpr2_vgpr3 = V_MFMA_F64_4X4X4F64_vgprcd_e64 $vgpr0_vgpr1, $vgpr2_vgpr3, $vgpr0_vgpr1, 0, 0, 0, implicit $mode, implicit $exec
+...
+# GCN-LABEL: name: dgemm4x4_mfma_write_vgpr_smfmac_srcb_read_overlap
+# GCN:      V_MFMA
+# GCN-NEXT: S_NOP 5
+# GCN-NEXT: V_SMFMAC
+name:            dgemm4x4_mfma_write_vgpr_smfmac_srcb_read_overlap
+body:             |
+  bb.0:
+    $vgpr2_vgpr3 = V_MFMA_F64_4X4X4F64_vgprcd_e64 $vgpr0_vgpr1, $vgpr0_vgpr1, $vgpr0_vgpr1, 0, 0, 0, implicit $mode, implicit $exec
+    $agpr0_agpr1_agpr2_agpr3 = V_SMFMAC_F32_16X16X32_F16_e64 $vgpr10_vgpr11, $vgpr2_vgpr3_vgpr4_vgpr5, $vgpr32, 0, 0, $agpr0_agpr1_agpr2_agpr3, implicit $mode, implicit $exec
+...
+# GCN-LABEL: name: dgemm4x4_mfma_write_vgpr_smfmac_srcc_read_overlap
+# GCN:      V_MFMA
+# GCN-NEXT: S_NOP 5
+# GCN-NEXT: V_SMFMAC
+name:            dgemm4x4_mfma_write_vgpr_smfmac_srcc_read_overlap
+body:             |
+  bb.0:
+    $vgpr2_vgpr3 = V_MFMA_F64_4X4X4F64_vgprcd_e64 $vgpr0_vgpr1, $vgpr0_vgpr1, $vgpr0_vgpr1, 0, 0, 0, implicit $mode, implicit $exec
+    $agpr0_agpr1_agpr2_agpr3 = V_SMFMAC_F32_16X16X32_F16_e64 $vgpr10_vgpr11, $vgpr12_vgpr13_vgpr14_vgpr15, $vgpr2, 0, 0, $agpr0_agpr1_agpr2_agpr3, implicit $mode, implicit $exec
+...
+# GCN-LABEL: name: dgemm16x16_mfma_write_vgpr_mfma_srcb_read_overlap
+# GCN:      V_MFMA
+# GCN-NEXT: S_NOP 7
+# GCN-NEXT: S_NOP 2
+# GCN-NEXT: V_MFMA
+name:            dgemm16x16_mfma_write_vgpr_mfma_srcb_read_overlap
+body:             |
+  bb.0:
+    $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_MFMA_F64_16X16X4F64_vgprcd_e64 $vgpr10_vgpr11, $vgpr10_vgpr11, $vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17, 0, 0, 0, implicit $mode, implicit $exec
+    $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_MFMA_F64_16X16X4F64_vgprcd_e64 $vgpr10_vgpr11, $vgpr0_vgpr1, $vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17, 0, 0, 0, implicit $mode, implicit $exec
+...
+# GCN-LABEL: name: dgemm16x16_mfma_write_vgpr_smfmac_srcb_read_overlap
+# GCN:      V_MFMA
+# GCN-NEXT: S_NOP 7
+# GCN-NEXT: S_NOP 2
+# GCN-NEXT: V_SMFMAC
+name:            dgemm16x16_mfma_write_vgpr_smfmac_srcb_read_overlap
+body:             |
+  bb.0:
+    $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_MFMA_F64_16X16X4F64_vgprcd_e64 $vgpr10_vgpr11, $vgpr10_vgpr11, $vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17, 0, 0, 0, implicit $mode, implicit $exec
+    $agpr0_agpr1_agpr2_agpr3 = V_SMFMAC_F32_16X16X32_F16_e64 $vgpr10_vgpr11, $vgpr2_vgpr3_vgpr4_vgpr5, $vgpr32, 0, 0, $agpr0_agpr1_agpr2_agpr3, implicit $mode, implicit $exec
+...
+# GCN-LABEL: name: dgemm16x16_mfma_write_vgpr_smfmac_srcc_read_overlap
+# GCN:      V_MFMA
+# GCN-NEXT: S_NOP 7
+# GCN-NEXT: S_NOP 2
+# GCN-NEXT: V_SMFMAC
+name:            dgemm16x16_mfma_write_vgpr_smfmac_srcc_read_overlap
+body:             |
+  bb.0:
+    $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_MFMA_F64_16X16X4F64_vgprcd_e64 $vgpr10_vgpr11, $vgpr10_vgpr11, $vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17, 0, 0, 0, implicit $mode, implicit $exec
+    $agpr0_agpr1_agpr2_agpr3 = V_SMFMAC_F32_16X16X32_F16_e64 $vgpr10_vgpr11, $vgpr12_vgpr13_vgpr14_vgpr15, $vgpr2, 0, 0, $agpr0_agpr1_agpr2_agpr3, implicit $mode, implicit $exec
+...
+# GCN-LABEL: name: xdl_sgemm4x4_mfma_write_vgpr_smfmac_srcc_read_overlap
+# GCN:      V_MFMA
+# GCN-NEXT: S_NOP 4
+# GCN-NEXT: V_SMFMAC
+name:            xdl_sgemm4x4_mfma_write_vgpr_smfmac_srcc_read_overlap
+body:             |
+  bb.0:
+    $vgpr0_vgpr1_vgpr2_vgpr3 = V_MFMA_I32_4X4X4I8_vgprcd_e64 $vgpr1, $vgpr0, $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, 0, implicit $mode, implicit $exec
+    $agpr0_agpr1_agpr2_agpr3 = V_SMFMAC_F32_16X16X32_F16_e64 $vgpr10_vgpr11, $vgpr12_vgpr13_vgpr14_vgpr15, $vgpr1, 0, 0, $agpr0_agpr1_agpr2_agpr3, implicit $mode, implicit $exec
+...
+# GCN-LABEL: name: xdl_smfma4x4_write_vgpr_vm_read
+# GCN:      V_MFMA
+# GCN-NEXT: S_NOP 4
+# GCN-NEXT: BUFFER_STORE_DWORD
+name:            xdl_smfma4x4_write_vgpr_vm_read
+body:             |
+  bb.0:
+    $vgpr0_vgpr1_vgpr2_vgpr3 = V_MFMA_I32_4X4X4I8_vgprcd_e64 $vgpr1, $vgpr0, $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, 0, implicit $mode, implicit $exec
+    BUFFER_STORE_DWORD_OFFSET $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+...
+# GCN-LABEL: name: xdl_smfma4x4_write_vgpr_flat_read
+# GCN:      V_MFMA
+# GCN-NEXT: S_NOP 4
+# GCN-NEXT: FLAT_STORE_DWORD
+name:            xdl_smfma4x4_write_vgpr_flat_read
+body:             |
+  bb.0:
+    $vgpr0_vgpr1_vgpr2_vgpr3 = V_MFMA_I32_4X4X4I8_vgprcd_e64 $vgpr1, $vgpr0, $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, 0, implicit $mode, implicit $exec
+    FLAT_STORE_DWORD $vgpr0_vgpr1, $vgpr4, 0, 0, implicit $mode, implicit $exec, implicit $flat_scr
+...
+# GCN-LABEL: name: xdl_smfma4x4_write_vgpr_lds_read
+# GCN:      V_MFMA
+# GCN-NEXT: S_NOP 4
+# GCN-NEXT: DS_WRITE_B32
+name:            xdl_smfma4x4_write_vgpr_lds_read
+body:             |
+  bb.0:
+    $vgpr0_vgpr1_vgpr2_vgpr3 = V_MFMA_I32_4X4X4I8_vgprcd_e64 $vgpr1, $vgpr0, $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, 0, implicit $mode, implicit $exec
+    DS_WRITE_B32 $vgpr0, $vgpr4, 0, 0, implicit $m0, implicit $mode, implicit $exec
+...
+# GCN-LABEL: name: xdl_smfma4x4_write_vgpr_exp_read
+# GCN:      V_MFMA
+# GCN-NEXT: S_NOP 4
+# GCN-NEXT: EXP_DONE
+name:            xdl_smfma4x4_write_vgpr_exp_read
+body:             |
+  bb.0:
+    $vgpr0_vgpr1_vgpr2_vgpr3 = V_MFMA_I32_4X4X4I8_vgprcd_e64 $vgpr1, $vgpr0, $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, 0, implicit $mode, implicit $exec
+    EXP_DONE 12, $vgpr4, $vgpr0, $vgpr0, $vgpr0, 0, 0, 15, implicit $exec
+...
+# GCN-LABEL: name: smfmac16x16_write_vgpr_flat_read
+# GCN:      V_SMFMAC
+# GCN-NEXT: S_NOP 6
+# GCN-NEXT: FLAT_STORE_DWORD
+name:            smfmac16x16_write_vgpr_flat_read
+body:             |
+  bb.0:
+    $vgpr0_vgpr1_vgpr2_vgpr3 = V_SMFMAC_F32_16X16X32_F16_e64 $vgpr0_vgpr1, $vgpr2_vgpr3_vgpr4_vgpr5, $vgpr32, 0, 0, $vgpr0_vgpr1_vgpr2_vgpr3, implicit $mode, implicit $exec
+    FLAT_STORE_DWORD $vgpr16_vgpr17, $vgpr1, 0, 0, implicit $mode, implicit $exec, implicit $flat_scr
+...
+# GCN-LABEL: name: xdl_smfma16x16_write_vgpr_flat_read
+# GCN:      V_MFMA
+# GCN-NEXT: S_NOP 7
+# GCN-NEXT: S_NOP 2
+# GCN-NEXT: FLAT_STORE_DWORD
+name:            xdl_smfma16x16_write_vgpr_flat_read
+body:             |
+  bb.0:
+    $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = V_MFMA_I32_16X16X4I8_vgprcd_e64 $vgpr16, $vgpr17, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 0, 0, 0, implicit $mode, implicit $exec
+    FLAT_STORE_DWORD $vgpr16_vgpr17, $vgpr1, 0, 0, implicit $mode, implicit $exec, implicit $flat_scr
+...
+# GCN-LABEL: name: smfmac32x32_write_vgpr_flat_read
+# GCN:      V_SMFMAC
+# GCN-NEXT: S_NOP 7
+# GCN-NEXT: S_NOP 2
+# GCN-NEXT: FLAT_STORE_DWORD
+name:            smfmac32x32_write_vgpr_flat_read
+body:             |
+  bb.0:
+    $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = V_SMFMAC_I32_32X32X32_I8_e64 $agpr0_agpr1, $agpr2_agpr3_agpr4_agpr5, $vgpr2, 0, 0, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, implicit $mode, implicit $exec
+    FLAT_STORE_DWORD $vgpr16_vgpr17, $vgpr1, 0, 0, implicit $mode, implicit $exec, implicit $flat_scr
+...
+# GCN-LABEL: name: xdl_smfma32x32_write_vgpr_flat_read
+# GCN:      V_MFMA
+# GCN-NEXT: S_NOP 7
+# GCN-NEXT: S_NOP 7
+# GCN-NEXT: S_NOP 2
+# GCN-NEXT: FLAT_STORE_DWORD
+name:            xdl_smfma32x32_write_vgpr_flat_read
+body:             |
+  bb.0:
+    $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 = V_MFMA_F32_32X32X4F16_e64 $vgpr26_vgpr27, $vgpr28_vgpr29, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31, 0, 0, 0, implicit $mode, implicit $exec
+    FLAT_STORE_DWORD $vgpr16_vgpr17, $agpr1, 0, 0, implicit $mode, implicit $exec, implicit $flat_scr
+...
+# GCN-LABEL: name: dmfma4x4_write_vgpr_flat_read_overlap
+# GCN:      V_MFMA
+# GCN-NEXT: S_NOP 7
+# GCN-NEXT: S_NOP 0
+# GCN-NEXT: FLAT_STORE_DWORD
+name:            dmfma4x4_write_vgpr_flat_read_overlap
+body:             |
+  bb.0:
+    $vgpr4_vgpr5 = V_MFMA_F64_4X4X4F64_vgprcd_e64 $vgpr0_vgpr1, $vgpr0_vgpr1, $vgpr0_vgpr1, 0, 0, 0, implicit $mode, implicit $exec
+    FLAT_STORE_DWORD $vgpr0_vgpr1, $vgpr5, 0, 0, implicit $mode, implicit $exec, implicit $flat_scr
+...
+# GCN-LABEL: name: dmfma4x4_write_vgpr_flat_read_full
+# GCN:      V_MFMA
+# GCN-NEXT: S_NOP 7
+# GCN-NEXT: S_NOP 0
+# GCN-NEXT: FLAT_STORE_DWORD
+name:            dmfma4x4_write_vgpr_flat_read_full
+body:             |
+  bb.0:
+    $vgpr4_vgpr5 = V_MFMA_F64_4X4X4F64_vgprcd_e64 $vgpr0_vgpr1, $vgpr0_vgpr1, $vgpr0_vgpr1, 0, 0, 0, implicit $mode, implicit $exec
+    FLAT_STORE_DWORDX2 $vgpr0_vgpr1, $vgpr4_vgpr5, 0, 0, implicit $mode, implicit $exec, implicit $flat_scr
+...
+# GCN-LABEL: name: dmfma16x16_write_vgpr_flat_read
+# GCN:      V_MFMA
+# GCN-NEXT: S_NOP 7
+# GCN-NEXT: S_NOP 7
+# GCN-NEXT: S_NOP 1
+# GCN-NEXT: FLAT_STORE_DWORD
+name:            dmfma16x16_write_vgpr_flat_read
+body:             |
+  bb.0:
+    $vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9 = V_MFMA_F64_16X16X4F64_vgprcd_e64 $vgpr10_vgpr11, $vgpr10_vgpr11, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, 0, implicit $mode, implicit $exec
+    FLAT_STORE_DWORD $vgpr0_vgpr1, $vgpr4, 0, 0, implicit $mode, implicit $exec, implicit $flat_scr
+...
+# GCN-LABEL: name: xdl_smfma4x4_write_vgpr_valu_read
+# GCN:      V_MFMA
+# GCN-NEXT: S_NOP 4
+# GCN-NEXT: V_MOV_B32
+name:            xdl_smfma4x4_write_vgpr_valu_read
+body:             |
+  bb.0:
+    $vgpr0_vgpr1_vgpr2_vgpr3 = V_MFMA_I32_4X4X4I8_vgprcd_e64 $vgpr1, $vgpr0, $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, 0, implicit $mode, implicit $exec
+    $vgpr4 = V_MOV_B32_e32 $vgpr0, implicit $mode, implicit $exec
+...
+# GCN-LABEL: name: xdl_smfma16x16_write_vgpr_valu_read
+# GCN:      V_MFMA
+# GCN-NEXT: S_NOP 7
+# GCN-NEXT: S_NOP 2
+# GCN-NEXT: V_MOV_B32
+name:            xdl_smfma16x16_write_vgpr_valu_read
+body:             |
+  bb.0:
+    $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = V_MFMA_I32_16X16X4I8_vgprcd_e64 $vgpr16, $vgpr17, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 0, 0, 0, implicit $mode, implicit $exec
+    $vgpr16 = V_MOV_B32_e32 $vgpr0, implicit $mode, implicit $exec
+...
+# GCN-LABEL: name: xdl_smfma32x32_write_vgpr_valu_read
+# GCN:      V_MFMA
+# GCN-NEXT: S_NOP 7
+# GCN-NEXT: S_NOP 7
+# GCN-NEXT: S_NOP 2
+# GCN-NEXT: V_MOV_B32
+name:            xdl_smfma32x32_write_vgpr_valu_read
+body:             |
+  bb.0:
+    $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = V_MFMA_F32_32X32X4F16_vgprcd_e64 $vgpr126_vgpr127, $vgpr128_vgpr129, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, 0, 0, 0, implicit $mode, implicit $exec
+    $vgpr16 = V_MOV_B32_e32 $vgpr0, implicit $mode, implicit $exec
+...
+# GCN-LABEL: name: dmfma4x4_write_vgpr_valu_read
+# GCN:      V_MFMA
+# GCN-NEXT: S_NOP 5
+# GCN-NEXT: V_MOV_B32
+name:            dmfma4x4_write_vgpr_valu_read
+body:             |
+  bb.0:
+    $vgpr4_vgpr5 = V_MFMA_F64_4X4X4F64_vgprcd_e64 $vgpr0_vgpr1, $vgpr0_vgpr1, $vgpr0_vgpr1, 0, 0, 0, implicit $mode, implicit $exec
+    $vgpr6 = V_MOV_B32_e32 $vgpr5, implicit $exec
+...
+# GCN-LABEL: name: dmfma16x16_write_vgpr_valu_read
+# GCN:      V_MFMA
+# GCN-NEXT: S_NOP 7
+# GCN-NEXT: S_NOP 2
+# GCN-NEXT: V_MOV_B32
+name:            dmfma16x16_write_vgpr_valu_read
+body:             |
+  bb.0:
+    $vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9 = V_MFMA_F64_16X16X4F64_vgprcd_e64 $vgpr10_vgpr11, $vgpr10_vgpr11, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, 0, implicit $mode, implicit $exec
+    $vgpr12 = V_MOV_B32_e32 $vgpr4, implicit $exec
+...
+# GCN-LABEL: name: xdl_smfma4x4_write_vgpr_accv_read
+# GCN:      V_MFMA
+# GCN-NEXT: S_NOP 4
+# GCN-NEXT: V_ACCVGPR_WRITE_B32_e64
+name:            xdl_smfma4x4_write_vgpr_accv_read
+body:             |
+  bb.0:
+    $vgpr0_vgpr1_vgpr2_vgpr3 = V_MFMA_I32_4X4X4I8_vgprcd_e64 $vgpr1, $vgpr0, $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, 0, implicit $mode, implicit $exec
+    $agpr0 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $mode, implicit $exec
+...
+# GCN-LABEL: name: xdl_smfma16x16_write_vgpr_accv_read
+# GCN:      V_MFMA
+# GCN-NEXT: S_NOP 7
+# GCN-NEXT: S_NOP 2
+# GCN-NEXT: V_ACCVGPR_WRITE_B32_e64
+name:            xdl_smfma16x16_write_vgpr_accv_read
+body:             |
+  bb.0:
+    $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = V_MFMA_I32_16X16X4I8_vgprcd_e64 $vgpr16, $vgpr17, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 0, 0, 0, implicit $mode, implicit $exec
+    $agpr0 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $mode, implicit $exec
+...
+# GCN-LABEL: name: xdl_smfma32x32_write_vgpr_accv_read
+# GCN:      V_MFMA
+# GCN-NEXT: S_NOP 7
+# GCN-NEXT: S_NOP 7
+# GCN-NEXT: S_NOP 2
+# GCN-NEXT: V_ACCVGPR_WRITE_B32_e64
+name:            xdl_smfma32x32_write_vgpr_accv_read
+body:             |
+  bb.0:
+    $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = V_MFMA_F32_32X32X4F16_vgprcd_e64 $vgpr126_vgpr127, $vgpr128_vgpr129, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, 0, 0, 0, implicit $mode, implicit $exec
+    $agpr0 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $mode, implicit $exec
+...
+# GCN-LABEL: name: xdl_smfma4x4_write_vgpr_dot_read
+# GCN:      V_MFMA
+# GCN-NEXT: S_NOP 4
+# GCN-NEXT: V_DOT
+name:            xdl_smfma4x4_write_vgpr_dot_read
+body:             |
+  bb.0:
+    $vgpr0_vgpr1_vgpr2_vgpr3 = V_MFMA_I32_4X4X4I8_vgprcd_e64 $vgpr1, $vgpr0, $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, 0, implicit $mode, implicit $exec
+    $vgpr4 = V_DOT4C_I32_I8_e32 $vgpr0, $vgpr1, $vgpr4, implicit $exec
+...
+# GCN-LABEL: name: dmfma4x4_write_vgpr_dot_read
+# GCN:      V_MFMA
+# GCN-NEXT: S_NOP 5
+# GCN-NEXT: V_DOT
+name:            dmfma4x4_write_vgpr_dot_read
+body:             |
+  bb.0:
+    $vgpr4_vgpr5 = V_MFMA_F64_4X4X4F64_vgprcd_e64 $vgpr0_vgpr1, $vgpr0_vgpr1, $vgpr0_vgpr1, 0, 0, 0, implicit $mode, implicit $exec
+    $vgpr1 = V_DOT4C_I32_I8_e32 $vgpr0, $vgpr5, $vgpr1, implicit $exec
+...
+# GCN-LABEL: name: dmfma16x16_write_vgpr_dot_read
+# GCN:      V_MFMA
+# GCN-NEXT: S_NOP 7
+# GCN-NEXT: S_NOP 2
+# GCN-NEXT: V_DOT
+name:            dmfma16x16_write_vgpr_dot_read
+body:             |
+  bb.0:
+    $vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9 = V_MFMA_F64_16X16X4F64_vgprcd_e64 $vgpr10_vgpr11, $vgpr10_vgpr11, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, 0, implicit $mode, implicit $exec
+    $vgpr1 = V_DOT4C_I32_I8_e32 $vgpr0, $vgpr4, $vgpr1, implicit $exec
+...
+# GCN-LABEL: name: xdl_smfma4x4_write_vgpr_valu_write
+# GCN:      V_MFMA
+# GCN-NEXT: S_NOP 4
+# GCN-NEXT: V_MOV_B32
+name:            xdl_smfma4x4_write_vgpr_valu_write
+body:             |
+  bb.0:
+    $vgpr0_vgpr1_vgpr2_vgpr3 = V_MFMA_I32_4X4X4I8_vgprcd_e64 $vgpr4, $vgpr0, $vgpr6_vgpr7_vgpr8_vgpr9, 0, 0, 0, implicit $mode, implicit $exec
+    $vgpr1 = V_MOV_B32_e32 0, implicit $mode, implicit $exec
+...
+# GCN-LABEL: name: xdl_smfma16x16_write_vgpr_valu_write
+# GCN:      V_MFMA
+# GCN-NEXT: S_NOP 7
+# GCN-NEXT: S_NOP 2
+# GCN-NEXT: V_MOV_B32
+name:            xdl_smfma16x16_write_vgpr_valu_write
+body:             |
+  bb.0:
+    $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = V_MFMA_I32_16X16X4I8_vgprcd_e64 $vgpr16, $vgpr17, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 0, 0, 0, implicit $mode, implicit $exec
+    $vgpr1 = V_MOV_B32_e32 0, implicit $mode, implicit $exec
+...
+# GCN-LABEL: name: xdl_smfma32x32_write_vgpr_valu_write
+# GCN:      V_MFMA
+# GCN-NEXT: S_NOP 7
+# GCN-NEXT: S_NOP 7
+# GCN-NEXT: S_NOP 2
+# GCN-NEXT: V_MOV_B32
+name:            xdl_smfma32x32_write_vgpr_valu_write
+body:             |
+  bb.0:
+    $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = V_MFMA_F32_32X32X4F16_vgprcd_e64 $vgpr126_vgpr127, $vgpr128_vgpr129, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, 0, 0, 0, implicit $mode, implicit $exec
+    $vgpr1 = V_MOV_B32_e32 0, implicit $mode, implicit $exec
+...
+# GCN-LABEL: name: xdl_smfma4x4_write_vgpr_valu_f16_write
+# GCN:      V_MFMA
+# GCN-NEXT: S_NOP 4
+# GCN-NEXT: V_FMA_F16_e64
+name:            xdl_smfma4x4_write_vgpr_valu_f16_write
+body:             |
+  bb.0:
+    $vgpr0_vgpr1_vgpr2_vgpr3 = V_MFMA_I32_4X4X4I8_vgprcd_e64 $vgpr1, $vgpr0, $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, 0, implicit $mode, implicit $exec
+    $vgpr1 = V_FMA_F16_e64 0, 0, 0, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+...
+# GCN-LABEL: name: xdl_smfma16x16_write_vgpr_valu_f16_write
+# GCN:      V_MFMA
+# GCN-NEXT: S_NOP 7
+# GCN-NEXT: S_NOP 2
+# GCN-NEXT: V_FMA_F16_e64
+name:            xdl_smfma16x16_write_vgpr_valu_f16_write
+body:             |
+  bb.0:
+    $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = V_MFMA_I32_16X16X4I8_vgprcd_e64 $vgpr16, $vgpr17, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 0, 0, 0, implicit $mode, implicit $exec
+    $vgpr1 = V_FMA_F16_e64 0, 0, 0, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+...
+# GCN-LABEL: name: xdl_smfma32x32_write_vgpr_valu_f16_write
+# GCN:      V_MFMA
+# GCN-NEXT: S_NOP 7
+# GCN-NEXT: S_NOP 7
+# GCN-NEXT: S_NOP 2
+# GCN-NEXT: V_FMA_F16_e64
+name:            xdl_smfma32x32_write_vgpr_valu_f16_write
+body:             |
+  bb.0:
+    $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = V_MFMA_F32_32X32X4F16_vgprcd_e64 $vgpr126_vgpr127, $vgpr128_vgpr129, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, 0, 0, 0, implicit $mode, implicit $exec
+    $vgpr1 = V_FMA_F16_e64 0, 0, 0, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+...
+# GCN-LABEL: name: xdl_smfma4x4_write_vgpr_valu_sdwa_write
+# GCN:      V_MFMA
+# GCN-NEXT: S_NOP 4
+# GCN-NEXT: V_MOV_B32_sdwa
+name:            xdl_smfma4x4_write_vgpr_valu_sdwa_write
+body:             |
+  bb.0:
+    $vgpr0_vgpr1_vgpr2_vgpr3 = V_MFMA_I32_4X4X4I8_vgprcd_e64 $vgpr1, $vgpr0, $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, 0, implicit $mode, implicit $exec
+    $vgpr1 = V_MOV_B32_sdwa 0, $vgpr16, 0, 5, 2, 4, implicit $exec, implicit $vgpr1(tied-def 0)
+...
+# GCN-LABEL: name: xdl_smfma16x16_write_vgpr_valu_sdwa_write
+# GCN:      V_MFMA
+# GCN-NEXT: S_NOP 7
+# GCN-NEXT: S_NOP 2
+# GCN-NEXT: V_MOV_B32_sdwa
+name:            xdl_smfma16x16_write_vgpr_valu_sdwa_write
+body:             |
+  bb.0:
+    $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = V_MFMA_I32_16X16X4I8_vgprcd_e64 $vgpr16, $vgpr17, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 0, 0, 0, implicit $mode, implicit $exec
+    $vgpr1 = V_MOV_B32_sdwa 0, $vgpr16, 0, 5, 2, 4, implicit $exec, implicit $vgpr1(tied-def 0)
+...
+# GCN-LABEL: name: xdl_smfma32x32_write_vgpr_valu_sdwa_write
+# GCN:      V_MFMA
+# GCN-NEXT: S_NOP 7
+# GCN-NEXT: S_NOP 7
+# GCN-NEXT: S_NOP 2
+# GCN-NEXT: V_MOV_B32_sdwa
+name:            xdl_smfma32x32_write_vgpr_valu_sdwa_write
+body:             |
+  bb.0:
+    $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = V_MFMA_F32_32X32X4F16_vgprcd_e64 $vgpr126_vgpr127, $vgpr128_vgpr129, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, 0, 0, 0, implicit $mode, implicit $exec
+    $vgpr1 = V_MOV_B32_sdwa 0, $vgpr16, 0, 5, 2, 4, implicit $exec, implicit $vgpr1(tied-def 0)
+...
+# GCN-LABEL: name: dmfma4x4_write_vgpr_valu_write
+# GCN:      V_MFMA
+# GCN-NEXT: S_NOP 5
+# GCN-NEXT: V_MOV_B32
+name:            dmfma4x4_write_vgpr_valu_write
+body:             |
+  bb.0:
+    $vgpr4_vgpr5 = V_MFMA_F64_4X4X4F64_vgprcd_e64 $vgpr0_vgpr1, $vgpr0_vgpr1, $vgpr0_vgpr1, 0, 0, 0, implicit $mode, implicit $exec
+    $vgpr4 = V_MOV_B32_e32 0, implicit $mode, implicit $exec
+...
+# GCN-LABEL: name: dmfma16x16_write_vgpr_valu_write
+# GCN:      V_MFMA
+# GCN-NEXT: S_NOP 7
+# GCN-NEXT: S_NOP 2
+# GCN-NEXT: V_MOV_B32
+name:            dmfma16x16_write_vgpr_valu_write
+body:             |
+  bb.0:
+    $vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9 = V_MFMA_F64_16X16X4F64_vgprcd_e64 $vgpr10_vgpr11, $vgpr10_vgpr11, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, 0, implicit $mode, implicit $exec
+    $vgpr3 = V_MOV_B32_e32 0, implicit $mode, implicit $exec
+...
+# GCN-LABEL: name: xdl_smfma4x4_write_vgpr_accv_write
+# GCN:      V_MFMA
+# GCN-NEXT: S_NOP 4
+# GCN-NEXT: V_ACCVGPR_READ_B32_e64
+name:            xdl_smfma4x4_write_vgpr_accv_write
+body:             |
+  bb.0:
+    $vgpr0_vgpr1_vgpr2_vgpr3 = V_MFMA_I32_4X4X4I8_vgprcd_e64 $vgpr1, $vgpr0, $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, 0, implicit $mode, implicit $exec
+    $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $mode, implicit $exec
+...
+# GCN-LABEL: name: xdl_smfma4x4_write_vgpr_dot_write
+# GCN:      V_MFMA
+# GCN-NEXT: S_NOP 4
+# GCN-NEXT: V_DOT
+name:            xdl_smfma4x4_write_vgpr_dot_write
+body:             |
+  bb.0:
+    $vgpr0_vgpr1_vgpr2_vgpr3 = V_MFMA_I32_4X4X4I8_vgprcd_e64 $vgpr1, $vgpr0, $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, 0, implicit $mode, implicit $exec
+    $vgpr1 = V_DOT8_I32_I4 0, $vgpr4, 0, $vgpr4, 0, $vgpr4, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+...
+# GCN-LABEL: name: nonxdl_smfma4x4_read_srcc_vgpr_valu_write
+# GCN:      V_MFMA
+# GCN-NEXT: V_MOV_B32
+name:            nonxdl_smfma4x4_read_srcc_vgpr_valu_write
+body:             |
+  bb.0:
+    $vgpr4_vgpr5_vgpr6_vgpr7 = V_MFMA_F32_4X4X1F32_vgprcd_e64 $vgpr8, $vgpr9, $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, 0, implicit $mode, implicit $exec
+    $vgpr1 = V_MOV_B32_e32 0, implicit $mode, implicit $exec
+...
+# GCN-LABEL: name: smfma16x16_read_srcc_vgpr_valu_write
+# GCN:      V_MFMA
+# GCN-NEXT: V_MOV_B32
+name:            smfma16x16_read_srcc_vgpr_valu_write
+body:             |
+  bb.0:
+    $vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17 = V_MFMA_F32_16X16X1F32_vgprcd_e64 $vgpr18, $vgpr19, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 0, 0, 0, implicit $mode, implicit $exec
+    $vgpr1 = V_MOV_B32_e32 0, implicit $mode, implicit $exec
+...
+# GCN-LABEL: name: smfma32x32_read_srcc_vgpr_valu_write
+# GCN:      V_MFMA
+# GCN-NEXT: V_MOV_B32
+name:            smfma32x32_read_srcc_vgpr_valu_write
+body:             |
+  bb.0:
+    $vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17 = V_MFMA_F32_32X32X2F32_vgprcd_e64 $vgpr0, $vgpr0, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 0, 0, 0, implicit $mode, implicit $exec
+    $vgpr1 = V_MOV_B32_e32 0, implicit $mode, implicit $exec
+...
+# GCN-LABEL: name: smfma4x4_read_srca_vgpr_valu_write
+# GCN:      V_MFMA
+# GCN-NEXT: V_MOV_B32
+name:            smfma4x4_read_srca_vgpr_valu_write
+body:             |
+  bb.0:
+    $vgpr4_vgpr5_vgpr6_vgpr7 = V_MFMA_F32_4X4X1F32_vgprcd_e64 $vgpr8, $vgpr9, $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, 0, implicit $mode, implicit $exec
+    $vgpr8 = V_MOV_B32_e32 0, implicit $mode, implicit $exec
+...
+# GCN-LABEL: name: smfma16x16_read_srca_vgpr_valu_write
+# GCN:      V_MFMA
+# GCN-NEXT: V_MOV_B32
+name:            smfma16x16_read_srca_vgpr_valu_write
+body:             |
+  bb.0:
+    $vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17 = V_MFMA_F32_16X16X1F32_vgprcd_e64 $vgpr18, $vgpr19, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 0, 0, 0, implicit $mode, implicit $exec
+    $vgpr18 = V_MOV_B32_e32 0, implicit $mode, implicit $exec
+...
+# GCN-LABEL: name: smfma32x32_read_srca_vgpr_valu_write
+# GCN:      V_MFMA
+# GCN-NEXT: V_MOV_B32
+name:            smfma32x32_read_srca_vgpr_valu_write
+body:             |
+  bb.0:
+    $vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17 = V_MFMA_F32_32X32X2F32_vgprcd_e64 $vgpr18, $vgpr19, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 0, 0, 0, implicit $mode, implicit $exec
+    $vgpr18 = V_MOV_B32_e32 0, implicit $mode, implicit $exec
+...
+# GCN-LABEL: name: smfma4x4_read_srcb_vgpr_valu_write
+# GCN:      V_MFMA
+# GCN-NEXT: V_MOV_B32
+name:            smfma4x4_read_srcb_vgpr_valu_write
+body:             |
+  bb.0:
+    $vgpr4_vgpr5_vgpr6_vgpr7 = V_MFMA_F32_4X4X1F32_vgprcd_e64 $vgpr8, $vgpr9, $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, 0, implicit $mode, implicit $exec
+    $vgpr9 = V_MOV_B32_e32 0, implicit $mode, implicit $exec
+...
+# GCN-LABEL: name: smfma16x16_read_srcb_vgpr_valu_write
+# GCN:      V_MFMA
+# GCN-NEXT: V_MOV_B32
+name:            smfma16x16_read_srcb_vgpr_valu_write
+body:             |
+  bb.0:
+    $vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17 = V_MFMA_F32_16X16X1F32_vgprcd_e64 $vgpr18, $vgpr19, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 0, 0, 0, implicit $mode, implicit $exec
+    $vgpr19 = V_MOV_B32_e32 0, implicit $mode, implicit $exec
+...
+# GCN-LABEL: name: smfma32x32_read_srcb_vgpr_valu_write
+# GCN:      V_MFMA
+# GCN-NEXT: V_MOV_B32
+name:            smfma32x32_read_srcb_vgpr_valu_write
+body:             |
+  bb.0:
+    $vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17 = V_MFMA_F32_32X32X2F32_vgprcd_e64 $vgpr18, $vgpr19, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 0, 0, 0, implicit $mode, implicit $exec
+    $vgpr19 = V_MOV_B32_e32 0, implicit $mode, implicit $exec
+...
+# GCN-LABEL: name: dmfma4x4_read_srcc_vgpr_valu_write
+# GCN:      V_MFMA
+# GCN-NEXT: V_MOV_B32
+name:            dmfma4x4_read_srcc_vgpr_valu_write
+body:             |
+  bb.0:
+    $vgpr4_vgpr5 = V_MFMA_F64_4X4X4F64_vgprcd_e64 $vgpr0_vgpr1, $vgpr0_vgpr1, $vgpr0_vgpr1, 0, 0, 0, implicit $mode, implicit $exec
+    $vgpr1 = V_MOV_B32_e32 0, implicit $mode, implicit $exec
+...
+# GCN-LABEL: name: dmfma16x16_read_srcc_vgpr_valu_write
+# GCN:      V_MFMA
+# GCN-NEXT: V_MOV_B32
+name:            dmfma16x16_read_srcc_vgpr_valu_write
+body:             |
+  bb.0:
+    $vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9 = V_MFMA_F64_16X16X4F64_vgprcd_e64 $vgpr10_vgpr11, $vgpr10_vgpr11, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, 0, implicit $mode, implicit $exec
+    $vgpr1 = V_MOV_B32_e32 0, implicit $mode, implicit $exec
+...
+# GCN-LABEL: name: smfma16x16_read_srcc_vgpr_accv_write
+# GCN:      V_MFMA
+# GCN-NEXT: V_ACCVGPR_WRITE_B32_e64
+name:            smfma16x16_read_srcc_vgpr_accv_write
+body:             |
+  bb.0:
+    $agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17 = V_MFMA_F32_16X16X1F32_e64 $agpr18, $agpr19, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec
+    $agpr1 = V_ACCVGPR_WRITE_B32_e64 0, implicit $mode, implicit $exec
+...
+# GCN-LABEL: name: sgemm_to_fma64
+# GCN:      V_MFMA
+# GCN-NEXT: V_FMA_F64_e64
+name:            sgemm_to_fma64
+body:             |
+  bb.0:
+    $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec
+    $vgpr4_vgpr5 = V_FMA_F64_e64 0, $vgpr4_vgpr5, 0, $vgpr4_vgpr5, 0, $vgpr4_vgpr5, 0, 0, implicit $mode, implicit $exec
+...
+# GCN-LABEL: name: dgemm_to_fma64
+# GCN:      V_MFMA
+# GCN-NEXT: S_NOP 1
+# GCN-NEXT: V_FMA_F64_e64
+name:            dgemm_to_fma64
+body:             |
+  bb.0:
+    $vgpr0_vgpr1 = V_MFMA_F64_4X4X4F64_vgprcd_e64 $vgpr0_vgpr1, $vgpr0_vgpr1, $vgpr0_vgpr1, 0, 0, 0, implicit $mode, implicit $exec
+    $vgpr4_vgpr5 = V_FMA_F64_e64 0, $vgpr4_vgpr5, 0, $vgpr4_vgpr5, 0, $vgpr4_vgpr5, 0, 0, implicit $mode, implicit $exec
+...
+# GCN-LABEL: name: dgemm_to_fmac64
+# GCN:      V_MFMA
+# GCN-NEXT: S_NOP 1
+# GCN-NEXT: V_FMAC_F64
+name:            dgemm_to_fmac64
+body:             |
+  bb.0:
+    $vgpr0_vgpr1 = V_MFMA_F64_4X4X4F64_vgprcd_e64 $vgpr0_vgpr1, $vgpr0_vgpr1, $vgpr0_vgpr1, 0, 0, 0, implicit $mode, implicit $exec
+    $vgpr4_vgpr5 = V_FMAC_F64_e32 $vgpr4_vgpr5, $vgpr4_vgpr5, $vgpr4_vgpr5, implicit $mode, implicit $exec
+...
+# GCN-LABEL: name: flat_store_data_agpr_overwritten
+# GCN:      FLAT_STORE_DWORDX4
+# GCN-NEXT: S_NOP 0
+# GCN-NEXT: V_ACCVGPR_WRITE_B32_e64
+name: flat_store_data_agpr_overwritten
+body: |
+  bb.0:
+    FLAT_STORE_DWORDX4 $vgpr4_vgpr5, $agpr0_agpr1_agpr2_agpr3, 0, 0, implicit $mode, implicit $exec, implicit $flat_scr
+    $agpr0 = V_ACCVGPR_WRITE_B32_e64 0, implicit $mode, implicit $exec
+...
+# GCN-LABEL: name: dot_write_vgpr_accv_read
+# GCN:      V_DOT
+# GCN-NEXT: S_NOP 2
+# GCN-NEXT: V_ACCVGPR_WRITE_B32_e64
+name:            dot_write_vgpr_accv_read
+body:             |
+  bb.0:
+    $vgpr4 = V_DOT4C_I32_I8_e32 $vgpr0, $vgpr1, $vgpr4, implicit $exec
+    $agpr5 = V_ACCVGPR_WRITE_B32_e64 $vgpr4, implicit $exec
+...
+# GCN-LABEL: name: valu_write_vgpr_dot_read
+# GCN:      V_MOV_B32
+# GCN-NEXT: V_DOT
+name:            valu_write_vgpr_dot_read
+body:             |
+  bb.0:
+    $vgpr0 = V_MOV_B32_e32 $vgpr1, implicit $exec
+    $vgpr4 = V_DOT4C_I32_I8_e32 $vgpr0, $vgpr1, $vgpr4, implicit $exec
+...
+# GCN-LABEL: name: accv_write_vgpr_dot_read
+# GCN:      V_ACCVGPR_READ
+# GCN-NEXT: V_DOT
+name:            accv_write_vgpr_dot_read
+body:             |
+  bb.0:
+    $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec
+    $vgpr4 = V_DOT4C_I32_I8_e32 $vgpr0, $vgpr1, $vgpr4, implicit $exec
+...
+# GCN-LABEL: name: dot_write_vgpr_same_dot_read_srcc
+# GCN:      V_DOT
+# GCN-NEXT: V_DOT
+name:            dot_write_vgpr_same_dot_read_srcc
+body:             |
+  bb.0:
+    $vgpr4 = V_DOT4C_I32_I8_e32 $vgpr0, $vgpr1, $vgpr4, implicit $exec
+    $vgpr4 = V_DOT4C_I32_I8_e32 $vgpr0, $vgpr1, $vgpr4, implicit $exec
+...
+# GCN-LABEL: name: dot_write_vgpr_different_dot_read_srcc
+# GCN:      V_DOT
+# GCN-NEXT: S_NOP 2
+# GCN-NEXT: V_DOT
+name:            dot_write_vgpr_different_dot_read_srcc
+body:             |
+  bb.0:
+    $vgpr4 = V_DOT4C_I32_I8_e32 $vgpr0, $vgpr1, $vgpr4, implicit $exec
+    $vgpr1 = V_DOT8_I32_I4 0, $vgpr0, 0, $vgpr0, 0, $vgpr4, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+...
+# GCN-LABEL: name: dot_write_vgpr_different_dot_write
+# GCN:      V_DOT
+# GCN-NEXT: S_NOP 2
+# GCN-NEXT: V_DOT
+name:            dot_write_vgpr_different_dot_write
+body:             |
+  bb.0:
+    $vgpr4 = V_DOT4C_I32_I8_e32 $vgpr0, $vgpr1, $vgpr4, implicit $exec
+    $vgpr4 = V_DOT8_I32_I4 0, $vgpr0, 0, $vgpr0, 0, $vgpr0, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+...
+# GCN-LABEL: name: dot_write_vgpr_different_valu_read
+# GCN:      V_DOT
+# GCN-NEXT: S_NOP 2
+# GCN-NEXT: V_MOV_B32_e32
+name:            dot_write_vgpr_different_valu_read
+body:             |
+  bb.0:
+    $vgpr4 = V_DOT4C_I32_I8_e32 $vgpr0, $vgpr1, $vgpr4, implicit $exec
+    $vgpr0 = V_MOV_B32_e32 $vgpr4, implicit $exec
+...
+# GCN-LABEL: name: dot_write_vgpr_different_valu_write
+# GCN:      V_DOT
+# GCN-NEXT: S_NOP 2
+# GCN-NEXT: V_MOV_B32_e32
+name:            dot_write_vgpr_different_valu_write
+body:             |
+  bb.0:
+    $vgpr4 = V_DOT4C_I32_I8_e32 $vgpr0, $vgpr1, $vgpr4, implicit $exec
+    $vgpr4 = V_MOV_B32_e32 1, implicit $exec
+...
+# GCN-LABEL: name: dot_write_vgpr_same_dot_read_srca
+# GCN:      V_DOT
+# GCN-NEXT: S_NOP 2
+# GCN-NEXT: V_DOT
+name:            dot_write_vgpr_same_dot_read_srca
+body:             |
+  bb.0:
+    $vgpr4 = V_DOT4C_I32_I8_e32 $vgpr0, $vgpr1, $vgpr4, implicit $exec
+    $vgpr0 = V_DOT4C_I32_I8_e32 $vgpr4, $vgpr1, $vgpr0, implicit $mode, implicit $exec
+...
+# GCN-LABEL: name: dot_write_vgpr_same_dot_read_srcb
+# GCN:      V_DOT
+# GCN-NEXT: S_NOP 2
+# GCN-NEXT: V_DOT
+name:            dot_write_vgpr_same_dot_read_srcb
+body:             |
+  bb.0:
+    $vgpr4 = V_DOT4C_I32_I8_e32 $vgpr0, $vgpr1, $vgpr4, implicit $exec
+    $vgpr0 = V_DOT4C_I32_I8_e32 $vgpr0, $vgpr4, $vgpr0, implicit $mode, implicit $exec
+...
+# GCN-LABEL: name: vcmpx_write_exec_mfma
+# GCN:      V_CMPX_EQ_I32_e32
+# GCN-NEXT: S_NOP 3
+# GCN-NEXT: V_MFMA
+name:            vcmpx_write_exec_mfma
+body:             |
+  bb.0:
+    implicit $exec, implicit $vcc = V_CMPX_EQ_I32_e32 $vgpr0, $vgpr1, implicit $exec
+    $agpr4_agpr5_agpr6_agpr7 = V_MFMA_F32_4X4X1F32_e64 killed $agpr8, killed $vgpr1, killed $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec
+...
+# GCN-LABEL: name: valu_write_agpr_dgemm_mfma_read
+# GCN:      V_ACCVGPR_WRITE_B32_e64
+# GCN:      V_ACCVGPR_WRITE_B32_e64
+# GCN-NEXT: S_NOP 1
+# GCN-NEXT: V_MFMA
+name:            valu_write_agpr_dgemm_mfma_read
+body:             |
+  bb.0:
+    $agpr0 = V_ACCVGPR_WRITE_B32_e64 1, implicit $exec
+    $agpr1 = V_ACCVGPR_WRITE_B32_e64 1, implicit $exec
+    $agpr2_agpr3 = V_MFMA_F64_4X4X4F64_e64 $agpr0_agpr1, $agpr0_agpr1, $agpr0_agpr1, 0, 0, 0, implicit $mode, implicit $exec
+...
+# GCN-LABEL: name: dgemm16x16_mfma_write_agpr_mfma_read_same_agpr_as_srcc
+# GCN:      V_MFMA
+# GCN-NEXT: V_MFMA
+name:            dgemm16x16_mfma_write_agpr_mfma_read_same_agpr_as_srcc
+body:             |
+  bb.0:
+    $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 = V_MFMA_F64_16X16X4F64_e64 $agpr10_agpr11, $agpr10_agpr11, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7, 0, 0, 0, implicit $mode, implicit $exec
+    $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 = V_MFMA_F64_16X16X4F64_e64 $agpr10_agpr11, $agpr10_agpr11, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7, 0, 0, 0, implicit $mode, implicit $exec
+...
+# GCN-LABEL: name: dgemm4x4_mfma_write_agpr_mfma_read_same_agpr_as_srcc
+# GCN:      V_MFMA
+# GCN-NEXT: S_NOP 3
+# GCN-NEXT: V_MFMA
+name:            dgemm4x4_mfma_write_agpr_mfma_read_same_agpr_as_srcc
+body:             |
+  bb.0:
+    $agpr2_agpr3 = V_MFMA_F64_4X4X4F64_e64 $agpr0_agpr1, $vgpr0_vgpr1, $agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec
+    $agpr2_agpr3 = V_MFMA_F64_4X4X4F64_e64 $agpr0_agpr1, $vgpr0_vgpr1, $agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec
+...
+# GCN-LABEL: name: dgemm16x16_mfma_write_agpr_mfma_read_overlap
+# GCN:      V_MFMA
+# GCN-NEXT: S_NOP 7
+# GCN-NEXT: S_NOP 0
+# GCN-NEXT: V_MFMA
+name:            dgemm16x16_mfma_write_agpr_mfma_read_overlap
+body:             |
+  bb.0:
+    $agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9 = V_MFMA_F64_16X16X4F64_e64 $vgpr10_vgpr11, $vgpr10_vgpr11, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7, 0, 0, 0, implicit $mode, implicit $exec
+    $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 = V_MFMA_F64_16X16X4F64_e64 $vgpr10_vgpr11, $vgpr10_vgpr11, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7, 0, 0, 0, implicit $mode, implicit $exec
+...
+# GCN-LABEL: name: dgemm4x4_mfma_write_agpr_mfma_read_overlap
+# GCN:      V_MFMA
+# GCN-NEXT: S_NOP 3
+# GCN-NEXT: V_MFMA
+name:            dgemm4x4_mfma_write_agpr_mfma_read_overlap
+body:             |
+  bb.0:
+    $agpr2_agpr3 = V_MFMA_F64_4X4X4F64_e64 $vgpr0_vgpr1, $vgpr0_vgpr1, $agpr0_agpr1, 0, 0, 0, implicit $mode, implicit $exec
+    $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 = V_MFMA_F64_16X16X4F64_e64 $vgpr10_vgpr11, $vgpr10_vgpr11, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7, 0, 0, 0, implicit $mode, implicit $exec
+...
+# GCN-LABEL: name: dgemm16x16_mfma_write_agpr_sgemm_mfma_read_overlap
+# GCN:      V_MFMA
+# GCN-NEXT: S_NOP 7
+# GCN-NEXT: S_NOP 0
+# GCN-NEXT: V_MFMA
+name:            dgemm16x16_mfma_write_agpr_sgemm_mfma_read_overlap
+body:             |
+  bb.0:
+    $agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9 = V_MFMA_F64_16X16X4F64_e64 $vgpr10_vgpr11, $vgpr10_vgpr11, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7, 0, 0, 0, implicit $mode, implicit $exec
+    $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 $vgpr10, $vgpr11, $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec
+...
+# GCN-LABEL: name: sgemm4x4_mfma_write_agpr_dgemm_mfma_read_overlap
+# GCN:      V_MFMA
+# GCN-NEXT: S_NOP 2
+# GCN-NEXT: V_MFMA
+name:            sgemm4x4_mfma_write_agpr_dgemm_mfma_read_overlap
+body:             |
+  bb.0:
+    $agpr0_agpr1_agpr2_agpr3 = V_MFMA_I32_4X4X4I8_e64 $vgpr10, $vgpr11, $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec
+    $agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9 = V_MFMA_F64_16X16X4F64_e64 $vgpr10_vgpr11, $vgpr10_vgpr11, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7, 0, 0, 0, implicit $mode, implicit $exec
+...
+# GCN-LABEL: name: xdl_sgemm16x16_mfma_write_sgpr_dgemm_mfma_read_overlap
+# GCN:      V_MFMA
+# GCN-NEXT: S_NOP 7
+# GCN-NEXT: S_NOP 0
+# GCN-NEXT: V_MFMA
+name:            xdl_sgemm16x16_mfma_write_sgpr_dgemm_mfma_read_overlap
+body:             |
+  bb.0:
+    $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_I32_16X16X4I8_e64 $vgpr26, $vgpr27, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec
+    $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 = V_MFMA_F64_16X16X4F64_e64 $vgpr20_vgpr21, $vgpr20_vgpr21, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7, 0, 0, 0, implicit $mode, implicit $exec
+...
+# GCN-LABEL: name: xdl_sgemm32x32_mfma_write_agpr_dgemm_mfma_read_overlap
+# GCN:      V_MFMA
+# GCN-NEXT: S_NOP 7
+# GCN-NEXT: S_NOP 7
+# GCN-NEXT: S_NOP 0
+# GCN-NEXT: V_MFMA
+name:            xdl_sgemm32x32_mfma_write_agpr_dgemm_mfma_read_overlap
+body:             |
+  bb.0:
+    $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 = V_MFMA_F32_32X32X4F16_e64 $vgpr26_vgpr27, $vgpr28_vgpr29, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31, 0, 0, 0, implicit $mode, implicit $exec
+    $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 = V_MFMA_F64_16X16X4F64_e64 $vgpr120_vgpr121, $vgpr120_vgpr121, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7, 0, 0, 0, implicit $mode, implicit $exec
+...
+# GCN-LABEL: name: xdl_sgemm4x4_mfma_write_agpr_dmfma4x4_srca_read_overlap
+# GCN:      V_MFMA
+# GCN-NEXT: S_NOP 4
+# GCN-NEXT: V_MFMA
+name:            xdl_sgemm4x4_mfma_write_agpr_dmfma4x4_srca_read_overlap
+body:             |
+  bb.0:
+    $agpr0_agpr1_agpr2_agpr3 = V_MFMA_I32_4X4X4I8_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec
+    $vgpr2_vgpr3 = V_MFMA_F64_4X4X4F64_vgprcd_e64 $agpr0_agpr1, $vgpr4_vgpr5, $vgpr4_vgpr5, 0, 0, 0, implicit $mode, implicit $exec
+...
+# GCN-LABEL: name: xdl_sgemm4x4_mfma_write_agpr_dmfma16x16_srca_read_overlap
+# GCN:      V_MFMA
+# GCN-NEXT: S_NOP 4
+# GCN-NEXT: V_MFMA
+name:            xdl_sgemm4x4_mfma_write_agpr_dmfma16x16_srca_read_overlap
+body:             |
+  bb.0:
+    $agpr0_agpr1_agpr2_agpr3 = V_MFMA_I32_4X4X4I8_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec
+    $vgpr2_vgpr3 = V_MFMA_F64_4X4X4F64_vgprcd_e64 $agpr0_agpr1, $vgpr4_vgpr5, $vgpr4_vgpr5, 0, 0, 0, implicit $mode, implicit $exec
+...
+# GCN-LABEL: name: dgemm4x4_mfma_write_agpr_mfma_srca_read_overlap
+# GCN:      V_MFMA
+# GCN-NEXT: S_NOP 5
+# GCN-NEXT: V_MFMA
+name:            dgemm4x4_mfma_write_agpr_mfma_srca_read_overlap
+body:             |
+  bb.0:
+    $agpr2_agpr3 = V_MFMA_F64_4X4X4F64_e64 $vgpr0_vgpr1, $vgpr0_vgpr1, $agpr0_agpr1, 0, 0, 0, implicit $mode, implicit $exec
+    $vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17 = V_MFMA_F64_16X16X4F64_vgprcd_e64 $agpr2_agpr3, $vgpr10_vgpr11, $vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17, 0, 0, 0, implicit $mode, implicit $exec
+...
+# GCN-LABEL: name: dgemm16x16_mfma_write_agpr_mfma_srca_read_overlap
+# GCN:      V_MFMA
+# GCN-NEXT: S_NOP 7
+# GCN-NEXT: S_NOP 2
+# GCN-NEXT: V_MFMA
+name:            dgemm16x16_mfma_write_agpr_mfma_srca_read_overlap
+body:             |
+  bb.0:
+    $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 = V_MFMA_F64_16X16X4F64_e64 $vgpr10_vgpr11, $vgpr10_vgpr11, $agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17, 0, 0, 0, implicit $mode, implicit $exec
+    $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_MFMA_F64_16X16X4F64_vgprcd_e64 $agpr0_agpr1, $vgpr10_vgpr11, $vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17, 0, 0, 0, implicit $mode, implicit $exec
+...
+# GCN-LABEL: name: dgemm4x4_mfma_write_agpr_sgemm_mfma_srca_read_overlap
+# GCN:      V_MFMA
+# GCN-NEXT: S_NOP 5
+# GCN-NEXT: V_MFMA
+name:            dgemm4x4_mfma_write_agpr_sgemm_mfma_srca_read_overlap
+body:             |
+  bb.0:
+    $agpr4_agpr5 = V_MFMA_F64_4X4X4F64_e64 $vgpr0_vgpr1, $vgpr0_vgpr1, $agpr0_agpr1, 0, 0, 0, implicit $mode, implicit $exec
+    $vgpr0_vgpr1_vgpr2_vgpr3 = V_MFMA_F32_4X4X1F32_vgprcd_e64 $agpr4, $vgpr0, $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, 0, implicit $mode, implicit $exec
+...
+# GCN-LABEL: name: dgemm16x16_mfma_write_agpr_sgemm_mfma_srca_read_overlap
+# GCN:      V_MFMA
+# GCN-NEXT: S_NOP 7
+# GCN-NEXT: S_NOP 2
+# GCN-NEXT: V_MFMA
+name:            dgemm16x16_mfma_write_agpr_sgemm_mfma_srca_read_overlap
+body:             |
+  bb.0:
+    $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 = V_MFMA_F64_16X16X4F64_e64 $vgpr10_vgpr11, $vgpr10_vgpr11, $agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17, 0, 0, 0, implicit $mode, implicit $exec
+    $vgpr0_vgpr1_vgpr2_vgpr3 = V_MFMA_F32_4X4X1F32_vgprcd_e64 $agpr4, $vgpr0, $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, 0, implicit $mode, implicit $exec
+...
+# GCN-LABEL: name: xdl_sgemm4x4_mfma_write_agpr_dgemm_mfma_srca_read_overlap
+# GCN:      V_MFMA
+# GCN-NEXT: S_NOP 4
+# GCN-NEXT: V_MFMA
+name:            xdl_sgemm4x4_mfma_write_agpr_dgemm_mfma_srca_read_overlap
+body:             |
+  bb.0:
+    $agpr0_agpr1_agpr2_agpr3 = V_MFMA_I32_4X4X4I8_e64 $vgpr4, $vgpr0, $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec
+    $vgpr4_vgpr5 = V_MFMA_F64_4X4X4F64_vgprcd_e64 $agpr0_agpr1, $vgpr4_vgpr5, $vgpr4_vgpr5, 0, 0, 0, implicit $mode, implicit $exec
+...
+# GCN-LABEL: name: dgemm4x4_mfma_write_agpr_mfma_srcb_read_overlap
+# GCN:      V_MFMA
+# GCN-NEXT: S_NOP 5
+# GCN-NEXT: V_MFMA
+name:            dgemm4x4_mfma_write_agpr_mfma_srcb_read_overlap
+body:             |
+  bb.0:
+    $agpr2_agpr3 = V_MFMA_F64_4X4X4F64_e64 $vgpr0_vgpr1, $vgpr0_vgpr1, $agpr0_agpr1, 0, 0, 0, implicit $mode, implicit $exec
+    $vgpr2_vgpr3 = V_MFMA_F64_4X4X4F64_vgprcd_e64 $vgpr0_vgpr1, $agpr2_agpr3, $vgpr0_vgpr1, 0, 0, 0, implicit $mode, implicit $exec
+...
+# GCN-LABEL: name: dgemm16x16_mfma_write_agpr_mfma_srcb_read_overlap
+# GCN:      V_MFMA
+# GCN-NEXT: S_NOP 7
+# GCN-NEXT: S_NOP 2
+# GCN-NEXT: V_MFMA
+name:            dgemm16x16_mfma_write_agpr_mfma_srcb_read_overlap
+body:             |
+  bb.0:
+    $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 = V_MFMA_F64_16X16X4F64_e64 $vgpr10_vgpr11, $vgpr10_vgpr11, $agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17, 0, 0, 0, implicit $mode, implicit $exec
+    $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_MFMA_F64_16X16X4F64_vgprcd_e64 $vgpr10_vgpr11, $agpr0_agpr1, $vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17, 0, 0, 0, implicit $mode, implicit $exec
+...
+# GCN-LABEL: name: dmfma4x4_write_agpr_flat_read_overlap
+# GCN:      V_MFMA
+# GCN-NEXT: S_NOP 7
+# GCN-NEXT: S_NOP 0
+# GCN-NEXT: FLAT_STORE_DWORD
+name:            dmfma4x4_write_agpr_flat_read_overlap
+body:             |
+  bb.0:
+    $agpr4_agpr5 = V_MFMA_F64_4X4X4F64_e64 $vgpr0_vgpr1, $vgpr0_vgpr1, $agpr0_agpr1, 0, 0, 0, implicit $mode, implicit $exec
+    FLAT_STORE_DWORD $vgpr0_vgpr1, $agpr5, 0, 0, implicit $mode, implicit $exec, implicit $flat_scr
+...
+# GCN-LABEL: name: dmfma4x4_write_agpr_flat_read_full
+# GCN:      V_MFMA
+# GCN-NEXT: S_NOP 7
+# GCN-NEXT: S_NOP 0
+# GCN-NEXT: FLAT_STORE_DWORD
+name:            dmfma4x4_write_agpr_flat_read_full
+body:             |
+  bb.0:
+    $agpr4_agpr5 = V_MFMA_F64_4X4X4F64_e64 $vgpr0_vgpr1, $vgpr0_vgpr1, $agpr0_agpr1, 0, 0, 0, implicit $mode, implicit $exec
+    FLAT_STORE_DWORDX2 $vgpr0_vgpr1, $agpr4_agpr5, 0, 0, implicit $mode, implicit $exec, implicit $flat_scr
+...
+# GCN-LABEL: name: dmfma16x16_write_agpr_flat_read
+# GCN:      V_MFMA
+# GCN-NEXT: S_NOP 7
+# GCN-NEXT: S_NOP 7
+# GCN-NEXT: S_NOP 1
+# GCN-NEXT: FLAT_STORE_DWORD
+name:            dmfma16x16_write_agpr_flat_read
+body:             |
+  bb.0:
+    $agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9 = V_MFMA_F64_16X16X4F64_e64 $vgpr10_vgpr11, $vgpr10_vgpr11, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7, 0, 0, 0, implicit $mode, implicit $exec
+    FLAT_STORE_DWORD $vgpr0_vgpr1, $agpr4, 0, 0, implicit $mode, implicit $exec, implicit $flat_scr
+...
+# GCN-LABEL: name: dmfma4x4_write_agpr_valu_read
+# GCN:      V_MFMA
+# GCN-NEXT: S_NOP 5
+# GCN-NEXT: V_ACCVGPR_READ_B32_e64
+name:            dmfma4x4_write_agpr_valu_read
+body:             |
+  bb.0:
+    $agpr4_agpr5 = V_MFMA_F64_4X4X4F64_e64 $agpr0_agpr1, $vgpr0_vgpr1, $agpr0_agpr1, 0, 0, 0, implicit $mode, implicit $exec
+    $vgpr5 = V_ACCVGPR_READ_B32_e64 $agpr5, implicit $exec
+...
+# GCN-LABEL: name: dmfma16x16_write_agpr_valu_read
+# GCN:      V_MFMA
+# GCN-NEXT: S_NOP 7
+# GCN-NEXT: S_NOP 2
+# GCN-NEXT: V_ACCVGPR_READ_B32_e64
+name:            dmfma16x16_write_agpr_valu_read
+body:             |
+  bb.0:
+    $agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9 = V_MFMA_F64_16X16X4F64_e64 $vgpr10_vgpr11, $vgpr10_vgpr11, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7, 0, 0, 0, implicit $mode, implicit $exec
+    $vgpr12 = V_ACCVGPR_READ_B32_e64 $agpr4, implicit $exec
+...
+# GCN-LABEL: name: dmfma4x4_write_agpr_valu_write
+# GCN:      V_MFMA
+# GCN-NEXT: S_NOP 5
+# GCN-NEXT: V_ACCVGPR_WRITE_B32_e64
+name:            dmfma4x4_write_agpr_valu_write
+body:             |
+  bb.0:
+    $agpr4_agpr5 = V_MFMA_F64_4X4X4F64_e64 $vgpr0_vgpr1, $vgpr0_vgpr1, $agpr0_agpr1, 0, 0, 0, implicit $mode, implicit $exec
+    $agpr4 = V_ACCVGPR_WRITE_B32_e64 0, implicit $mode, implicit $exec
+...
+# GCN-LABEL: name: dmfma16x16_write_agpr_valu_write
+# GCN:      V_MFMA
+# GCN-NEXT: S_NOP 7
+# GCN-NEXT: S_NOP 2
+# GCN-NEXT: V_ACCVGPR_WRITE_B32_e64
+name:            dmfma16x16_write_agpr_valu_write
+body:             |
+  bb.0:
+    $agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9 = V_MFMA_F64_16X16X4F64_e64 $vgpr10_vgpr11, $vgpr10_vgpr11, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7, 0, 0, 0, implicit $mode, implicit $exec
+    $agpr3 = V_ACCVGPR_WRITE_B32_e64 0, implicit $mode, implicit $exec
+...
+# GCN-LABEL: name: dmfma4x4_read_srcc_agpr_valu_write
+# GCN:      V_MFMA
+# GCN-NEXT: V_ACCVGPR_WRITE_B32_e64
+name:            dmfma4x4_read_srcc_agpr_valu_write
+body:             |
+  bb.0:
+    $agpr4_agpr5 = V_MFMA_F64_4X4X4F64_e64 $vgpr0_vgpr1, $vgpr0_vgpr1, $agpr0_agpr1, 0, 0, 0, implicit $mode, implicit $exec
+    $agpr1 = V_ACCVGPR_WRITE_B32_e64 0, implicit $mode, implicit $exec
+...
+# GCN-LABEL: name: dmfma16x16_read_srcc_agpr_valu_write
+# GCN:      V_MFMA
+# GCN-NEXT: V_ACCVGPR_WRITE_B32_e64
+name:            dmfma16x16_read_srcc_agpr_valu_write
+body:             |
+  bb.0:
+    $agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9 = V_MFMA_F64_16X16X4F64_e64 $vgpr10_vgpr11, $vgpr10_vgpr11, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7, 0, 0, 0, implicit $mode, implicit $exec
+    $agpr1 = V_ACCVGPR_WRITE_B32_e64 0, implicit $mode, implicit $exec
+...
+# GCN-LABEL: name: dgemm_accvgr_to_fma64
+# GCN:      V_MFMA
+# GCN-NEXT: S_NOP 1
+# GCN-NEXT: V_FMA_F64_e64
+name:            dgemm_accvgr_to_fma64
+body:             |
+  bb.0:
+    $agpr0_agpr1 = V_MFMA_F64_4X4X4F64_e64 $vgpr0_vgpr1, $vgpr0_vgpr1, $agpr0_agpr1, 0, 0, 0, implicit $mode, implicit $exec
+    $vgpr4_vgpr5 = V_FMA_F64_e64 0, $vgpr4_vgpr5, 0, $vgpr4_vgpr5, 0, $vgpr4_vgpr5, 0, 0, implicit $mode, implicit $exec
+...
+# GCN-LABEL: name: dgemm_accvgr_to_fmac64
+# GCN:      V_MFMA
+# GCN-NEXT: S_NOP 1
+# GCN-NEXT: V_FMAC_F64
+name:            dgemm_accvgr_to_fmac64
+body:             |
+  bb.0:
+    $agpr0_agpr1 = V_MFMA_F64_4X4X4F64_e64 $agpr0_agpr1, $agpr0_agpr1, $agpr0_agpr1, 0, 0, 0, implicit $mode, implicit $exec
+    $vgpr4_vgpr5 = V_FMAC_F64_e32 $vgpr4_vgpr5, $vgpr4_vgpr5, $vgpr4_vgpr5, implicit $mode, implicit $exec
+...
+# GCN-LABEL: name: sgemm16X16X16_mfma_write_agpr_mfma_read_overlap
+# GCN:      V_MFMA
+# GCN-NEXT: S_NOP 4
+# GCN-NEXT: V_MFMA
+name:            sgemm16X16X16_mfma_write_agpr_mfma_read_overlap
+body:             |
+  bb.0:
+    $agpr2_agpr3_agpr4_agpr5 = V_MFMA_F32_16X16X16F16_e64 $vgpr0_vgpr1, $vgpr2_vgpr3, $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec
+    $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_16X16X16F16_e64 $vgpr0_vgpr1, $vgpr2_vgpr3, $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec
+...
+# GCN-LABEL: name: sgemm16X16X32_mfma_write_agpr_mfma_read_overlap
+# GCN:      V_MFMA
+# GCN-NEXT: S_NOP 4
+# GCN-NEXT: V_MFMA
+name:            sgemm16X16X32_mfma_write_agpr_mfma_read_overlap
+body:             |
+  bb.0:
+    $agpr2_agpr3_agpr4_agpr5 = V_MFMA_I32_16X16X32I8_e64 $vgpr0_vgpr1, $vgpr2_vgpr3, $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec
+    $agpr0_agpr1_agpr2_agpr3 = V_MFMA_I32_16X16X32I8_e64 $vgpr0_vgpr1, $vgpr2_vgpr3, $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec
+...
+# GCN-LABEL: name: sgemm16X16X16_mfma_write_agpr_dgemm_read_overlap
+# GCN:      V_MFMA
+# GCN-NEXT: S_NOP 4
+# GCN-NEXT: V_MFMA
+name:            sgemm16X16X16_mfma_write_agpr_dgemm_read_overlap
+body:             |
+  bb.0:
+    $vgpr2_vgpr3_vgpr4_vgpr5 = V_MFMA_F32_16X16X16F16_vgprcd_e64 $vgpr8_vgpr9, $vgpr8_vgpr9, $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, 0, implicit $mode, implicit $exec
+    $vgpr6_vgpr7 = V_MFMA_F64_4X4X4F64_vgprcd_e64 $vgpr0_vgpr1, $vgpr0_vgpr1, $vgpr2_vgpr3, 0, 0, 0, implicit $mode, implicit $exec
+...
+# GCN-LABEL: name: sgemm16X16X16_mfma_write_agpr_smfmac_read_overlap
+# GCN:      V_MFMA
+# GCN-NEXT: S_NOP 4
+# GCN-NEXT: V_SMFMAC
+name:            sgemm16X16X16_mfma_write_agpr_smfmac_read_overlap
+body:             |
+  bb.0:
+    $agpr2_agpr3_agpr4_agpr5 = V_MFMA_F32_16X16X16F16_e64 $vgpr0_vgpr1, $vgpr2_vgpr3, $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec
+    $agpr0_agpr1_agpr2_agpr3 = V_SMFMAC_F32_16X16X32_F16_e64 $vgpr0_vgpr1, $vgpr2_vgpr3_vgpr4_vgpr5, $vgpr32, 0, 0, $agpr0_agpr1_agpr2_agpr3, implicit $mode, implicit $exec
+...
+# GCN-LABEL: name: smfmac16x16_write_agpr_smfmac_read_overlap
+# GCN:      V_SMFMAC
+# GCN-NEXT: S_NOP 4
+# GCN-NEXT: V_SMFMAC
+name:            smfmac16x16_write_agpr_smfmac_read_overlap
+body:             |
+  bb.0:
+    $agpr2_agpr3_agpr4_agpr5 = V_SMFMAC_F32_16X16X32_F16_e64 $vgpr0_vgpr1, $vgpr2_vgpr3_vgpr4_vgpr5, $vgpr32, 0, 0, $agpr2_agpr3_agpr4_agpr5, implicit $mode, implicit $exec
+    $agpr0_agpr1_agpr2_agpr3 = V_SMFMAC_F32_16X16X32_F16_e64 $vgpr0_vgpr1, $vgpr2_vgpr3_vgpr4_vgpr5, $vgpr32, 0, 0, $agpr0_agpr1_agpr2_agpr3, implicit $mode, implicit $exec
+...
+# GCN-LABEL: name: xdl_sgemm16X16X16_mfma_write_agpr_mfma_srca_read_overlap
+# GCN:      V_MFMA
+# GCN-NEXT: S_NOP 6
+# GCN-NEXT: V_MFMA
+name:            xdl_sgemm16X16X16_mfma_write_agpr_mfma_srca_read_overlap
+body:             |
+  bb.0:
+    $agpr0_agpr1_agpr2_agpr3 = V_MFMA_I32_16X16X32I8_e64 $vgpr0_vgpr1, $vgpr2_vgpr3, $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec
+    $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 $agpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec
+...
+# GCN-LABEL: name: xdl_sgemm16X16X32_mfma_write_agpr_mfma_srcb_read_overlap
+# GCN:      V_MFMA
+# GCN-NEXT: S_NOP 6
+# GCN-NEXT: V_MFMA
+name:            xdl_sgemm16X16X32_mfma_write_agpr_mfma_srcb_read_overlap
+body:             |
+  bb.0:
+    $agpr0_agpr1_agpr2_agpr3 = V_MFMA_I32_16X16X32I8_e64 $vgpr0_vgpr1, $vgpr2_vgpr3, $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec
+    $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 $vgpr0, $agpr1, $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec
+...
+# GCN-LABEL: name: xdl_sgemm16X16X16_mfma_write_vgpr_dmfma16x16_srca_read_overlap
+# GCN:      V_MFMA
+# GCN-NEXT: S_NOP 6
+# GCN-NEXT: V_MFMA
+name:            xdl_sgemm16X16X16_mfma_write_vgpr_dmfma16x16_srca_read_overlap
+body:             |
+  bb.0:
+    $vgpr0_vgpr1_vgpr2_vgpr3 = V_MFMA_I32_16X16X32I8_vgprcd_e64 $vgpr0_vgpr1, $vgpr2_vgpr3, $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, 0, implicit $mode, implicit $exec
+    $vgpr2_vgpr3 = V_MFMA_F64_4X4X4F64_vgprcd_e64 $vgpr0_vgpr1, $vgpr4_vgpr5, $vgpr4_vgpr5, 0, 0, 0, implicit $mode, implicit $exec
+...
+# GCN-LABEL: name: xdl_sgemm16X16X16_mfma_write_vgpr_valu_write
+# GCN:      V_MFMA
+# GCN-NEXT: S_NOP 6
+# GCN-NEXT: V_MOV_B32
+name:            xdl_sgemm16X16X16_mfma_write_vgpr_valu_write
+body:             |
+  bb.0:
+    $vgpr0_vgpr1_vgpr2_vgpr3 = V_MFMA_I32_16X16X32I8_vgprcd_e64 $vgpr0_vgpr1, $vgpr2_vgpr3, $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, 0, implicit $mode, implicit $exec
+    $vgpr1 = V_MOV_B32_e32 0, implicit $mode, implicit $exec
+...
+# GCN-LABEL: name: xdl_sgemm16X16X16_mfma_write_vgpr_vm_read
+# GCN:      V_MFMA
+# GCN-NEXT: S_NOP 6
+# GCN-NEXT: BUFFER_STORE_DWORD
+name:            xdl_sgemm16X16X16_mfma_write_vgpr_vm_read
+body:             |
+  bb.0:
+    $vgpr0_vgpr1_vgpr2_vgpr3 = V_MFMA_I32_16X16X32I8_vgprcd_e64 $vgpr0_vgpr1, $vgpr2_vgpr3, $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, 0, implicit $mode, implicit $exec
+    BUFFER_STORE_DWORD_OFFSET $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+...
+# GCN-LABEL: name: xdl_sgemm16X16X16_mfma_write_vgpr_valu_read
+# GCN:      V_MFMA
+# GCN-NEXT: S_NOP 6
+# GCN-NEXT: V_MOV_B32
+name:            xdl_sgemm16X16X16_mfma_write_vgpr_valu_read
+body:             |
+  bb.0:
+    $vgpr0_vgpr1_vgpr2_vgpr3 = V_MFMA_I32_16X16X32I8_vgprcd_e64 $vgpr0_vgpr1, $vgpr2_vgpr3, $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, 0, implicit $mode, implicit $exec
+    $vgpr4 = V_MOV_B32_e32 $vgpr0, implicit $mode, implicit $exec
+...
+# GCN-LABEL: name: xdl_sgemm16X16X16_mfma_write_vgpr_dot_read
+# GCN:      V_MFMA
+# GCN-NEXT: S_NOP 6
+# GCN-NEXT: V_DOT
+name:            xdl_sgemm16X16X16_mfma_write_vgpr_dot_read
+body:             |
+  bb.0:
+    $vgpr0_vgpr1_vgpr2_vgpr3 = V_MFMA_I32_16X16X32I8_vgprcd_e64 $vgpr0_vgpr1, $vgpr2_vgpr3, $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, 0, implicit $mode, implicit $exec
+    $vgpr4 = V_DOT4C_I32_I8_e32 $vgpr0, $vgpr1, $vgpr4, implicit $exec
+...
+# GCN-LABEL: name: smfmac16x16x32_write_agpr_mfma_read_same_agpr_as_srcc
+# GCN:      V_SMFMAC
+# GCN-NEXT: V_SMFMAC
+name:            smfmac16x16x32_write_agpr_mfma_read_same_agpr_as_srcc
+body:             |
+  bb.0:
+    $agpr0_agpr1_agpr2_agpr3 = V_SMFMAC_F32_16X16X32_F16_e64 $vgpr0_vgpr1, $vgpr2_vgpr3_vgpr4_vgpr5, $vgpr32, 0, 0, $agpr0_agpr1_agpr2_agpr3, implicit $mode, implicit $exec
+    $agpr0_agpr1_agpr2_agpr3 = V_SMFMAC_F32_16X16X32_F16_e64 $vgpr0_vgpr1, $vgpr2_vgpr3_vgpr4_vgpr5, $vgpr32, 0, 0, $agpr0_agpr1_agpr2_agpr3, implicit $mode, implicit $exec
+...
+# GCN-LABEL: name: smfmac32x32x32_write_agpr_mfma_read_same_agpr_as_srcc
+# GCN:      V_SMFMAC
+# GCN-NEXT: V_SMFMAC
+name:            smfmac32x32x32_write_agpr_mfma_read_same_agpr_as_srcc
+body:             |
+  bb.0:
+    $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_SMFMAC_I32_32X32X32_I8_e64 $vgpr0_vgpr1, $vgpr2_vgpr3_vgpr4_vgpr5, $vgpr32, 0, 0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, implicit $mode, implicit $exec
+    $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_SMFMAC_I32_32X32X32_I8_e64 $vgpr0_vgpr1, $vgpr2_vgpr3_vgpr4_vgpr5, $vgpr32, 0, 0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, implicit $mode, implicit $exec
+...
+# GCN-LABEL: name: smfmac16x16x32_mfma_write_agpr_mfma_read_overlap
+# GCN:      V_SMFMAC
+# GCN-NEXT: S_NOP 4
+# GCN-NEXT: V_SMFMAC
+name:            smfmac16x16x32_mfma_write_agpr_mfma_read_overlap
+body:             |
+  bb.0:
+    $agpr0_agpr1_agpr2_agpr3 = V_SMFMAC_F32_16X16X32_F16_e64 $vgpr0_vgpr1, $vgpr2_vgpr3_vgpr4_vgpr5, $vgpr32, 0, 0, $agpr0_agpr1_agpr2_agpr3, implicit $mode, implicit $exec
+    $agpr2_agpr3_agpr4_agpr5 = V_SMFMAC_F32_16X16X32_F16_e64 $vgpr0_vgpr1, $vgpr2_vgpr3_vgpr4_vgpr5, $vgpr32, 0, 0, $agpr2_agpr3_agpr4_agpr5, implicit $mode, implicit $exec
+...
+# GCN-LABEL: name: smfmac32x32x32_mfma_write_agpr_mfma_read_overlap
+# GCN:      V_SMFMAC
+# GCN-NEXT: S_NOP 7
+# GCN-NEXT: S_NOP 0
+# GCN-NEXT: V_SMFMAC
+name:            smfmac32x32x32_mfma_write_agpr_mfma_read_overlap
+body:             |
+  bb.0:
+    $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_SMFMAC_F32_32X32X16_BF16_e64 $vgpr0_vgpr1, $vgpr2_vgpr3_vgpr4_vgpr5, $vgpr32, 0, 0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, implicit $mode, implicit $exec
+    $agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17 = V_SMFMAC_F32_32X32X16_BF16_e64 $vgpr0_vgpr1, $vgpr2_vgpr3_vgpr4_vgpr5, $vgpr32, 0, 0, $agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17, implicit $mode, implicit $exec
+...
+# GCN-LABEL: name: smfmac16x16x32_mfma_write_vgpr_smfmac_read_idx
+# GCN:      V_SMFMAC
+# GCN-NEXT: S_NOP 6
+# GCN-NEXT: V_SMFMAC
+name:            smfmac16x16x32_mfma_write_vgpr_smfmac_read_idx
+body:             |
+  bb.0:
+    $vgpr6_vgpr7_vgpr8_vgpr9 = V_SMFMAC_F32_16X16X32_F16_e64 $vgpr0_vgpr1, $vgpr2_vgpr3_vgpr4_vgpr5, $vgpr32, 0, 0, $vgpr6_vgpr7_vgpr8_vgpr9, implicit $mode, implicit $exec
+    $vgpr0_vgpr1_vgpr2_vgpr3 = V_SMFMAC_F32_16X16X32_F16_e64 $vgpr0_vgpr1, $vgpr2_vgpr3_vgpr4_vgpr5, $vgpr6, 0, 0, $vgpr0_vgpr1_vgpr2_vgpr3, implicit $mode, implicit $exec
+...
+# GCN-LABEL: name: dgemm4x4_mfma_write_vgpr_smfmac16x16x32_read_overlap
+# GCN:      V_MFMA
+# GCN-NEXT: V_SMFMAC
+name:            dgemm4x4_mfma_write_vgpr_smfmac16x16x32_read_overlap
+body:             |
+  bb.0:
+    $vgpr2_vgpr3 = V_MFMA_F64_4X4X4F64_vgprcd_e64 $vgpr0_vgpr1, $vgpr0_vgpr1, $vgpr0_vgpr1, 0, 0, 0, implicit $mode, implicit $exec
+    $vgpr0_vgpr1_vgpr2_vgpr3 = V_SMFMAC_F32_16X16X32_BF16_e64 $vgpr10_vgpr11, $vgpr12_vgpr13_vgpr14_vgpr15, $vgpr32, 0, 0, $vgpr0_vgpr1_vgpr2_vgpr3, implicit $mode, implicit $exec
+...
+# GCN-LABEL: name: dgemm16x16_mfma_write_vgpr_mfmai8_read_overlap
+# GCN:      V_MFMA
+# GCN-NEXT: V_MFMA
+name:            dgemm16x16_mfma_write_vgpr_mfmai8_read_overlap
+body:             |
+  bb.0:
+    $vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9 = V_MFMA_F64_16X16X4F64_vgprcd_e64 $vgpr10_vgpr11, $vgpr10_vgpr11, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, 0, implicit $mode, implicit $exec
+    $vgpr0_vgpr1_vgpr2_vgpr3 = V_MFMA_I32_16X16X32I8_vgprcd_e64 $vgpr10_vgpr11, $vgpr12_vgpr13, $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, 0, implicit $mode, implicit $exec
+...
+# GCN-LABEL: name: dgemm16x16_mfma_write_vgpr_mfmaxf32_read_overlap
+# GCN:      V_MFMA
+# GCN-NEXT: V_MFMA
+name:            dgemm16x16_mfma_write_vgpr_mfmaxf32_read_overlap
+body:             |
+  bb.0:
+    $vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9 = V_MFMA_F64_16X16X4F64_vgprcd_e64 $vgpr10_vgpr11, $vgpr10_vgpr11, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, 0, implicit $mode, implicit $exec
+    $vgpr0_vgpr1_vgpr2_vgpr3 = V_MFMA_F32_16X16X8XF32_vgprcd_e64 $vgpr10_vgpr11, $vgpr12_vgpr13, $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, 0, implicit $mode, implicit $exec
+...
+# GCN-LABEL: name: nonxdl_sgemm4x4_mfma_write_agpr_nonxdl_mfma_read_overlap
+# GCN:      V_MFMA
+# GCN-NEXT: S_NOP 1
+# GCN-NEXT: V_MFMA
+name:            nonxdl_sgemm4x4_mfma_write_agpr_nonxdl_mfma_read_overlap
+body:             |
+  bb.0:
+    $agpr2_agpr3_agpr4_agpr5 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec
+    $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec
+...
+# GCN-LABEL: name: nonxdl_sgemm4x4_mfma_write_agpr_xdl_mfma_read_overlap
+# GCN:      V_MFMA
+# GCN-NEXT: V_MFMA
+name:            nonxdl_sgemm4x4_mfma_write_agpr_xdl_mfma_read_overlap
+body:             |
+  bb.0:
+    $agpr2_agpr3_agpr4_agpr5 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec
+    $agpr0_agpr1_agpr2_agpr3 = V_MFMA_I32_4X4X4I8_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec
+...
+# GCN-LABEL: name: xdl_sgemm4x4_mfma_write_agpr_mfma_read_overlap
+# GCN:      V_MFMA
+# GCN-NEXT: S_NOP 2
+# GCN-NEXT: V_MFMA
+name:            xdl_sgemm4x4_mfma_write_agpr_mfma_read_overlap
+body:             |
+  bb.0:
+    $agpr2_agpr3_agpr4_agpr5 = V_MFMA_I32_4X4X4I8_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec
+    $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec
+...
+# GCN-LABEL: name: nonxdl_sgemm4x4_mfma_write_agpr_mfma_srca_read_overlap
+# GCN:      V_MFMA
+# GCN-NEXT: S_NOP 3
+# GCN-NEXT: V_MFMA
+name:            nonxdl_sgemm4x4_mfma_write_agpr_mfma_srca_read_overlap
+body:             |
+  bb.0:
+    $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec
+    $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 $agpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec
+...
+# GCN-LABEL: name: nonxdl_smfma4x4_write_vgpr_vm_read
+# GCN:      V_MFMA
+# GCN-NEXT: S_NOP 3
+# GCN-NEXT: BUFFER_STORE_DWORD
+name:            nonxdl_smfma4x4_write_vgpr_vm_read
+body:             |
+  bb.0:
+    $vgpr0_vgpr1_vgpr2_vgpr3 = V_MFMA_F32_4X4X1F32_vgprcd_e64 $vgpr1, $vgpr0, $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, 0, implicit $mode, implicit $exec
+    BUFFER_STORE_DWORD_OFFSET $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+...
+# GCN-LABEL: name: nonxdl_smfma4x4_write_vgpr_valu_read
+# GCN:      V_MFMA
+# GCN-NEXT: S_NOP 3
+# GCN-NEXT: V_MOV_B32
+name:            nonxdl_smfma4x4_write_vgpr_valu_read
+body:             |
+  bb.0:
+    $vgpr0_vgpr1_vgpr2_vgpr3 = V_MFMA_F32_4X4X1F32_vgprcd_e64 $vgpr1, $vgpr0, $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, 0, implicit $mode, implicit $exec
+    $vgpr4 = V_MOV_B32_e32 $vgpr0, implicit $mode, implicit $exec
+...
+# GCN-LABEL: name: nonxdl_smfma4x4_write_vgpr_valu_write
+# GCN:      V_MFMA
+# GCN-NEXT: S_NOP 3
+# GCN-NEXT: V_MOV_B32
+name:            nonxdl_smfma4x4_write_vgpr_valu_write
+body:             |
+  bb.0:
+    $vgpr0_vgpr1_vgpr2_vgpr3 = V_MFMA_F32_4X4X1F32_vgprcd_e64 $vgpr4, $vgpr0, $vgpr6_vgpr7_vgpr8_vgpr9, 0, 0, 0, implicit $mode, implicit $exec
+    $vgpr1 = V_MOV_B32_e32 0, implicit $mode, implicit $exec
+...
+# GCN-LABEL: name: nonxdl_8pass_smfma16x16_write_vgpr_vm_read
+# GCN:      V_MFMA
+# GCN-NEXT: S_NOP 7
+# GCN-NEXT: S_NOP 1
+# GCN-NEXT: BUFFER_STORE_DWORD
+name:            nonxdl_8pass_smfma16x16_write_vgpr_vm_read
+body:             |
+  bb.0:
+    $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = V_MFMA_F32_16X16X1F32_vgprcd_e64 $vgpr26, $vgpr27, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 0, 0, 0, implicit $mode, implicit $exec
+    BUFFER_STORE_DWORD_OFFSET $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+...
+# GCN-LABEL: name: nonxdl_8pass_smfma16x16_write_vgpr_valu_read
+# GCN:      V_MFMA
+# GCN-NEXT: S_NOP 7
+# GCN-NEXT: S_NOP 1
+# GCN-NEXT: V_MOV_B32
+name:            nonxdl_8pass_smfma16x16_write_vgpr_valu_read
+body:             |
+  bb.0:
+    $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = V_MFMA_F32_16X16X1F32_vgprcd_e64 $vgpr26, $vgpr27, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 0, 0, 0, implicit $mode, implicit $exec
+    $vgpr4 = V_MOV_B32_e32 $vgpr0, implicit $mode, implicit $exec
+...
+# GCN-LABEL: name: nonxdl_8pass_smfma16x16_write_vgpr_valu_write
+# GCN:      V_MFMA
+# GCN-NEXT: S_NOP 7
+# GCN-NEXT: S_NOP 1
+# GCN-NEXT: V_MOV_B32
+name:            nonxdl_8pass_smfma16x16_write_vgpr_valu_write
+body:             |
+  bb.0:
+    $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = V_MFMA_F32_16X16X1F32_vgprcd_e64 $vgpr26, $vgpr27, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 0, 0, 0, implicit $mode, implicit $exec
+    $vgpr1 = V_MOV_B32_e32 0, implicit $mode, implicit $exec
+...
+# GCN-LABEL: name: nonxdl_smfma32x32_write_vgpr_vm_read
+# GCN:      V_MFMA
+# GCN-NEXT: S_NOP 7
+# GCN-NEXT: S_NOP 7
+# GCN-NEXT: S_NOP 1
+# GCN-NEXT: BUFFER_STORE_DWORD
+name:            nonxdl_smfma32x32_write_vgpr_vm_read
+body:             |
+  bb.0:
+    $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = V_MFMA_F32_32X32X1F32_vgprcd_e64 $agpr26, $agpr28, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, 0, 0, 0, implicit $mode, implicit $exec
+    BUFFER_STORE_DWORD_OFFSET $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+...
+# GCN-LABEL: name: nonxdl_smfma32x32_write_vgpr_valu_read
+# GCN:      V_MFMA
+# GCN-NEXT: S_NOP 7
+# GCN-NEXT: S_NOP 7
+# GCN-NEXT: S_NOP 1
+# GCN-NEXT: V_MOV_B32
+name:            nonxdl_smfma32x32_write_vgpr_valu_read
+body:             |
+  bb.0:
+    $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = V_MFMA_F32_32X32X1F32_vgprcd_e64 $agpr26, $agpr28, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, 0, 0, 0, implicit $mode, implicit $exec
+    $vgpr4 = V_MOV_B32_e32 $vgpr0, implicit $mode, implicit $exec
+...
+# GCN-LABEL: name: nonxdl_smfma32x32_write_vgpr_valu_write
+# GCN:      V_MFMA
+# GCN-NEXT: S_NOP 7
+# GCN-NEXT: S_NOP 7
+# GCN-NEXT: S_NOP 1
+# GCN-NEXT: V_MOV_B32
+name:            nonxdl_smfma32x32_write_vgpr_valu_write
+body:             |
+  bb.0:
+    $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = V_MFMA_F32_32X32X1F32_vgprcd_e64 $agpr26, $agpr28, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, 0, 0, 0, implicit $mode, implicit $exec
+    $vgpr1 = V_MOV_B32_e32 0, implicit $mode, implicit $exec
+...
+# GCN-LABEL: name: xdl_sgemm16x16_4pass_mfma_write_agpr_mfma_read_overlap
+# GCN:      V_MFMA
+# GCN-NEXT: S_NOP 4
+# GCN-NEXT: V_MFMA
+name:            xdl_sgemm16x16_4pass_mfma_write_agpr_mfma_read_overlap
+body:             |
+  bb.0:
+    $agpr2_agpr3_agpr4_agpr5 = V_MFMA_I32_16X16X32I8_e64 $vgpr0_vgpr1, $vgpr2_vgpr3, $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec
+    $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_16X16X8XF32_e64 $vgpr10_vgpr11, $vgpr12_vgpr13, $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec
+...
+# GCN-LABEL: name: smfmac16x16_mfma_write_agpr_mfma_read_overlap
+# GCN:      V_SMFMAC
+# GCN-NEXT: S_NOP 4
+# GCN-NEXT: V_MFMA
+name:            smfmac16x16_mfma_write_agpr_mfma_read_overlap
+body:             |
+  bb.0:
+    $agpr2_agpr3_agpr4_agpr5 = V_SMFMAC_F32_16X16X32_F16_e64 $vgpr0_vgpr1, $vgpr2_vgpr3_vgpr4_vgpr5, $vgpr32, 0, 0, $agpr2_agpr3_agpr4_agpr5, implicit $mode, implicit $exec
+    $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_16X16X8XF32_e64 $vgpr10_vgpr11, $vgpr12_vgpr13, $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec
+...
+# GCN-LABEL: name: xdl_sgemm4x4_mfma_read_vgpr_srcc_valu_write
+# GCN:      V_MFMA
+# GCN-NEXT: S_NOP 0
+# GCN-NEXT: V_MOV_B32
+name:            xdl_sgemm4x4_mfma_read_vgpr_srcc_valu_write
+body:             |
+  bb.0:
+    $vgpr10_vgpr11_vgpr12_vgpr13 = V_MFMA_I32_4X4X4I8_vgprcd_e64 $vgpr4, $vgpr5, $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, 0, implicit $mode, implicit $exec
+    $vgpr1 = V_MOV_B32_e32 0, implicit $exec
+...
+# GCN-LABEL: name: nonxdl_sgemm4x4_mfma_read_vgpr_srcc_valu_write
+# GCN:      V_MFMA
+# GCN-NEXT: V_MOV_B32
+name:            nonxdl_sgemm4x4_mfma_read_vgpr_srcc_valu_write
+body:             |
+  bb.0:
+    $vgpr10_vgpr11_vgpr12_vgpr13 = V_MFMA_F32_4X4X1F32_vgprcd_e64 $vgpr4, $vgpr5, $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, 0, implicit $mode, implicit $exec
+    $vgpr1 = V_MOV_B32_e32 0, implicit $exec
+...
+# GCN-LABEL: name: xdl_4pass_sgemm16x16_mfma_read_vgpr_srcc_valu_write
+# GCN:      V_MFMA
+# GCN-NEXT: S_NOP 2
+# GCN-NEXT: V_MOV_B32
+name:            xdl_4pass_sgemm16x16_mfma_read_vgpr_srcc_valu_write
+body:             |
+  bb.0:
+    $vgpr10_vgpr11_vgpr12_vgpr13 = V_MFMA_F32_16X16X8XF32_vgprcd_e64 $vgpr10_vgpr11, $vgpr12_vgpr13, $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, 0, implicit $mode, implicit $exec
+    $vgpr1 = V_MOV_B32_e32 0, implicit $exec
+...
+# GCN-LABEL: name: smfmac16x16_read_vgpr_srcc_valu_write
+# GCN:      V_SMFMAC
+# GCN-NEXT: S_NOP 6
+# GCN-NEXT: V_MOV_B32
+name:            smfmac16x16_read_vgpr_srcc_valu_write
+body:             |
+  bb.0:
+    $vgpr0_vgpr1_vgpr2_vgpr3 = V_SMFMAC_F32_16X16X32_F16_e64 $vgpr0_vgpr1, $vgpr2_vgpr3_vgpr4_vgpr5, $vgpr32, 0, 0, $vgpr0_vgpr1_vgpr2_vgpr3, implicit $mode, implicit $exec
+    $vgpr1 = V_MOV_B32_e32 0, implicit $exec
+...
+# GCN-LABEL: name: xdl_8pass_sgemm16x16_mfma_read_vgpr_srcc_valu_write
+# GCN:      V_MFMA
+# GCN-NEXT: S_NOP 6
+# GCN-NEXT: V_MOV_B32
+name:            xdl_8pass_sgemm16x16_mfma_read_vgpr_srcc_valu_write
+body:             |
+  bb.0:
+    $vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115 = V_MFMA_I32_16X16X4I8_vgprcd_e64 $agpr26, $agpr27, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 0, 0, 0, implicit $mode, implicit $exec
+    $vgpr1 = V_MOV_B32_e32 0, implicit $exec
+...
+# GCN-LABEL: name: nonxdl_8pass_sgemm16x16_mfma_read_vgpr_srcc_valu_write
+# GCN:      V_MFMA
+# GCN-NEXT: V_MOV_B32
+name:            nonxdl_8pass_sgemm16x16_mfma_read_vgpr_srcc_valu_write
+body:             |
+  bb.0:
+    $vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115 = V_MFMA_F32_16X16X1F32_vgprcd_e64 $agpr26, $agpr27, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 0, 0, 0, implicit $mode, implicit $exec
+    $vgpr1 = V_MOV_B32_e32 0, implicit $exec
+...
+# GCN-LABEL: name: smfmac32x32_read_vgpr_srcc_valu_write
+# GCN:      V_SMFMAC
+# GCN-NEXT: S_NOP 7
+# GCN-NEXT: S_NOP 2
+# GCN-NEXT: V_MOV_B32
+name:            smfmac32x32_read_vgpr_srcc_valu_write
+body:             |
+  bb.0:
+    $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = V_SMFMAC_I32_32X32X32_I8_e64 $agpr0_agpr1, $agpr2_agpr3_agpr4_agpr5, $vgpr2, 0, 0, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, implicit $mode, implicit $exec
+    $vgpr1 = V_MOV_B32_e32 0, implicit $exec
+...
+# GCN-LABEL: name: xdl_sgemm32x32_mfma_read_vgpr_srcc_valu_write
+# GCN:      V_MFMA
+# GCN-NEXT: S_NOP 7
+# GCN-NEXT: S_NOP 6
+# GCN-NEXT: V_MOV_B32
+name:            xdl_sgemm32x32_mfma_read_vgpr_srcc_valu_write
+body:             |
+  bb.0:
+    $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71 = V_MFMA_F32_32X32X4F16_vgprcd_e64 $agpr126_agpr127, $agpr128_agpr129, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, 0, 0, 0, implicit $mode, implicit $exec
+    $vgpr1 = V_MOV_B32_e32 0, implicit $exec
+...
+# GCN-LABEL: name: nonxdl_sgemm32x32_mfma_read_vgpr_srcc_valu_write
+# GCN:      V_MFMA
+# GCN-NEXT: V_MOV_B32
+name:            nonxdl_sgemm32x32_mfma_read_vgpr_srcc_valu_write
+body:             |
+  bb.0:
+    $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71 = V_MFMA_F32_32X32X1F32_vgprcd_e64 $agpr26, $agpr28, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, 0, 0, 0, implicit $mode, implicit $exec
+    $vgpr1 = V_MOV_B32_e32 0, implicit $exec
+...


        


More information about the llvm-commits mailing list