[llvm-branch-commits] [llvm] [AMDGPU] Handle gfx1251 wmma hazard (PR #203145)

Stanislav Mekhanoshin via llvm-branch-commits llvm-branch-commits at lists.llvm.org
Wed Jun 10 18:47:38 PDT 2026


https://github.com/rampitec updated https://github.com/llvm/llvm-project/pull/203145

>From 4b97a8d80692d327390d5de2c185f3d36e40791f Mon Sep 17 00:00:00 2001
From: Stanislav Mekhanoshin <Stanislav.Mekhanoshin at amd.com>
Date: Wed, 10 Jun 2026 18:08:52 -0700
Subject: [PATCH] [AMDGPU] Handle gfx1251 wmma hazard

The gfx12-5-generic target is affected too, in a pessimistic way.
---
 llvm/lib/Target/AMDGPU/AMDGPU.td              |    8 +-
 .../lib/Target/AMDGPU/GCNHazardRecognizer.cpp |   43 +-
 .../AMDGPU/wmma-coexecution-valu-hazards.mir  |  895 ++++++++++
 .../AMDGPU/wmma-hazards-gfx1250-w32.mir       | 1537 +++++++++++++++++
 .../AMDGPU/wmma-hazards-gfx1251-w32.mir       |   42 +
 5 files changed, 2514 insertions(+), 11 deletions(-)
 create mode 100644 llvm/test/CodeGen/AMDGPU/wmma-hazards-gfx1251-w32.mir

diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td
index 5961ef8967b67..c6b54c52817c8 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.td
@@ -982,6 +982,10 @@ defm GFX1251GEMMInsts : AMDGPUSubtargetFeature<"gfx1251-gemm-insts",
   "Has additional gfx1251 DGEMM instructions"
 >;
 
+defm GFX125xLowestRateWMMA : AMDGPUSubtargetFeature<"gfx125x-lowest-rate-wmma",
+  "Has the lowest rate wmma in the gfx125x family, mainly for the gfx12-5-generic"
+>;
+
 defm FlatBufferGlobalAtomicFaddF64Inst : AMDGPUSubtargetFeature<"flat-buffer-global-fadd-f64-inst",
   "Has flat, buffer, and global instructions for f64 atomic fadd"
 >;
@@ -2219,6 +2223,7 @@ def FeatureISAVersion12_51 : FeatureSet<
    FeatureVOP3PX2IncrementsVaVdstTwice,
    FeatureDPALU_DPP,
    FeatureGFX1251GEMMInsts,
+   FeatureGFX125xLowestRateWMMA,
    FeatureCubeInsts,
    FeatureLerpInst,
    FeatureSadInsts,
@@ -2241,7 +2246,8 @@ def FeatureISAVersion12_5_Generic: FeatureSet<
   [FeatureAddressableLocalMemorySize327680,
    FeatureSetregVGPRMSBFixup,
    FeatureRequiresCOV6,
-   FeatureXNACKAnyOnly])>;
+   FeatureXNACKAnyOnly,
+   FeatureGFX125xLowestRateWMMA])>;
 
 def FeatureISAVersion13 : FeatureSet<
   [FeatureGFX13,
diff --git a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
index 7ec0b2564d5a9..e2c6b25dc4319 100644
--- a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
@@ -2091,11 +2091,27 @@ static bool isCoexecutableVALUInst(const MachineInstr &MI) {
 //
 // Category 3: SWMMAC with Latency 16
 //   SWMMAC_IU8
-static unsigned
-getWMMAHazardInstInCategory(const MachineInstr &MI, const SIInstrInfo *TII,
-                            const TargetSchedModel &SchedModel) {
+//
+// Category 4: 16 Pass GFX1251 WMMA with latency 16
+//   V_WMMA_*_16X16X32_{F16,BF16}
+//   V_WMMA_{F32,F16}_16X16X64_{FP8,BF8}*
+//   V_WMMA_F32_16x16x128_F8F6F4 (F4 only)
+//   V_SWMMAC_*_16X16X64_{F16,BF16}
+//   V_SWMMAC_{F32,F16}_16X16X128_{FP8,BF8}*
+//
+// Category 5: 32 Pass GFX1251 WMMA with latency 32
+//   V_WMMA_F32_16x16x128_F8F6F4 (not all F4)
+//   V_WMMA_{F32,F16}_16X16X128_{FP8,BF8}*
+//   V_WMMA_F32_32X16X128_F4
+//   V_WMMA_I32_16X16X64_IU8
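+//   V_SWMMAC_I32_16X16X128_IU8 (NOTE(review): original repeated V_WMMA_I32_16X16X64_IU8 here; presumably the SWMMAC form was meant - confirm)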
+static unsigned getWMMAHazardInstInCategory(const MachineInstr &MI,
+                                            const SIInstrInfo *TII,
+                                            const TargetSchedModel &SchedModel,
+                                            const GCNSubtarget &ST) {
   assert(TII->isXDLWMMA(MI) && "must be xdl wmma");
   bool IsSWMMAC = SIInstrInfo::isSWMMAC(MI);
+  bool IsLowestRateWMMA = ST.hasGFX125xLowestRateWMMA();
   unsigned Category = 0;
 
   unsigned Latency = SchedModel.computeInstrLatency(&MI);
@@ -2104,7 +2120,11 @@ getWMMAHazardInstInCategory(const MachineInstr &MI, const SIInstrInfo *TII,
     Category = IsSWMMAC ? 2 : 0;
     break;
   case 16:
-    Category = IsSWMMAC ? 3 : 1;
+    Category = IsLowestRateWMMA ? 4 : (IsSWMMAC ? 3 : 1);
+    break;
+  case 32:
+    assert(IsLowestRateWMMA && "latency 32 is not expected");
+    Category = 5;
     break;
   default:
     llvm_unreachable("unexpected xdl wmma latency");
@@ -2126,15 +2146,15 @@ int GCNHazardRecognizer::checkWMMACoexecutionHazards(MachineInstr *MI) const {
   // (WMMAWaitStates if the second is also a WMMA, VALUWaitStates if the second
   // is a VALU). Refer to SPG 4.6.12.1. "Requirements for WMMA data hazards" for
   // numbers, which depends on the category of the first WMMA.
-  const int WMMAWaitStates[] = {5, 9, 3, 5};
-  const int VALUWaitStates[] = {4, 8, 2, 4};
+  const int WMMAWaitStates[] = {5, 9, 3, 5, 9, 17};
+  const int VALUWaitStates[] = {4, 8, 2, 4, 8, 16};
   unsigned Category = 0;
 
   auto IsWMMAHazardFn = [MI, TII, &Category, this](const MachineInstr &I) {
     if (!TII->isXDLWMMA(I))
       return false;
 
-    Category = getWMMAHazardInstInCategory(I, TII, TSchedModel);
+    Category = getWMMAHazardInstInCategory(I, TII, TSchedModel, ST);
     return hasWMMAToWMMARegOverlap(I, *MI);
   };
 
@@ -2142,7 +2162,7 @@ int GCNHazardRecognizer::checkWMMACoexecutionHazards(MachineInstr *MI) const {
     if (!TII->isXDLWMMA(I))
       return false;
 
-    Category = getWMMAHazardInstInCategory(I, TII, TSchedModel);
+    Category = getWMMAHazardInstInCategory(I, TII, TSchedModel, ST);
     return hasWMMAToVALURegOverlap(I, *MI);
   };
 
@@ -2152,6 +2172,7 @@ int GCNHazardRecognizer::checkWMMACoexecutionHazards(MachineInstr *MI) const {
 
   int WaitStatesNeeded = -1;
   int ExistingVALUs = 0; // Existing number of VALU ops in between.
+  bool IsLowestRateWMMA = ST.hasGFX125xLowestRateWMMA();
 
   // getWaitStatesSince checks for a hazard between instruction 'I' and 'MI':
   // - If a hazard exists: returns the number of VALUs in between and sets
@@ -2159,12 +2180,14 @@ int GCNHazardRecognizer::checkWMMACoexecutionHazards(MachineInstr *MI) const {
   // - If no hazard exists: returns INT_MAX, making WaitStatesNeeded negative,
   //   so no V_NOP insertion is needed.
   if (TII->isXDLWMMA(*MI)) {
-    const int WMMAWaitsLimit = 9; // Maximum of WMMAWaitStates
+    // Maximum of WMMAWaitStates.
+    const int WMMAWaitsLimit = IsLowestRateWMMA ? 17 : 9;
     ExistingVALUs =
         getWaitStatesSince(IsWMMAHazardFn, WMMAWaitsLimit, GetWaitStatesFn);
     WaitStatesNeeded = WMMAWaitStates[Category] - ExistingVALUs;
   } else { // Must be a co-executable VALU.
-    const int VALUWaitsLimit = 8; // Maximum of VALUWaitStates
+           // Maximum of VALUWaitStates.
+    const int VALUWaitsLimit = IsLowestRateWMMA ? 16 : 8;
     ExistingVALUs =
         getWaitStatesSince(IsVALUHazardFn, VALUWaitsLimit, GetWaitStatesFn);
     WaitStatesNeeded = VALUWaitStates[Category] - ExistingVALUs;
diff --git a/llvm/test/CodeGen/AMDGPU/wmma-coexecution-valu-hazards.mir b/llvm/test/CodeGen/AMDGPU/wmma-coexecution-valu-hazards.mir
index 1918d44b3f568..990d218f8cf1d 100644
--- a/llvm/test/CodeGen/AMDGPU/wmma-coexecution-valu-hazards.mir
+++ b/llvm/test/CodeGen/AMDGPU/wmma-coexecution-valu-hazards.mir
@@ -1,5 +1,9 @@
 # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
 # RUN: llc -mtriple=amdgcn -mcpu=gfx1250 -run-pass post-RA-hazard-rec %s -o - | FileCheck -check-prefix=GFX1250 %s
+# RUN: llc -mtriple=amdgcn -mcpu=gfx1251 -run-pass post-RA-hazard-rec %s -o - | FileCheck -check-prefix=GFX1251 %s
+
+# gfx12-5-generic has the same hazards as gfx1251
+# RUN: llc -mtriple=amdgcn -mcpu=gfx12-5-generic -run-pass post-RA-hazard-rec %s -o - | FileCheck -check-prefix=GFX1251 %s
 
 # WMMA writes: D0, WMMA reads: A0/B0/Index0
 # VALU writes: D1, VALU reads: Use1
@@ -19,6 +23,18 @@ body: |
     ; GFX1250-NEXT: V_NOP_e32 implicit $exec
     ; GFX1250-NEXT: V_NOP_e32 implicit $exec
     ; GFX1250-NEXT: $vgpr25 = V_ADD_F32_e32 $vgpr24, $vgpr16, implicit $mode, implicit $exec
+    ;
+    ; GFX1251-LABEL: name: test_wmma_f32_16x16x32_bf16_D0_overlaps_Use1
+    ; GFX1251: early-clobber $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X32_BF16_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: $vgpr25 = V_ADD_F32_e32 $vgpr24, $vgpr16, implicit $mode, implicit $exec
     $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X32_BF16_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec
     $vgpr25 = V_ADD_F32_e32 $vgpr24, $vgpr16, implicit $mode, implicit $exec
 ...
@@ -34,6 +50,18 @@ body: |
     ; GFX1250-NEXT: $vgpr28 = V_MOV_B32_e32 28, implicit $exec
     ; GFX1250-NEXT: $vgpr29 = V_MOV_B32_e32 29, implicit $exec
     ; GFX1250-NEXT: $vgpr25 = V_ADD_F32_e32 $vgpr24, $vgpr16, implicit $mode, implicit $exec
+    ;
+    ; GFX1251-LABEL: name: test_wmma_f32_16x16x32_bf16_D0_overlaps_Use1_with_4_valus_in_between
+    ; GFX1251: early-clobber $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X32_BF16_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec
+    ; GFX1251-NEXT: $vgpr26 = V_MOV_B32_e32 26, implicit $exec
+    ; GFX1251-NEXT: $vgpr27 = V_MOV_B32_e32 27, implicit $exec
+    ; GFX1251-NEXT: $vgpr28 = V_MOV_B32_e32 28, implicit $exec
+    ; GFX1251-NEXT: $vgpr29 = V_MOV_B32_e32 29, implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: $vgpr25 = V_ADD_F32_e32 $vgpr24, $vgpr16, implicit $mode, implicit $exec
     $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X32_BF16_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec
     $vgpr26 = V_MOV_B32_e32 26, implicit $exec
     $vgpr27 = V_MOV_B32_e32 27, implicit $exec
@@ -57,6 +85,22 @@ body: |
     ; GFX1250-NEXT: V_NOP_e32 implicit $exec
     ; GFX1250-NEXT: V_NOP_e32 implicit $exec
     ; GFX1250-NEXT: $vgpr25 = V_ADD_F32_e32 $vgpr24, $vgpr16, implicit $mode, implicit $exec
+    ;
+    ; GFX1251-LABEL: name: test_wmma_f32_16x16x32_bf16_D0_overlaps_Use1_with_4_salus_in_between
+    ; GFX1251: early-clobber $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X32_BF16_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec
+    ; GFX1251-NEXT: $sgpr0 = S_MOV_B32 0
+    ; GFX1251-NEXT: $sgpr1 = S_MOV_B32 1
+    ; GFX1251-NEXT: $sgpr2 = S_MOV_B32 2
+    ; GFX1251-NEXT: $sgpr3 = S_MOV_B32 3
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: $vgpr25 = V_ADD_F32_e32 $vgpr24, $vgpr16, implicit $mode, implicit $exec
     $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X32_BF16_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec
     $sgpr0 = S_MOV_B32 0
     $sgpr1 = S_MOV_B32 1
@@ -76,6 +120,18 @@ body: |
     ; GFX1250-NEXT: V_NOP_e32 implicit $exec
     ; GFX1250-NEXT: V_NOP_e32 implicit $exec
     ; GFX1250-NEXT: $vgpr16 = V_ADD_F32_e32 $vgpr24, $vgpr25, implicit $mode, implicit $exec
+    ;
+    ; GFX1251-LABEL: name: test_wmma_f32_16x16x32_bf16_D0_overlaps_D1
+    ; GFX1251: early-clobber $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X32_BF16_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: $vgpr16 = V_ADD_F32_e32 $vgpr24, $vgpr25, implicit $mode, implicit $exec
     $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X32_BF16_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec
     $vgpr16 = V_ADD_F32_e32 $vgpr24, $vgpr25, implicit $mode, implicit $exec
 ...
@@ -91,6 +147,18 @@ body: |
     ; GFX1250-NEXT: V_NOP_e32 implicit $exec
     ; GFX1250-NEXT: V_NOP_e32 implicit $exec
     ; GFX1250-NEXT: $vgpr16, $vgpr255 = V_DUAL_SUB_F32_e32_X_MUL_F32_e32_gfx1250 undef $vgpr1, undef $vgpr1, undef $vgpr0, undef $vgpr0, implicit $mode, implicit $exec
+    ;
+    ; GFX1251-LABEL: name: test_wmma_f32_16x16x32_bf16_D0_overlaps_vopd_vdstX
+    ; GFX1251: early-clobber $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X32_BF16_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: $vgpr16, $vgpr255 = V_DUAL_SUB_F32_e32_X_MUL_F32_e32_gfx1250 undef $vgpr1, undef $vgpr1, undef $vgpr0, undef $vgpr0, implicit $mode, implicit $exec
     $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X32_BF16_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec
     $vgpr16, $vgpr255 = V_DUAL_SUB_F32_e32_X_MUL_F32_e32_gfx1250 undef $vgpr1, undef $vgpr1, undef $vgpr0, undef $vgpr0, implicit $mode, implicit $exec
 ...
@@ -106,6 +174,18 @@ body: |
     ; GFX1250-NEXT: V_NOP_e32 implicit $exec
     ; GFX1250-NEXT: V_NOP_e32 implicit $exec
     ; GFX1250-NEXT: $vgpr0 = V_ADD_F32_e32 $vgpr24, $vgpr25, implicit $mode, implicit $exec
+    ;
+    ; GFX1251-LABEL: name: test_wmma_f32_16x16x32_bf16_A0_overlaps_D1
+    ; GFX1251: early-clobber $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X32_BF16_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: $vgpr0 = V_ADD_F32_e32 $vgpr24, $vgpr25, implicit $mode, implicit $exec
     $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X32_BF16_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec
     $vgpr0 = V_ADD_F32_e32 $vgpr24, $vgpr25, implicit $mode, implicit $exec
 ...
@@ -121,6 +201,18 @@ body: |
     ; GFX1250-NEXT: V_NOP_e32 implicit $exec
     ; GFX1250-NEXT: V_NOP_e32 implicit $exec
     ; GFX1250-NEXT: $vgpr255, $vgpr0 = V_DUAL_SUB_F32_e32_X_MUL_F32_e32_gfx1250 undef $vgpr1, undef $vgpr1, undef $vgpr0, undef $vgpr0, implicit $mode, implicit $exec
+    ;
+    ; GFX1251-LABEL: name: test_wmma_f32_16x16x32_bf16_A0_overlaps_vopd_vdstY
+    ; GFX1251: early-clobber $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X32_BF16_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: $vgpr255, $vgpr0 = V_DUAL_SUB_F32_e32_X_MUL_F32_e32_gfx1250 undef $vgpr1, undef $vgpr1, undef $vgpr0, undef $vgpr0, implicit $mode, implicit $exec
     $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X32_BF16_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec
     $vgpr255, $vgpr0 = V_DUAL_SUB_F32_e32_X_MUL_F32_e32_gfx1250 undef $vgpr1, undef $vgpr1, undef $vgpr0, undef $vgpr0, implicit $mode, implicit $exec
 ...
@@ -136,6 +228,18 @@ body: |
     ; GFX1250-NEXT: V_NOP_e32 implicit $exec
     ; GFX1250-NEXT: V_NOP_e32 implicit $exec
     ; GFX1250-NEXT: $vgpr8 = V_ADD_F32_e32 $vgpr24, $vgpr25, implicit $mode, implicit $exec
+    ;
+    ; GFX1251-LABEL: name: test_wmma_f32_16x16x32_bf16_B0_overlaps_D1
+    ; GFX1251: early-clobber $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X32_BF16_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: $vgpr8 = V_ADD_F32_e32 $vgpr24, $vgpr25, implicit $mode, implicit $exec
     $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X32_BF16_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec
     $vgpr8 = V_ADD_F32_e32 $vgpr24, $vgpr25, implicit $mode, implicit $exec
 ...
@@ -151,6 +255,18 @@ body: |
     ; GFX1250-NEXT: V_NOP_e32 implicit $exec
     ; GFX1250-NEXT: V_NOP_e32 implicit $exec
     ; GFX1250-NEXT: $vgpr27 = V_ADD_F32_e32 $vgpr22, $vgpr26, implicit $mode, implicit $exec
+    ;
+    ; GFX1251-LABEL: name: test_wmma_f16_16x16x64_fp8_fp8_D0_overlaps_Use1
+    ; GFX1251: early-clobber $vgpr22_vgpr23_vgpr24_vgpr25 = V_WMMA_F16_16X16X64_FP8_FP8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr22_vgpr23_vgpr24_vgpr25, 0, 0, 0, 0, implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: $vgpr27 = V_ADD_F32_e32 $vgpr22, $vgpr26, implicit $mode, implicit $exec
     $vgpr22_vgpr23_vgpr24_vgpr25 = V_WMMA_F16_16X16X64_FP8_FP8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr22_vgpr23_vgpr24_vgpr25, 0, 0, 0, 0, implicit $exec
     $vgpr27 = V_ADD_F32_e32 $vgpr22, $vgpr26, implicit $mode, implicit $exec
 ...
@@ -166,6 +282,18 @@ body: |
     ; GFX1250-NEXT: $vgpr28 = V_MOV_B32_e32 28, implicit $exec
     ; GFX1250-NEXT: $vgpr29 = V_MOV_B32_e32 29, implicit $exec
     ; GFX1250-NEXT: $vgpr31 = V_ADD_F32_e32 $vgpr22, $vgpr30, implicit $mode, implicit $exec
+    ;
+    ; GFX1251-LABEL: name: test_wmma_f16_16x16x64_fp8_fp8_D0_overlaps_Use1_with_4_valus_in_between
+    ; GFX1251: early-clobber $vgpr22_vgpr23_vgpr24_vgpr25 = V_WMMA_F16_16X16X64_FP8_FP8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr22_vgpr23_vgpr24_vgpr25, 0, 0, 0, 0, implicit $exec
+    ; GFX1251-NEXT: $vgpr26 = V_MOV_B32_e32 26, implicit $exec
+    ; GFX1251-NEXT: $vgpr27 = V_MOV_B32_e32 27, implicit $exec
+    ; GFX1251-NEXT: $vgpr28 = V_MOV_B32_e32 28, implicit $exec
+    ; GFX1251-NEXT: $vgpr29 = V_MOV_B32_e32 29, implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: $vgpr31 = V_ADD_F32_e32 $vgpr22, $vgpr30, implicit $mode, implicit $exec
     $vgpr22_vgpr23_vgpr24_vgpr25 = V_WMMA_F16_16X16X64_FP8_FP8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr22_vgpr23_vgpr24_vgpr25, 0, 0, 0, 0, implicit $exec
     $vgpr26 = V_MOV_B32_e32 26, implicit $exec
     $vgpr27 = V_MOV_B32_e32 27, implicit $exec
@@ -189,6 +317,22 @@ body: |
     ; GFX1250-NEXT: V_NOP_e32 implicit $exec
     ; GFX1250-NEXT: V_NOP_e32 implicit $exec
     ; GFX1250-NEXT: $vgpr27 = V_ADD_F32_e32 $vgpr22, $vgpr26, implicit $mode, implicit $exec
+    ;
+    ; GFX1251-LABEL: name: test_wmma_f16_16x16x64_fp8_fp8_D0_overlaps_Use1_with_4_salus_in_between
+    ; GFX1251: early-clobber $vgpr22_vgpr23_vgpr24_vgpr25 = V_WMMA_F16_16X16X64_FP8_FP8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr22_vgpr23_vgpr24_vgpr25, 0, 0, 0, 0, implicit $exec
+    ; GFX1251-NEXT: $sgpr0 = S_MOV_B32 0
+    ; GFX1251-NEXT: $sgpr1 = S_MOV_B32 1
+    ; GFX1251-NEXT: $sgpr2 = S_MOV_B32 2
+    ; GFX1251-NEXT: $sgpr3 = S_MOV_B32 3
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: $vgpr27 = V_ADD_F32_e32 $vgpr22, $vgpr26, implicit $mode, implicit $exec
     $vgpr22_vgpr23_vgpr24_vgpr25 = V_WMMA_F16_16X16X64_FP8_FP8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr22_vgpr23_vgpr24_vgpr25, 0, 0, 0, 0, implicit $exec
     $sgpr0 = S_MOV_B32 0
     $sgpr1 = S_MOV_B32 1
@@ -208,6 +352,18 @@ body: |
     ; GFX1250-NEXT: V_NOP_e32 implicit $exec
     ; GFX1250-NEXT: V_NOP_e32 implicit $exec
     ; GFX1250-NEXT: $vgpr22 = V_ADD_F32_e32 $vgpr26, $vgpr27, implicit $mode, implicit $exec
+    ;
+    ; GFX1251-LABEL: name: test_wmma_f16_16x16x64_fp8_fp8_D0_overlaps_D1
+    ; GFX1251: early-clobber $vgpr22_vgpr23_vgpr24_vgpr25 = V_WMMA_F16_16X16X64_FP8_FP8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr22_vgpr23_vgpr24_vgpr25, 0, 0, 0, 0, implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: $vgpr22 = V_ADD_F32_e32 $vgpr26, $vgpr27, implicit $mode, implicit $exec
     $vgpr22_vgpr23_vgpr24_vgpr25 = V_WMMA_F16_16X16X64_FP8_FP8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr22_vgpr23_vgpr24_vgpr25, 0, 0, 0, 0, implicit $exec
     $vgpr22 = V_ADD_F32_e32 $vgpr26, $vgpr27, implicit $mode, implicit $exec
 ...
@@ -223,6 +379,18 @@ body: |
     ; GFX1250-NEXT: V_NOP_e32 implicit $exec
     ; GFX1250-NEXT: V_NOP_e32 implicit $exec
     ; GFX1250-NEXT: $vgpr0 = V_ADD_F32_e32 $vgpr26, $vgpr27, implicit $mode, implicit $exec
+    ;
+    ; GFX1251-LABEL: name: test_wmma_f16_16x16x64_fp8_fp8_A0_overlaps_D1
+    ; GFX1251: early-clobber $vgpr22_vgpr23_vgpr24_vgpr25 = V_WMMA_F16_16X16X64_FP8_FP8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr22_vgpr23_vgpr24_vgpr25, 0, 0, 0, 0, implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: $vgpr0 = V_ADD_F32_e32 $vgpr26, $vgpr27, implicit $mode, implicit $exec
     $vgpr22_vgpr23_vgpr24_vgpr25 = V_WMMA_F16_16X16X64_FP8_FP8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr22_vgpr23_vgpr24_vgpr25, 0, 0, 0, 0, implicit $exec
     $vgpr0 = V_ADD_F32_e32 $vgpr26, $vgpr27, implicit $mode, implicit $exec
 ...
@@ -238,6 +406,18 @@ body: |
     ; GFX1250-NEXT: V_NOP_e32 implicit $exec
     ; GFX1250-NEXT: V_NOP_e32 implicit $exec
     ; GFX1250-NEXT: $vgpr8 = V_ADD_F32_e32 $vgpr26, $vgpr27, implicit $mode, implicit $exec
+    ;
+    ; GFX1251-LABEL: name: test_wmma_f16_16x16x64_fp8_fp8_B0_overlaps_D1
+    ; GFX1251: early-clobber $vgpr22_vgpr23_vgpr24_vgpr25 = V_WMMA_F16_16X16X64_FP8_FP8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr22_vgpr23_vgpr24_vgpr25, 0, 0, 0, 0, implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: $vgpr8 = V_ADD_F32_e32 $vgpr26, $vgpr27, implicit $mode, implicit $exec
     $vgpr22_vgpr23_vgpr24_vgpr25 = V_WMMA_F16_16X16X64_FP8_FP8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr22_vgpr23_vgpr24_vgpr25, 0, 0, 0, 0, implicit $exec
     $vgpr8 = V_ADD_F32_e32 $vgpr26, $vgpr27, implicit $mode, implicit $exec
 ...
@@ -253,6 +433,26 @@ body: |
     ; GFX1250-NEXT: V_NOP_e32 implicit $exec
     ; GFX1250-NEXT: V_NOP_e32 implicit $exec
     ; GFX1250-NEXT: $vgpr48 = V_ADD_F32_e32 $vgpr32, $vgpr47, implicit $mode, implicit $exec
+    ;
+    ; GFX1251-LABEL: name: test_wmma_F32_16x16x128_F8F6F4_NoF8_D0_overlaps_Use1
+    ; GFX1251: early-clobber $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_WMMA_F32_16X16X128_F8F6F4_f8_f8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, 2, 2, 0, 0, implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: $vgpr48 = V_ADD_F32_e32 $vgpr32, $vgpr47, implicit $mode, implicit $exec
     $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_WMMA_F32_16X16X128_F8F6F4_f8_f8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, 2, 2, 0, 0, implicit $exec
     $vgpr48 = V_ADD_F32_e32 $vgpr32, $vgpr47, implicit $mode, implicit $exec
 ...
@@ -268,6 +468,26 @@ body: |
     ; GFX1250-NEXT: $vgpr42 = V_MOV_B32_e32 42, implicit $exec
     ; GFX1250-NEXT: $vgpr43 = V_MOV_B32_e32 43, implicit $exec
     ; GFX1250-NEXT: $vgpr48 = V_ADD_F32_e32 $vgpr32, $vgpr47, implicit $mode, implicit $exec
+    ;
+    ; GFX1251-LABEL: name: test_wmma_F32_16x16x128_F8F6F4_NoF8_D0_overlaps_Use1_with_4_valus_in_between
+    ; GFX1251: early-clobber $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_WMMA_F32_16X16X128_F8F6F4_f8_f8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, 2, 2, 0, 0, implicit $exec
+    ; GFX1251-NEXT: $vgpr40 = V_MOV_B32_e32 40, implicit $exec
+    ; GFX1251-NEXT: $vgpr41 = V_MOV_B32_e32 41, implicit $exec
+    ; GFX1251-NEXT: $vgpr42 = V_MOV_B32_e32 42, implicit $exec
+    ; GFX1251-NEXT: $vgpr43 = V_MOV_B32_e32 43, implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: $vgpr48 = V_ADD_F32_e32 $vgpr32, $vgpr47, implicit $mode, implicit $exec
     $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_WMMA_F32_16X16X128_F8F6F4_f8_f8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, 2, 2, 0, 0, implicit $exec
     $vgpr40 = V_MOV_B32_e32 40, implicit $exec
     $vgpr41 = V_MOV_B32_e32 41, implicit $exec
@@ -291,6 +511,30 @@ body: |
     ; GFX1250-NEXT: V_NOP_e32 implicit $exec
     ; GFX1250-NEXT: V_NOP_e32 implicit $exec
     ; GFX1250-NEXT: $vgpr48 = V_ADD_F32_e32 $vgpr32, $vgpr47, implicit $mode, implicit $exec
+    ;
+    ; GFX1251-LABEL: name: test_wmma_F32_16x16x128_F8F6F4_NoF8_D0_overlaps_Use1_with_4_salus_in_between
+    ; GFX1251: early-clobber $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_WMMA_F32_16X16X128_F8F6F4_f8_f8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, 2, 2, 0, 0, implicit $exec
+    ; GFX1251-NEXT: $sgpr0 = S_MOV_B32 0
+    ; GFX1251-NEXT: $sgpr1 = S_MOV_B32 1
+    ; GFX1251-NEXT: $sgpr2 = S_MOV_B32 2
+    ; GFX1251-NEXT: $sgpr3 = S_MOV_B32 3
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: $vgpr48 = V_ADD_F32_e32 $vgpr32, $vgpr47, implicit $mode, implicit $exec
     $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_WMMA_F32_16X16X128_F8F6F4_f8_f8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, 2, 2, 0, 0, implicit $exec
     $sgpr0 = S_MOV_B32 0
     $sgpr1 = S_MOV_B32 1
@@ -310,6 +554,26 @@ body: |
     ; GFX1250-NEXT: V_NOP_e32 implicit $exec
     ; GFX1250-NEXT: V_NOP_e32 implicit $exec
     ; GFX1250-NEXT: $vgpr32 = V_ADD_F32_e32 $vgpr47, $vgpr48, implicit $mode, implicit $exec
+    ;
+    ; GFX1251-LABEL: name: test_wmma_F32_16x16x128_F8F6F4_NoF8_D0_overlaps_D1
+    ; GFX1251: early-clobber $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_WMMA_F32_16X16X128_F8F6F4_f8_f8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, 2, 2, 0, 0, implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: $vgpr32 = V_ADD_F32_e32 $vgpr47, $vgpr48, implicit $mode, implicit $exec
     $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_WMMA_F32_16X16X128_F8F6F4_f8_f8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, 2, 2, 0, 0, implicit $exec
     $vgpr32 = V_ADD_F32_e32 $vgpr47, $vgpr48, implicit $mode, implicit $exec
 ...
@@ -325,6 +589,26 @@ body: |
     ; GFX1250-NEXT: V_NOP_e32 implicit $exec
     ; GFX1250-NEXT: V_NOP_e32 implicit $exec
     ; GFX1250-NEXT: $vgpr0 = V_ADD_F32_e32 $vgpr47, $vgpr48, implicit $mode, implicit $exec
+    ;
+    ; GFX1251-LABEL: name: test_wmma_F32_16x16x128_F8F6F4_NoF8_A0_overlaps_D1
+    ; GFX1251: early-clobber $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_WMMA_F32_16X16X128_F8F6F4_f8_f8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, 2, 2, 0, 0, implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: $vgpr0 = V_ADD_F32_e32 $vgpr47, $vgpr48, implicit $mode, implicit $exec
     $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_WMMA_F32_16X16X128_F8F6F4_f8_f8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, 2, 2, 0, 0, implicit $exec
     $vgpr0 = V_ADD_F32_e32 $vgpr47, $vgpr48, implicit $mode, implicit $exec
 ...
@@ -340,6 +624,26 @@ body: |
     ; GFX1250-NEXT: V_NOP_e32 implicit $exec
     ; GFX1250-NEXT: V_NOP_e32 implicit $exec
     ; GFX1250-NEXT: $vgpr16 = V_ADD_F32_e32 $vgpr47, $vgpr48, implicit $mode, implicit $exec
+    ;
+    ; GFX1251-LABEL: name: test_wmma_F32_16x16x128_F8F6F4_NoF8_B0_overlaps_D1
+    ; GFX1251: early-clobber $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_WMMA_F32_16X16X128_F8F6F4_f8_f8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, 2, 2, 0, 0, implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: $vgpr16 = V_ADD_F32_e32 $vgpr47, $vgpr48, implicit $mode, implicit $exec
     $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_WMMA_F32_16X16X128_F8F6F4_f8_f8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, 2, 2, 0, 0, implicit $exec
     $vgpr16 = V_ADD_F32_e32 $vgpr47, $vgpr48, implicit $mode, implicit $exec
 ...
@@ -359,6 +663,26 @@ body: |
     ; GFX1250-NEXT: V_NOP_e32 implicit $exec
     ; GFX1250-NEXT: V_NOP_e32 implicit $exec
     ; GFX1250-NEXT: $vgpr32 = V_ADD_F32_e32 $vgpr16, $vgpr33, implicit $mode, implicit $exec
+    ;
+    ; GFX1251-LABEL: name: test_wmma_I32_16x16x64_IU8_D0_overlaps_Use1
+    ; GFX1251: early-clobber $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_I32_16X16X64_IU8_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: $vgpr32 = V_ADD_F32_e32 $vgpr16, $vgpr33, implicit $mode, implicit $exec
     $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_I32_16X16X64_IU8_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec
     $vgpr32 = V_ADD_F32_e32 $vgpr16, $vgpr33, implicit $mode, implicit $exec
 ...
@@ -378,6 +702,26 @@ body: |
     ; GFX1250-NEXT: $vgpr46 = V_MOV_B32_e32 46, implicit $exec
     ; GFX1250-NEXT: $vgpr47 = V_MOV_B32_e32 47, implicit $exec
     ; GFX1250-NEXT: $vgpr32 = V_ADD_F32_e32 $vgpr16, $vgpr33, implicit $mode, implicit $exec
+    ;
+    ; GFX1251-LABEL: name: test_wmma_I32_16x16x64_IU8_D0_overlaps_Use1_with_8_valus_in_between
+    ; GFX1251: early-clobber $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_I32_16X16X64_IU8_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec
+    ; GFX1251-NEXT: $vgpr40 = V_MOV_B32_e32 40, implicit $exec
+    ; GFX1251-NEXT: $vgpr41 = V_MOV_B32_e32 41, implicit $exec
+    ; GFX1251-NEXT: $vgpr42 = V_MOV_B32_e32 42, implicit $exec
+    ; GFX1251-NEXT: $vgpr43 = V_MOV_B32_e32 43, implicit $exec
+    ; GFX1251-NEXT: $vgpr44 = V_MOV_B32_e32 44, implicit $exec
+    ; GFX1251-NEXT: $vgpr45 = V_MOV_B32_e32 45, implicit $exec
+    ; GFX1251-NEXT: $vgpr46 = V_MOV_B32_e32 46, implicit $exec
+    ; GFX1251-NEXT: $vgpr47 = V_MOV_B32_e32 47, implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: $vgpr32 = V_ADD_F32_e32 $vgpr16, $vgpr33, implicit $mode, implicit $exec
     $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_I32_16X16X64_IU8_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec
     $vgpr40 = V_MOV_B32_e32 40, implicit $exec
     $vgpr41 = V_MOV_B32_e32 41, implicit $exec
@@ -413,6 +757,34 @@ body: |
     ; GFX1250-NEXT: V_NOP_e32 implicit $exec
     ; GFX1250-NEXT: V_NOP_e32 implicit $exec
     ; GFX1250-NEXT: $vgpr32 = V_ADD_F32_e32 $vgpr16, $vgpr33, implicit $mode, implicit $exec
+    ;
+    ; GFX1251-LABEL: name: test_wmma_I32_16x16x64_IU8_D0_overlaps_Use1_with_8_salus_in_between
+    ; GFX1251: early-clobber $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_I32_16X16X64_IU8_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec
+    ; GFX1251-NEXT: $sgpr0 = S_MOV_B32 0
+    ; GFX1251-NEXT: $sgpr1 = S_MOV_B32 1
+    ; GFX1251-NEXT: $sgpr2 = S_MOV_B32 2
+    ; GFX1251-NEXT: $sgpr3 = S_MOV_B32 3
+    ; GFX1251-NEXT: $sgpr4 = S_MOV_B32 4
+    ; GFX1251-NEXT: $sgpr5 = S_MOV_B32 5
+    ; GFX1251-NEXT: $sgpr6 = S_MOV_B32 6
+    ; GFX1251-NEXT: $sgpr7 = S_MOV_B32 7
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: $vgpr32 = V_ADD_F32_e32 $vgpr16, $vgpr33, implicit $mode, implicit $exec
     $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_I32_16X16X64_IU8_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec
     $sgpr0 = S_MOV_B32 0
     $sgpr1 = S_MOV_B32 1
@@ -440,6 +812,26 @@ body: |
     ; GFX1250-NEXT: V_NOP_e32 implicit $exec
     ; GFX1250-NEXT: V_NOP_e32 implicit $exec
     ; GFX1250-NEXT: $vgpr16 = V_ADD_F32_e32 $vgpr32, $vgpr33, implicit $mode, implicit $exec
+    ;
+    ; GFX1251-LABEL: name: test_wmma_I32_16x16x64_IU8_D0_overlaps_D1
+    ; GFX1251: early-clobber $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_I32_16X16X64_IU8_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: $vgpr16 = V_ADD_F32_e32 $vgpr32, $vgpr33, implicit $mode, implicit $exec
     $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_I32_16X16X64_IU8_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec
     $vgpr16 = V_ADD_F32_e32 $vgpr32, $vgpr33, implicit $mode, implicit $exec
 ...
@@ -459,6 +851,26 @@ body: |
     ; GFX1250-NEXT: V_NOP_e32 implicit $exec
     ; GFX1250-NEXT: V_NOP_e32 implicit $exec
     ; GFX1250-NEXT: $vgpr0 = V_ADD_F32_e32 $vgpr32, $vgpr33, implicit $mode, implicit $exec
+    ;
+    ; GFX1251-LABEL: name: test_wmma_I32_16x16x64_IU8_A0_overlaps_D1
+    ; GFX1251: early-clobber $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_I32_16X16X64_IU8_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: $vgpr0 = V_ADD_F32_e32 $vgpr32, $vgpr33, implicit $mode, implicit $exec
     $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_I32_16X16X64_IU8_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec
     $vgpr0 = V_ADD_F32_e32 $vgpr32, $vgpr33, implicit $mode, implicit $exec
 ...
@@ -478,6 +890,26 @@ body: |
     ; GFX1250-NEXT: V_NOP_e32 implicit $exec
     ; GFX1250-NEXT: V_NOP_e32 implicit $exec
     ; GFX1250-NEXT: $vgpr8 = V_ADD_F32_e32 $vgpr32, $vgpr33, implicit $mode, implicit $exec
+    ;
+    ; GFX1251-LABEL: name: test_wmma_I32_16x16x64_IU8_B0_overlaps_D1
+    ; GFX1251: early-clobber $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_I32_16X16X64_IU8_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: $vgpr8 = V_ADD_F32_e32 $vgpr32, $vgpr33, implicit $mode, implicit $exec
     $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_I32_16X16X64_IU8_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec
     $vgpr8 = V_ADD_F32_e32 $vgpr32, $vgpr33, implicit $mode, implicit $exec
 ...
@@ -497,6 +929,26 @@ body: |
     ; GFX1250-NEXT: V_NOP_e32 implicit $exec
     ; GFX1250-NEXT: V_NOP_e32 implicit $exec
     ; GFX1250-NEXT: $vgpr48 = V_ADD_F32_e32 $vgpr32, $vgpr47, implicit $mode, implicit $exec
+    ;
+    ; GFX1251-LABEL: name: test_wmma_F32_16x16x128_F8F6F4_F8_D0_overlaps_Use1
+    ; GFX1251: early-clobber $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_WMMA_F32_16X16X128_F8F6F4_f8_f8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, 0, 2, 0, 0, implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: $vgpr48 = V_ADD_F32_e32 $vgpr32, $vgpr47, implicit $mode, implicit $exec
     $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_WMMA_F32_16X16X128_F8F6F4_f8_f8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, 0, 2, 0, 0, implicit $exec
     $vgpr48 = V_ADD_F32_e32 $vgpr32, $vgpr47, implicit $mode, implicit $exec
 ...
@@ -516,6 +968,26 @@ body: |
     ; GFX1250-NEXT: $vgpr46 = V_MOV_B32_e32 46, implicit $exec
     ; GFX1250-NEXT: $vgpr47 = V_MOV_B32_e32 47, implicit $exec
     ; GFX1250-NEXT: $vgpr49 = V_ADD_F32_e32 $vgpr32, $vgpr48, implicit $mode, implicit $exec
+    ;
+    ; GFX1251-LABEL: name: test_wmma_F32_16x16x128_F8F6F4_F8_D0_overlaps_Use1_with_8_valus_in_between
+    ; GFX1251: early-clobber $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_WMMA_F32_16X16X128_F8F6F4_f8_f8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, 0, 2, 0, 0, implicit $exec
+    ; GFX1251-NEXT: $vgpr40 = V_MOV_B32_e32 40, implicit $exec
+    ; GFX1251-NEXT: $vgpr41 = V_MOV_B32_e32 41, implicit $exec
+    ; GFX1251-NEXT: $vgpr42 = V_MOV_B32_e32 42, implicit $exec
+    ; GFX1251-NEXT: $vgpr43 = V_MOV_B32_e32 43, implicit $exec
+    ; GFX1251-NEXT: $vgpr44 = V_MOV_B32_e32 44, implicit $exec
+    ; GFX1251-NEXT: $vgpr45 = V_MOV_B32_e32 45, implicit $exec
+    ; GFX1251-NEXT: $vgpr46 = V_MOV_B32_e32 46, implicit $exec
+    ; GFX1251-NEXT: $vgpr47 = V_MOV_B32_e32 47, implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: $vgpr49 = V_ADD_F32_e32 $vgpr32, $vgpr48, implicit $mode, implicit $exec
     $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_WMMA_F32_16X16X128_F8F6F4_f8_f8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, 0, 2, 0, 0, implicit $exec
     $vgpr40 = V_MOV_B32_e32 40, implicit $exec
     $vgpr41 = V_MOV_B32_e32 41, implicit $exec
@@ -551,6 +1023,34 @@ body: |
     ; GFX1250-NEXT: V_NOP_e32 implicit $exec
     ; GFX1250-NEXT: V_NOP_e32 implicit $exec
     ; GFX1250-NEXT: $vgpr48 = V_ADD_F32_e32 $vgpr32, $vgpr47, implicit $mode, implicit $exec
+    ;
+    ; GFX1251-LABEL: name: test_wmma_F32_16x16x128_F8F6F4_F8_D0_overlaps_Use1_with_8_salus_in_between
+    ; GFX1251: early-clobber $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_WMMA_F32_16X16X128_F8F6F4_f8_f8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, 0, 0, 0, 0, implicit $exec
+    ; GFX1251-NEXT: $sgpr0 = S_MOV_B32 0
+    ; GFX1251-NEXT: $sgpr1 = S_MOV_B32 1
+    ; GFX1251-NEXT: $sgpr2 = S_MOV_B32 2
+    ; GFX1251-NEXT: $sgpr3 = S_MOV_B32 3
+    ; GFX1251-NEXT: $sgpr4 = S_MOV_B32 4
+    ; GFX1251-NEXT: $sgpr5 = S_MOV_B32 5
+    ; GFX1251-NEXT: $sgpr6 = S_MOV_B32 6
+    ; GFX1251-NEXT: $sgpr7 = S_MOV_B32 7
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: $vgpr48 = V_ADD_F32_e32 $vgpr32, $vgpr47, implicit $mode, implicit $exec
     $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_WMMA_F32_16X16X128_F8F6F4_f8_f8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, 0, 0, 0, 0, implicit $exec
     $sgpr0 = S_MOV_B32 0
     $sgpr1 = S_MOV_B32 1
@@ -578,6 +1078,26 @@ body: |
     ; GFX1250-NEXT: V_NOP_e32 implicit $exec
     ; GFX1250-NEXT: V_NOP_e32 implicit $exec
     ; GFX1250-NEXT: $vgpr32 = V_ADD_F32_e32 $vgpr47, $vgpr48, implicit $mode, implicit $exec
+    ;
+    ; GFX1251-LABEL: name: test_wmma_F32_16x16x128_F8F6F4_F8_D0_overlaps_D1
+    ; GFX1251: early-clobber $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_WMMA_F32_16X16X128_F8F6F4_f8_f8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, 0, 0, 0, 0, implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: $vgpr32 = V_ADD_F32_e32 $vgpr47, $vgpr48, implicit $mode, implicit $exec
     $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_WMMA_F32_16X16X128_F8F6F4_f8_f8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, 0, 0, 0, 0, implicit $exec
     $vgpr32 = V_ADD_F32_e32 $vgpr47, $vgpr48, implicit $mode, implicit $exec
 ...
@@ -597,6 +1117,26 @@ body: |
     ; GFX1250-NEXT: V_NOP_e32 implicit $exec
     ; GFX1250-NEXT: V_NOP_e32 implicit $exec
     ; GFX1250-NEXT: $vgpr0 = V_ADD_F32_e32 $vgpr47, $vgpr48, implicit $mode, implicit $exec
+    ;
+    ; GFX1251-LABEL: name: test_wmma_F32_16x16x128_F8F6F4_F8_A0_overlaps_D1
+    ; GFX1251: early-clobber $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_WMMA_F32_16X16X128_F8F6F4_f8_f8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, 0, 0, 0, 0, implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: $vgpr0 = V_ADD_F32_e32 $vgpr47, $vgpr48, implicit $mode, implicit $exec
     $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_WMMA_F32_16X16X128_F8F6F4_f8_f8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, 0, 0, 0, 0, implicit $exec
     $vgpr0 = V_ADD_F32_e32 $vgpr47, $vgpr48, implicit $mode, implicit $exec
 ...
@@ -616,6 +1156,26 @@ body: |
     ; GFX1250-NEXT: V_NOP_e32 implicit $exec
     ; GFX1250-NEXT: V_NOP_e32 implicit $exec
     ; GFX1250-NEXT: $vgpr16 = V_ADD_F32_e32 $vgpr47, $vgpr48, implicit $mode, implicit $exec
+    ;
+    ; GFX1251-LABEL: name: test_wmma_F32_16x16x128_F8F6F4_F8_B0_overlaps_D1
+    ; GFX1251: early-clobber $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_WMMA_F32_16X16X128_F8F6F4_f8_f8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, 0, 0, 0, 0, implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: $vgpr16 = V_ADD_F32_e32 $vgpr47, $vgpr48, implicit $mode, implicit $exec
     $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_WMMA_F32_16X16X128_F8F6F4_f8_f8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, 0, 0, 0, 0, implicit $exec
     $vgpr16 = V_ADD_F32_e32 $vgpr47, $vgpr48, implicit $mode, implicit $exec
 ...
@@ -629,6 +1189,18 @@ body: |
     ; GFX1250-NEXT: V_NOP_e32 implicit $exec
     ; GFX1250-NEXT: V_NOP_e32 implicit $exec
     ; GFX1250-NEXT: $vgpr34 = V_ADD_F32_e32 $vgpr24, $vgpr33, implicit $mode, implicit $exec
+    ;
+    ; GFX1251-LABEL: name: test_swmmac_f32_16x16x64_bf16_D0_overlaps_Use1
+    ; GFX1251: early-clobber $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = V_SWMMAC_F32_16X16X64_BF16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr32, 0, 0, 0, 0, 0, implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: $vgpr34 = V_ADD_F32_e32 $vgpr24, $vgpr33, implicit $mode, implicit $exec
     $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = V_SWMMAC_F32_16X16X64_BF16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr32, 0, 0, 0, 0, 0, implicit $exec
     $vgpr34 = V_ADD_F32_e32 $vgpr24, $vgpr33, implicit $mode, implicit $exec
 ...
@@ -642,6 +1214,18 @@ body: |
     ; GFX1250-NEXT: $vgpr40 = V_MOV_B32_e32 40, implicit $exec
     ; GFX1250-NEXT: $vgpr41 = V_MOV_B32_e32 41, implicit $exec
     ; GFX1250-NEXT: $vgpr34 = V_ADD_F32_e32 $vgpr24, $vgpr33, implicit $mode, implicit $exec
+    ;
+    ; GFX1251-LABEL: name: test_swmmac_f32_16x16x64_bf16_D0_overlaps_Use1_with_2_valus_in_between
+    ; GFX1251: early-clobber $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = V_SWMMAC_F32_16X16X64_BF16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr32, 0, 0, 0, 0, 0, implicit $exec
+    ; GFX1251-NEXT: $vgpr40 = V_MOV_B32_e32 40, implicit $exec
+    ; GFX1251-NEXT: $vgpr41 = V_MOV_B32_e32 41, implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: $vgpr34 = V_ADD_F32_e32 $vgpr24, $vgpr33, implicit $mode, implicit $exec
     $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = V_SWMMAC_F32_16X16X64_BF16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr32, 0, 0, 0, 0, 0, implicit $exec
     $vgpr40 = V_MOV_B32_e32 40, implicit $exec
     $vgpr41 = V_MOV_B32_e32 41, implicit $exec
@@ -659,6 +1243,20 @@ body: |
     ; GFX1250-NEXT: V_NOP_e32 implicit $exec
     ; GFX1250-NEXT: V_NOP_e32 implicit $exec
     ; GFX1250-NEXT: $vgpr34 = V_ADD_F32_e32 $vgpr24, $vgpr33, implicit $mode, implicit $exec
+    ;
+    ; GFX1251-LABEL: name: test_swmmac_f32_16x16x64_bf16_D0_overlaps_Use1_with_2_salus_in_between
+    ; GFX1251: early-clobber $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = V_SWMMAC_F32_16X16X64_BF16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr32, 0, 0, 0, 0, 0, implicit $exec
+    ; GFX1251-NEXT: $sgpr0 = S_MOV_B32 0
+    ; GFX1251-NEXT: $sgpr1 = S_MOV_B32 1
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: $vgpr34 = V_ADD_F32_e32 $vgpr24, $vgpr33, implicit $mode, implicit $exec
     $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = V_SWMMAC_F32_16X16X64_BF16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr32, 0, 0, 0, 0, 0, implicit $exec
     $sgpr0 = S_MOV_B32 0
     $sgpr1 = S_MOV_B32 1
@@ -674,6 +1272,18 @@ body: |
     ; GFX1250-NEXT: V_NOP_e32 implicit $exec
     ; GFX1250-NEXT: V_NOP_e32 implicit $exec
     ; GFX1250-NEXT: $vgpr24 = V_ADD_F32_e32 $vgpr33, $vgpr34, implicit $mode, implicit $exec
+    ;
+    ; GFX1251-LABEL: name: test_swmmac_f32_16x16x64_bf16_D0_overlaps_D1
+    ; GFX1251: early-clobber $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = V_SWMMAC_F32_16X16X64_BF16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr32, 0, 0, 0, 0, 0, implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: $vgpr24 = V_ADD_F32_e32 $vgpr33, $vgpr34, implicit $mode, implicit $exec
     $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = V_SWMMAC_F32_16X16X64_BF16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr32, 0, 0, 0, 0, 0, implicit $exec
     $vgpr24 = V_ADD_F32_e32 $vgpr33, $vgpr34, implicit $mode, implicit $exec
 ...
@@ -687,6 +1297,18 @@ body: |
     ; GFX1250-NEXT: V_NOP_e32 implicit $exec
     ; GFX1250-NEXT: V_NOP_e32 implicit $exec
     ; GFX1250-NEXT: $vgpr0 = V_ADD_F32_e32 $vgpr33, $vgpr34, implicit $mode, implicit $exec
+    ;
+    ; GFX1251-LABEL: name: test_swmmac_f32_16x16x64_bf16_A0_overlaps_D1
+    ; GFX1251: early-clobber $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = V_SWMMAC_F32_16X16X64_BF16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr32, 0, 0, 0, 0, 0, implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: $vgpr0 = V_ADD_F32_e32 $vgpr33, $vgpr34, implicit $mode, implicit $exec
     $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = V_SWMMAC_F32_16X16X64_BF16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr32, 0, 0, 0, 0, 0, implicit $exec
     $vgpr0 = V_ADD_F32_e32 $vgpr33, $vgpr34, implicit $mode, implicit $exec
 ...
@@ -700,6 +1322,18 @@ body: |
     ; GFX1250-NEXT: V_NOP_e32 implicit $exec
     ; GFX1250-NEXT: V_NOP_e32 implicit $exec
     ; GFX1250-NEXT: $vgpr8 = V_ADD_F32_e32 $vgpr33, $vgpr34, implicit $mode, implicit $exec
+    ;
+    ; GFX1251-LABEL: name: test_swmmac_f32_16x16x64_bf16_B0_overlaps_D1
+    ; GFX1251: early-clobber $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = V_SWMMAC_F32_16X16X64_BF16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr32, 0, 0, 0, 0, 0, implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: $vgpr8 = V_ADD_F32_e32 $vgpr33, $vgpr34, implicit $mode, implicit $exec
     $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = V_SWMMAC_F32_16X16X64_BF16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr32, 0, 0, 0, 0, 0, implicit $exec
     $vgpr8 = V_ADD_F32_e32 $vgpr33, $vgpr34, implicit $mode, implicit $exec
 ...
@@ -713,6 +1347,18 @@ body: |
     ; GFX1250-NEXT: V_NOP_e32 implicit $exec
     ; GFX1250-NEXT: V_NOP_e32 implicit $exec
     ; GFX1250-NEXT: $vgpr32 = V_ADD_F32_e32 $vgpr33, $vgpr34, implicit $mode, implicit $exec
+    ;
+    ; GFX1251-LABEL: name: test_swmmac_f32_16x16x64_bf16_Index0_overlaps_D1
+    ; GFX1251: early-clobber $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = V_SWMMAC_F32_16X16X64_BF16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr32, 0, 0, 0, 0, 0, implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: $vgpr32 = V_ADD_F32_e32 $vgpr33, $vgpr34, implicit $mode, implicit $exec
     $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = V_SWMMAC_F32_16X16X64_BF16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr32, 0, 0, 0, 0, 0, implicit $exec
     $vgpr32 = V_ADD_F32_e32 $vgpr33, $vgpr34, implicit $mode, implicit $exec
 ...
@@ -726,6 +1372,18 @@ body: |
     ; GFX1250-NEXT: V_NOP_e32 implicit $exec
     ; GFX1250-NEXT: V_NOP_e32 implicit $exec
     ; GFX1250-NEXT: $vgpr31 = V_ADD_F32_e32 $vgpr24, $vgpr30, implicit $mode, implicit $exec
+    ;
+    ; GFX1251-LABEL: name: test_swmmac_f16_16x16x128_bf8_fp8_D0_overlaps_Use1
+    ; GFX1251: early-clobber $vgpr24_vgpr25_vgpr26_vgpr27 = V_SWMMAC_F16_16X16X128_BF8_FP8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27, killed $vgpr28_vgpr29, 0, 0, 0, implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: $vgpr31 = V_ADD_F32_e32 $vgpr24, $vgpr30, implicit $mode, implicit $exec
     $vgpr24_vgpr25_vgpr26_vgpr27 = V_SWMMAC_F16_16X16X128_BF8_FP8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27, killed $vgpr28_vgpr29, 0, 0, 0, implicit $exec
     $vgpr31 = V_ADD_F32_e32 $vgpr24, $vgpr30, implicit $mode, implicit $exec
 ...
@@ -739,6 +1397,18 @@ body: |
     ; GFX1250-NEXT: $vgpr40 = V_MOV_B32_e32 40, implicit $exec
     ; GFX1250-NEXT: $vgpr41 = V_MOV_B32_e32 41, implicit $exec
     ; GFX1250-NEXT: $vgpr31 = V_ADD_F32_e32 $vgpr24, $vgpr30, implicit $mode, implicit $exec
+    ;
+    ; GFX1251-LABEL: name: test_swmmac_f16_16x16x128_bf8_fp8_D0_overlaps_Use1_with_2_valus_in_between
+    ; GFX1251: early-clobber $vgpr24_vgpr25_vgpr26_vgpr27 = V_SWMMAC_F16_16X16X128_BF8_FP8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27, killed $vgpr28_vgpr29, 0, 0, 0, implicit $exec
+    ; GFX1251-NEXT: $vgpr40 = V_MOV_B32_e32 40, implicit $exec
+    ; GFX1251-NEXT: $vgpr41 = V_MOV_B32_e32 41, implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: $vgpr31 = V_ADD_F32_e32 $vgpr24, $vgpr30, implicit $mode, implicit $exec
     $vgpr24_vgpr25_vgpr26_vgpr27 = V_SWMMAC_F16_16X16X128_BF8_FP8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27, killed $vgpr28_vgpr29, 0, 0, 0, implicit $exec
     $vgpr40 = V_MOV_B32_e32 40, implicit $exec
     $vgpr41 = V_MOV_B32_e32 41, implicit $exec
@@ -756,6 +1426,20 @@ body: |
     ; GFX1250-NEXT: V_NOP_e32 implicit $exec
     ; GFX1250-NEXT: V_NOP_e32 implicit $exec
     ; GFX1250-NEXT: $vgpr31 = V_ADD_F32_e32 $vgpr24, $vgpr30, implicit $mode, implicit $exec
+    ;
+    ; GFX1251-LABEL: name: test_swmmac_f16_16x16x128_bf8_fp8_D0_overlaps_Use1_with_2_salus_in_between
+    ; GFX1251: early-clobber $vgpr24_vgpr25_vgpr26_vgpr27 = V_SWMMAC_F16_16X16X128_BF8_FP8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27, killed $vgpr28_vgpr29, 0, 0, 0, implicit $exec
+    ; GFX1251-NEXT: $sgpr0 = S_MOV_B32 0
+    ; GFX1251-NEXT: $sgpr1 = S_MOV_B32 1
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: $vgpr31 = V_ADD_F32_e32 $vgpr24, $vgpr30, implicit $mode, implicit $exec
     $vgpr24_vgpr25_vgpr26_vgpr27 = V_SWMMAC_F16_16X16X128_BF8_FP8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27, killed $vgpr28_vgpr29, 0, 0, 0, implicit $exec
     $sgpr0 = S_MOV_B32 0
     $sgpr1 = S_MOV_B32 1
@@ -771,6 +1455,18 @@ body: |
     ; GFX1250-NEXT: V_NOP_e32 implicit $exec
     ; GFX1250-NEXT: V_NOP_e32 implicit $exec
     ; GFX1250-NEXT: $vgpr24 = V_ADD_F32_e32 $vgpr30, $vgpr31, implicit $mode, implicit $exec
+    ;
+    ; GFX1251-LABEL: name: test_swmmac_f16_16x16x128_bf8_fp8_D0_overlaps_D1
+    ; GFX1251: early-clobber $vgpr24_vgpr25_vgpr26_vgpr27 = V_SWMMAC_F16_16X16X128_BF8_FP8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27, killed $vgpr28_vgpr29, 0, 0, 0, implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: $vgpr24 = V_ADD_F32_e32 $vgpr30, $vgpr31, implicit $mode, implicit $exec
     $vgpr24_vgpr25_vgpr26_vgpr27 = V_SWMMAC_F16_16X16X128_BF8_FP8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27, killed $vgpr28_vgpr29, 0, 0, 0, implicit $exec
     $vgpr24 = V_ADD_F32_e32 $vgpr30, $vgpr31, implicit $mode, implicit $exec
 ...
@@ -784,6 +1480,18 @@ body: |
     ; GFX1250-NEXT: V_NOP_e32 implicit $exec
     ; GFX1250-NEXT: V_NOP_e32 implicit $exec
     ; GFX1250-NEXT: $vgpr0 = V_ADD_F32_e32 $vgpr30, $vgpr31, implicit $mode, implicit $exec
+    ;
+    ; GFX1251-LABEL: name: test_swmmac_f16_16x16x128_bf8_fp8_A0_overlaps_D1
+    ; GFX1251: early-clobber $vgpr24_vgpr25_vgpr26_vgpr27 = V_SWMMAC_F16_16X16X128_BF8_FP8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27, killed $vgpr28_vgpr29, 0, 0, 0, implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: $vgpr0 = V_ADD_F32_e32 $vgpr30, $vgpr31, implicit $mode, implicit $exec
     $vgpr24_vgpr25_vgpr26_vgpr27 = V_SWMMAC_F16_16X16X128_BF8_FP8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27, killed $vgpr28_vgpr29, 0, 0, 0, implicit $exec
     $vgpr0 = V_ADD_F32_e32 $vgpr30, $vgpr31, implicit $mode, implicit $exec
 ...
@@ -797,6 +1505,18 @@ body: |
     ; GFX1250-NEXT: V_NOP_e32 implicit $exec
     ; GFX1250-NEXT: V_NOP_e32 implicit $exec
     ; GFX1250-NEXT: $vgpr8 = V_ADD_F32_e32 $vgpr30, $vgpr31, implicit $mode, implicit $exec
+    ;
+    ; GFX1251-LABEL: name: test_swmmac_f16_16x16x128_bf8_fp8_B0_overlaps_D1
+    ; GFX1251: early-clobber $vgpr24_vgpr25_vgpr26_vgpr27 = V_SWMMAC_F16_16X16X128_BF8_FP8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27, killed $vgpr28_vgpr29, 0, 0, 0, implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: $vgpr8 = V_ADD_F32_e32 $vgpr30, $vgpr31, implicit $mode, implicit $exec
     $vgpr24_vgpr25_vgpr26_vgpr27 = V_SWMMAC_F16_16X16X128_BF8_FP8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27, killed $vgpr28_vgpr29, 0, 0, 0, implicit $exec
     $vgpr8 = V_ADD_F32_e32 $vgpr30, $vgpr31, implicit $mode, implicit $exec
 ...
@@ -810,6 +1530,18 @@ body: |
     ; GFX1250-NEXT: V_NOP_e32 implicit $exec
     ; GFX1250-NEXT: V_NOP_e32 implicit $exec
     ; GFX1250-NEXT: $vgpr28 = V_ADD_F32_e32 $vgpr30, $vgpr31, implicit $mode, implicit $exec
+    ;
+    ; GFX1251-LABEL: name: test_swmmac_f16_16x16x128_bf8_fp8_Index0_overlaps_D1
+    ; GFX1251: early-clobber $vgpr24_vgpr25_vgpr26_vgpr27 = V_SWMMAC_F16_16X16X128_BF8_FP8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27, killed $vgpr28_vgpr29, 0, 0, 0, implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: $vgpr28 = V_ADD_F32_e32 $vgpr30, $vgpr31, implicit $mode, implicit $exec
     $vgpr24_vgpr25_vgpr26_vgpr27 = V_SWMMAC_F16_16X16X128_BF8_FP8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27, killed $vgpr28_vgpr29, 0, 0, 0, implicit $exec
     $vgpr28 = V_ADD_F32_e32 $vgpr30, $vgpr31, implicit $mode, implicit $exec
 ...
@@ -825,6 +1557,26 @@ body: |
     ; GFX1250-NEXT: V_NOP_e32 implicit $exec
     ; GFX1250-NEXT: V_NOP_e32 implicit $exec
     ; GFX1250-NEXT: $vgpr34 = V_ADD_F32_e32 $vgpr33, $vgpr24, implicit $mode, implicit $exec
+    ;
+    ; GFX1251-LABEL: name: test_swmmac_i32_16x16x128_iu8_D0_overlaps_Use1
+    ; GFX1251: early-clobber $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = V_SWMMAC_I32_16X16X128_IU8_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr32_vgpr33, 0, 0, 0, 0, 0, implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: $vgpr34 = V_ADD_F32_e32 $vgpr33, $vgpr24, implicit $mode, implicit $exec
     $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = V_SWMMAC_I32_16X16X128_IU8_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr32_vgpr33, 0, 0, 0, 0, 0, implicit $exec
     $vgpr34 = V_ADD_F32_e32 $vgpr33, $vgpr24, implicit $mode, implicit $exec
 ...
@@ -840,6 +1592,26 @@ body: |
     ; GFX1250-NEXT: $vgpr42 = V_MOV_B32_e32 42, implicit $exec
     ; GFX1250-NEXT: $vgpr43 = V_MOV_B32_e32 43, implicit $exec
     ; GFX1250-NEXT: $vgpr34 = V_ADD_F32_e32 $vgpr34, $vgpr24, implicit $mode, implicit $exec
+    ;
+    ; GFX1251-LABEL: name: test_swmmac_i32_16x16x128_iu8_D0_overlaps_Use1_with_4_valus_in_between
+    ; GFX1251: early-clobber $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = V_SWMMAC_I32_16X16X128_IU8_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr32_vgpr33, 0, 0, 0, 0, 0, implicit $exec
+    ; GFX1251-NEXT: $vgpr40 = V_MOV_B32_e32 40, implicit $exec
+    ; GFX1251-NEXT: $vgpr41 = V_MOV_B32_e32 41, implicit $exec
+    ; GFX1251-NEXT: $vgpr42 = V_MOV_B32_e32 42, implicit $exec
+    ; GFX1251-NEXT: $vgpr43 = V_MOV_B32_e32 43, implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: $vgpr34 = V_ADD_F32_e32 $vgpr34, $vgpr24, implicit $mode, implicit $exec
     $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = V_SWMMAC_I32_16X16X128_IU8_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr32_vgpr33, 0, 0, 0, 0, 0, implicit $exec
     $vgpr40 = V_MOV_B32_e32 40, implicit $exec
     $vgpr41 = V_MOV_B32_e32 41, implicit $exec
@@ -863,6 +1635,30 @@ body: |
     ; GFX1250-NEXT: V_NOP_e32 implicit $exec
     ; GFX1250-NEXT: V_NOP_e32 implicit $exec
     ; GFX1250-NEXT: $vgpr34 = V_ADD_F32_e32 $vgpr34, $vgpr24, implicit $mode, implicit $exec
+    ;
+    ; GFX1251-LABEL: name: test_swmmac_i32_16x16x128_iu8_D0_overlaps_Use1_with_4_salus_in_between
+    ; GFX1251: early-clobber $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = V_SWMMAC_I32_16X16X128_IU8_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr32_vgpr33, 0, 0, 0, 0, 0, implicit $exec
+    ; GFX1251-NEXT: $sgpr0 = S_MOV_B32 0
+    ; GFX1251-NEXT: $sgpr1 = S_MOV_B32 1
+    ; GFX1251-NEXT: $sgpr2 = S_MOV_B32 2
+    ; GFX1251-NEXT: $sgpr3 = S_MOV_B32 4
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: $vgpr34 = V_ADD_F32_e32 $vgpr34, $vgpr24, implicit $mode, implicit $exec
     $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = V_SWMMAC_I32_16X16X128_IU8_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr32_vgpr33, 0, 0, 0, 0, 0, implicit $exec
     $sgpr0 = S_MOV_B32 0
     $sgpr1 = S_MOV_B32 1
@@ -882,6 +1678,26 @@ body: |
     ; GFX1250-NEXT: V_NOP_e32 implicit $exec
     ; GFX1250-NEXT: V_NOP_e32 implicit $exec
     ; GFX1250-NEXT: $vgpr24 = V_ADD_F32_e32 $vgpr34, $vgpr35, implicit $mode, implicit $exec
+    ;
+    ; GFX1251-LABEL: name: test_swmmac_i32_16x16x128_iu8_D0_overlaps_D1
+    ; GFX1251: early-clobber $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = V_SWMMAC_I32_16X16X128_IU8_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr32_vgpr33, 0, 0, 0, 0, 0, implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: $vgpr24 = V_ADD_F32_e32 $vgpr34, $vgpr35, implicit $mode, implicit $exec
     $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = V_SWMMAC_I32_16X16X128_IU8_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr32_vgpr33, 0, 0, 0, 0, 0, implicit $exec
     $vgpr24 = V_ADD_F32_e32 $vgpr34, $vgpr35, implicit $mode, implicit $exec
 ...
@@ -897,6 +1713,26 @@ body: |
     ; GFX1250-NEXT: V_NOP_e32 implicit $exec
     ; GFX1250-NEXT: V_NOP_e32 implicit $exec
     ; GFX1250-NEXT: $vgpr0 = V_ADD_F32_e32 $vgpr34, $vgpr35, implicit $mode, implicit $exec
+    ;
+    ; GFX1251-LABEL: name: test_swmmac_i32_16x16x128_iu8_A0_overlaps_D1
+    ; GFX1251: early-clobber $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = V_SWMMAC_I32_16X16X128_IU8_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr32_vgpr33, 0, 0, 0, 0, 0, implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: $vgpr0 = V_ADD_F32_e32 $vgpr34, $vgpr35, implicit $mode, implicit $exec
     $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = V_SWMMAC_I32_16X16X128_IU8_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr32_vgpr33, 0, 0, 0, 0, 0, implicit $exec
     $vgpr0 = V_ADD_F32_e32 $vgpr34, $vgpr35, implicit $mode, implicit $exec
 ...
@@ -912,6 +1748,26 @@ body: |
     ; GFX1250-NEXT: V_NOP_e32 implicit $exec
     ; GFX1250-NEXT: V_NOP_e32 implicit $exec
     ; GFX1250-NEXT: $vgpr8 = V_ADD_F32_e32 $vgpr34, $vgpr35, implicit $mode, implicit $exec
+    ;
+    ; GFX1251-LABEL: name: test_swmmac_i32_16x16x128_iu8_B0_overlaps_D1
+    ; GFX1251: early-clobber $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = V_SWMMAC_I32_16X16X128_IU8_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr32_vgpr33, 0, 0, 0, 0, 0, implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: $vgpr8 = V_ADD_F32_e32 $vgpr34, $vgpr35, implicit $mode, implicit $exec
     $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = V_SWMMAC_I32_16X16X128_IU8_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr32_vgpr33, 0, 0, 0, 0, 0, implicit $exec
     $vgpr8 = V_ADD_F32_e32 $vgpr34, $vgpr35, implicit $mode, implicit $exec
 ...
@@ -927,6 +1783,26 @@ body: |
     ; GFX1250-NEXT: V_NOP_e32 implicit $exec
     ; GFX1250-NEXT: V_NOP_e32 implicit $exec
     ; GFX1250-NEXT: $vgpr32 = V_ADD_F32_e32 $vgpr34, $vgpr35, implicit $mode, implicit $exec
+    ;
+    ; GFX1251-LABEL: name: test_swmmac_i32_16x16x128_iu8_Index0_overlaps_D1
+    ; GFX1251: early-clobber $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = V_SWMMAC_I32_16X16X128_IU8_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr32_vgpr33, 0, 0, 0, 0, 0, implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: $vgpr32 = V_ADD_F32_e32 $vgpr34, $vgpr35, implicit $mode, implicit $exec
     $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = V_SWMMAC_I32_16X16X128_IU8_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr32_vgpr33, 0, 0, 0, 0, 0, implicit $exec
     $vgpr32 = V_ADD_F32_e32 $vgpr34, $vgpr35, implicit $mode, implicit $exec
 ...
@@ -942,6 +1818,18 @@ body: |
     ; GFX1250-NEXT: V_NOP_e32 implicit $exec
     ; GFX1250-NEXT: V_NOP_e32 implicit $exec
     ; GFX1250-NEXT: $vgpr8 = V_EXP_F32_e32 $vgpr34, implicit $mode, implicit $exec
+    ;
+    ; GFX1251-LABEL: name: test_wmma_trans_B0_overlaps_D1
+    ; GFX1251: early-clobber $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = V_WMMA_F32_16X16X32_BF16_w32_twoaddr $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, 0, 0, 0, 0, implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: $vgpr8 = V_EXP_F32_e32 $vgpr34, implicit $mode, implicit $exec
     $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = V_WMMA_F32_16X16X32_BF16_w32_twoaddr $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, 0, 0, 0, 0, implicit $exec
     $vgpr8 = V_EXP_F32_e32 $vgpr34, implicit $mode, implicit $exec
 ...
@@ -950,9 +1838,16 @@ body: |
 name: test_wmma_tdm_load_D0_overlaps_Use1
 body: |
   bb.0:
+    ; GFX125X-LABEL: name: test_wmma_tdm_load_D0_overlaps_Use1
+    ; GFX125X: early-clobber $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = V_WMMA_F32_16X16X32_BF16_w32_twoaddr $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, 0, 0, 0, 0, implicit $exec
+    ; GFX125X-NEXT: GLOBAL_STORE_ASYNC_FROM_LDS_B8_SADDR killed $sgpr0_sgpr1, killed $vgpr24, killed $vgpr0, 16, 0, implicit-def dead $asynccnt, implicit $exec, implicit $asynccnt
     ; GFX1250-LABEL: name: test_wmma_tdm_load_D0_overlaps_Use1
     ; GFX1250: early-clobber $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = V_WMMA_F32_16X16X32_BF16_w32_twoaddr $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, 0, 0, 0, 0, implicit $exec
     ; GFX1250-NEXT: GLOBAL_STORE_ASYNC_FROM_LDS_B8_SADDR killed $sgpr0_sgpr1, killed $vgpr24, killed $vgpr0, 16, 0, implicit-def dead $asynccnt, implicit $exec, implicit $asynccnt
+    ;
+    ; GFX1251-LABEL: name: test_wmma_tdm_load_D0_overlaps_Use1
+    ; GFX1251: early-clobber $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = V_WMMA_F32_16X16X32_BF16_w32_twoaddr $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, 0, 0, 0, 0, implicit $exec
+    ; GFX1251-NEXT: GLOBAL_STORE_ASYNC_FROM_LDS_B8_SADDR killed $sgpr0_sgpr1, killed $vgpr24, killed $vgpr0, 16, 0, implicit-def dead $asynccnt, implicit $exec, implicit $asynccnt
     $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = V_WMMA_F32_16X16X32_BF16_w32_twoaddr $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, 0, 0, 0, 0, implicit $exec
     GLOBAL_STORE_ASYNC_FROM_LDS_B8_SADDR killed $sgpr0_sgpr1, killed $vgpr24, killed $vgpr0, 16, 0, implicit-def dead $asynccnt, implicit $exec, implicit $asynccnt
 ...
diff --git a/llvm/test/CodeGen/AMDGPU/wmma-hazards-gfx1250-w32.mir b/llvm/test/CodeGen/AMDGPU/wmma-hazards-gfx1250-w32.mir
index 81196564740a9..306ea07a19ad5 100644
--- a/llvm/test/CodeGen/AMDGPU/wmma-hazards-gfx1250-w32.mir
+++ b/llvm/test/CodeGen/AMDGPU/wmma-hazards-gfx1250-w32.mir
@@ -1,5 +1,9 @@
 # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
 # RUN: llc -mtriple=amdgcn -mcpu=gfx1250 -run-pass post-RA-hazard-rec %s -o - | FileCheck -check-prefix=GFX1250 %s
+# RUN: llc -mtriple=amdgcn -mcpu=gfx1251 -run-pass post-RA-hazard-rec %s -o - | FileCheck -check-prefix=GFX1251 %s
+
+# Same hazards as gfx1251
+# RUN: llc -mtriple=amdgcn -mcpu=gfx12-5-generic -run-pass post-RA-hazard-rec %s -o - | FileCheck -check-prefix=GFX1251 %s
 
 # For two consecutive wmma instructions, we need to insert one V_NOP instruction between
 # them if matrix A, B or index of the second wmma are the same or overlap with previous
@@ -13,6 +17,11 @@ body: |
     ; GFX1250: early-clobber $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11 = V_WMMA_F32_16X16X4_F32_w32_twoaddr killed $vgpr0_vgpr1, killed $vgpr2_vgpr3, 8, killed $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11, 0, 0, 0, 0, implicit $exec
     ; GFX1250-NEXT: V_NOP_e32 implicit $exec
     ; GFX1250-NEXT: early-clobber $vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25 = V_WMMA_F32_16X16X4_F32_w32_twoaddr killed $vgpr4_vgpr5, killed $vgpr16_vgpr17, 8, killed $vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25, 0, 0, 0, 0, implicit $exec
+    ;
+    ; GFX1251-LABEL: name: test_wmma_f32_16x16x4_f32_D0_overlaps_A1
+    ; GFX1251: early-clobber $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11 = V_WMMA_F32_16X16X4_F32_w32_twoaddr killed $vgpr0_vgpr1, killed $vgpr2_vgpr3, 8, killed $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11, 0, 0, 0, 0, implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: early-clobber $vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25 = V_WMMA_F32_16X16X4_F32_w32_twoaddr killed $vgpr4_vgpr5, killed $vgpr16_vgpr17, 8, killed $vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25, 0, 0, 0, 0, implicit $exec
     $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11 = V_WMMA_F32_16X16X4_F32_w32_twoaddr killed $vgpr0_vgpr1, killed $vgpr2_vgpr3, 8, killed $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11, 0, 0, 0, 0, implicit $exec
     $vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25 = V_WMMA_F32_16X16X4_F32_w32_twoaddr killed $vgpr4_vgpr5, killed $vgpr16_vgpr17, 8, killed $vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25, 0, 0, 0, 0, implicit $exec
 ...
@@ -25,6 +34,11 @@ body: |
     ; GFX1250: early-clobber $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11 = V_WMMA_F32_16X16X4_F32_w32_twoaddr killed $vgpr0_vgpr1, killed $vgpr2_vgpr3, 8, killed $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11, 0, 0, 0, 0, implicit $exec
     ; GFX1250-NEXT: V_NOP_e32 implicit $exec
     ; GFX1250-NEXT: early-clobber $vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25 = V_WMMA_F32_16X16X4_F32_w32_twoaddr killed $vgpr14_vgpr15, killed $vgpr4_vgpr5, 8, killed $vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25, 0, 0, 0, 0, implicit $exec
+    ;
+    ; GFX1251-LABEL: name: test_wmma_f32_16x16x4_f32_D0_overlaps_B1
+    ; GFX1251: early-clobber $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11 = V_WMMA_F32_16X16X4_F32_w32_twoaddr killed $vgpr0_vgpr1, killed $vgpr2_vgpr3, 8, killed $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11, 0, 0, 0, 0, implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: early-clobber $vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25 = V_WMMA_F32_16X16X4_F32_w32_twoaddr killed $vgpr14_vgpr15, killed $vgpr4_vgpr5, 8, killed $vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25, 0, 0, 0, 0, implicit $exec
     $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11 = V_WMMA_F32_16X16X4_F32_w32_twoaddr killed $vgpr0_vgpr1, killed $vgpr2_vgpr3, 8, killed $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11, 0, 0, 0, 0, implicit $exec
     $vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25 = V_WMMA_F32_16X16X4_F32_w32_twoaddr killed $vgpr14_vgpr15, killed $vgpr4_vgpr5, 8, killed $vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25, 0, 0, 0, 0, implicit $exec
 ...
@@ -37,6 +51,11 @@ body: |
     ; GFX1250: early-clobber $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11 = V_WMMA_F32_16X16X4_F32_w32_twoaddr killed $vgpr0_vgpr1, killed $vgpr2_vgpr3, 8, killed $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11, 0, 0, 0, 0, implicit $exec
     ; GFX1250-NEXT: V_NOP_e32 implicit $exec
     ; GFX1250-NEXT: early-clobber $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63 = V_SWMMAC_F32_16X16X128_FP8_FP8_w32_twoaddr killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55, killed $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, killed $vgpr4_vgpr5, 0, 0, 0, implicit $exec
+    ;
+    ; GFX1251-LABEL: name: test_wmma_f32_16x16x4_f32_D0_overlaps_Index1
+    ; GFX1251: early-clobber $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11 = V_WMMA_F32_16X16X4_F32_w32_twoaddr killed $vgpr0_vgpr1, killed $vgpr2_vgpr3, 8, killed $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11, 0, 0, 0, 0, implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: early-clobber $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63 = V_SWMMAC_F32_16X16X128_FP8_FP8_w32_twoaddr killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55, killed $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, killed $vgpr4_vgpr5, 0, 0, 0, implicit $exec
     $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11 = V_WMMA_F32_16X16X4_F32_w32_twoaddr killed $vgpr0_vgpr1, killed $vgpr2_vgpr3, 8, killed $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11, 0, 0, 0, 0, implicit $exec
    $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63 = V_SWMMAC_F32_16X16X128_FP8_FP8_w32_twoaddr killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55, killed $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, killed $vgpr4_vgpr5, 0, 0, 0, implicit $exec
 ...
@@ -53,6 +72,19 @@ body: |
     ; GFX1250-NEXT: V_NOP_e32 implicit $exec
     ; GFX1250-NEXT: V_NOP_e32 implicit $exec
     ; GFX1250-NEXT: early-clobber $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49 = V_WMMA_F32_16X16X32_BF16_w32_twoaddr killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41, 8, killed $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49, 0, 0, 0, 0, implicit $exec
+    ;
+    ; GFX1251-LABEL: name: test_wmma_f32_16x16x32_bf16_D0_overlaps_A1
+    ; GFX1251: early-clobber $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X32_BF16_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: early-clobber $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49 = V_WMMA_F32_16X16X32_BF16_w32_twoaddr killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41, 8, killed $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49, 0, 0, 0, 0, implicit $exec
     $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X32_BF16_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec
     $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49 = V_WMMA_F32_16X16X32_BF16_w32_twoaddr killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41, 8, killed $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49, 0, 0, 0, 0, implicit $exec
 ...
@@ -69,6 +101,19 @@ body: |
     ; GFX1250-NEXT: V_NOP_e32 implicit $exec
     ; GFX1250-NEXT: V_NOP_e32 implicit $exec
     ; GFX1250-NEXT: early-clobber $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49 = V_WMMA_F32_16X16X32_BF16_w32_twoaddr killed $vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 8, killed $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49, 0, 0, 0, 0, implicit $exec
+    ;
+    ; GFX1251-LABEL: name: test_wmma_f32_16x16x32_bf16_D0_overlaps_B1
+    ; GFX1251: early-clobber $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X32_BF16_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: early-clobber $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49 = V_WMMA_F32_16X16X32_BF16_w32_twoaddr killed $vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 8, killed $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49, 0, 0, 0, 0, implicit $exec
     $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X32_BF16_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec
     $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49 = V_WMMA_F32_16X16X32_BF16_w32_twoaddr killed $vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 8, killed $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49, 0, 0, 0, 0, implicit $exec
 ...
@@ -85,6 +130,19 @@ body: |
     ; GFX1250-NEXT: V_NOP_e32 implicit $exec
     ; GFX1250-NEXT: V_NOP_e32 implicit $exec
     ; GFX1250-NEXT: early-clobber $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63 = V_SWMMAC_F32_16X16X128_FP8_FP8_w32_twoaddr killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55, killed $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, killed $vgpr16_vgpr17, 0, 0, 0, implicit $exec
+    ;
+    ; GFX1251-LABEL: name: test_wmma_f32_16x16x32_bf16_D0_overlaps_Index1
+    ; GFX1251: early-clobber $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X32_BF16_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: early-clobber $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63 = V_SWMMAC_F32_16X16X128_FP8_FP8_w32_twoaddr killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55, killed $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, killed $vgpr16_vgpr17, 0, 0, 0, implicit $exec
     $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X32_BF16_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec
     $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63 = V_SWMMAC_F32_16X16X128_FP8_FP8_w32_twoaddr killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55, killed $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, killed $vgpr16_vgpr17, 0, 0, 0, implicit $exec
 ...
@@ -101,6 +159,19 @@ body: |
     ; GFX1250-NEXT: V_NOP_e32 implicit $exec
     ; GFX1250-NEXT: V_NOP_e32 implicit $exec
     ; GFX1250-NEXT: early-clobber $vgpr38_vgpr39_vgpr40_vgpr41 = V_WMMA_BF16_16X16X32_BF16_w32_twoaddr killed $vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29, killed $vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37, 8, killed $vgpr38_vgpr39_vgpr40_vgpr41, 0, 0, 0, 0, implicit $exec
+    ;
+    ; GFX1251-LABEL: name: test_wmma_bf16_16x16x32_bf16_D0_overlaps_A1
+    ; GFX1251: early-clobber $vgpr22_vgpr23_vgpr24_vgpr25 = V_WMMA_BF16_16X16X32_BF16_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr22_vgpr23_vgpr24_vgpr25, 0, 0, 0, 0, implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: early-clobber $vgpr38_vgpr39_vgpr40_vgpr41 = V_WMMA_BF16_16X16X32_BF16_w32_twoaddr killed $vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29, killed $vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37, 8, killed $vgpr38_vgpr39_vgpr40_vgpr41, 0, 0, 0, 0, implicit $exec
     $vgpr22_vgpr23_vgpr24_vgpr25 = V_WMMA_BF16_16X16X32_BF16_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr22_vgpr23_vgpr24_vgpr25, 0, 0, 0, 0, implicit $exec
     $vgpr38_vgpr39_vgpr40_vgpr41 = V_WMMA_BF16_16X16X32_BF16_w32_twoaddr killed $vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29, killed $vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37, 8, killed $vgpr38_vgpr39_vgpr40_vgpr41, 0, 0, 0, 0, implicit $exec
 ...
@@ -117,6 +188,19 @@ body: |
     ; GFX1250-NEXT: V_NOP_e32 implicit $exec
     ; GFX1250-NEXT: V_NOP_e32 implicit $exec
     ; GFX1250-NEXT: early-clobber $vgpr38_vgpr39_vgpr40_vgpr41 = V_WMMA_BF16_16X16X32_BF16_w32_twoaddr killed $vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29, killed $vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37, 8, killed $vgpr38_vgpr39_vgpr40_vgpr41, 0, 0, 0, 0, implicit $exec
+    ;
+    ; GFX1251-LABEL: name: test_wmma_bf16_16x16x32_bf16_D0_overlaps_B1
+    ; GFX1251: early-clobber $vgpr30_vgpr31_vgpr32_vgpr33 = V_WMMA_BF16_16X16X32_BF16_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr30_vgpr31_vgpr32_vgpr33, 0, 0, 0, 0, implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: early-clobber $vgpr38_vgpr39_vgpr40_vgpr41 = V_WMMA_BF16_16X16X32_BF16_w32_twoaddr killed $vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29, killed $vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37, 8, killed $vgpr38_vgpr39_vgpr40_vgpr41, 0, 0, 0, 0, implicit $exec
     $vgpr30_vgpr31_vgpr32_vgpr33 = V_WMMA_BF16_16X16X32_BF16_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr30_vgpr31_vgpr32_vgpr33, 0, 0, 0, 0, implicit $exec
     $vgpr38_vgpr39_vgpr40_vgpr41 = V_WMMA_BF16_16X16X32_BF16_w32_twoaddr killed $vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29, killed $vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37, 8, killed $vgpr38_vgpr39_vgpr40_vgpr41, 0, 0, 0, 0, implicit $exec
 ...
@@ -133,6 +217,19 @@ body: |
     ; GFX1250-NEXT: V_NOP_e32 implicit $exec
     ; GFX1250-NEXT: V_NOP_e32 implicit $exec
     ; GFX1250-NEXT: early-clobber $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63 = V_SWMMAC_F32_16X16X128_FP8_FP8_w32_twoaddr killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55, killed $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, killed $vgpr22_vgpr23, 0, 0, 0, implicit $exec
+    ;
+    ; GFX1251-LABEL: name: test_wmma_bf16_16x16x32_bf16_D0_overlaps_Index1
+    ; GFX1251: early-clobber $vgpr22_vgpr23_vgpr24_vgpr25 = V_WMMA_BF16_16X16X32_BF16_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr22_vgpr23_vgpr24_vgpr25, 0, 0, 0, 0, implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: early-clobber $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63 = V_SWMMAC_F32_16X16X128_FP8_FP8_w32_twoaddr killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55, killed $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, killed $vgpr22_vgpr23, 0, 0, 0, implicit $exec
     $vgpr22_vgpr23_vgpr24_vgpr25 = V_WMMA_BF16_16X16X32_BF16_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr22_vgpr23_vgpr24_vgpr25, 0, 0, 0, 0, implicit $exec
     $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63 = V_SWMMAC_F32_16X16X128_FP8_FP8_w32_twoaddr killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55, killed $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, killed $vgpr22_vgpr23, 0, 0, 0, implicit $exec
 ...
@@ -149,6 +246,19 @@ body:             |
     ; GFX1250-NEXT: V_NOP_e32 implicit $exec
     ; GFX1250-NEXT: V_NOP_e32 implicit $exec
     ; GFX1250-NEXT: early-clobber $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_BF16F32_16X16X32_BF16_w32_twoaddr killed $vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33, killed $vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41, 9, killed $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49, 0, 0, 0, 0, implicit $exec
+    ;
+    ; GFX1251-LABEL: name: test_wmma_bf16f32_16x16x32_bf16_D0_overlaps_A1
+    ; GFX1251: early-clobber $vgpr26_vgpr27_vgpr28_vgpr29 = V_WMMA_BF16F32_16X16X32_BF16_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: early-clobber $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_BF16F32_16X16X32_BF16_w32_twoaddr killed $vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33, killed $vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41, 9, killed $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49, 0, 0, 0, 0, implicit $exec
     $vgpr26_vgpr27_vgpr28_vgpr29 = V_WMMA_BF16F32_16X16X32_BF16_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec
     $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_BF16F32_16X16X32_BF16_w32_twoaddr killed $vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33, killed $vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41, 9, killed $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49, 0, 0, 0, 0, implicit $exec
 ...
@@ -165,6 +275,19 @@ body:             |
     ; GFX1250-NEXT: V_NOP_e32 implicit $exec
     ; GFX1250-NEXT: V_NOP_e32 implicit $exec
     ; GFX1250-NEXT: early-clobber $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_BF16F32_16X16X32_BF16_w32_twoaddr killed $vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33, killed $vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41, 9, killed $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49, 0, 0, 0, 0, implicit $exec
+    ;
+    ; GFX1251-LABEL: name: test_wmma_bf16f32_16x16x32_bf16_D0_overlaps_B1
+    ; GFX1251: early-clobber $vgpr34_vgpr35_vgpr36_vgpr37 = V_WMMA_BF16F32_16X16X32_BF16_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: early-clobber $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_BF16F32_16X16X32_BF16_w32_twoaddr killed $vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33, killed $vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41, 9, killed $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49, 0, 0, 0, 0, implicit $exec
     $vgpr34_vgpr35_vgpr36_vgpr37 = V_WMMA_BF16F32_16X16X32_BF16_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec
     $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_BF16F32_16X16X32_BF16_w32_twoaddr killed $vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33, killed $vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41, 9, killed $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49, 0, 0, 0, 0, implicit $exec
 ...
@@ -181,6 +304,19 @@ body:             |
     ; GFX1250-NEXT: V_NOP_e32 implicit $exec
     ; GFX1250-NEXT: V_NOP_e32 implicit $exec
     ; GFX1250-NEXT: early-clobber $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63 = V_SWMMAC_F32_16X16X128_FP8_FP8_w32_twoaddr killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55, killed $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, killed $vgpr26_vgpr27, 0, 0, 0, implicit $exec
+    ;
+    ; GFX1251-LABEL: name: test_wmma_bf16f32_16x16x32_bf16_D0_overlaps_Index1
+    ; GFX1251: early-clobber $vgpr26_vgpr27_vgpr28_vgpr29 = V_WMMA_BF16F32_16X16X32_BF16_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: early-clobber $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63 = V_SWMMAC_F32_16X16X128_FP8_FP8_w32_twoaddr killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55, killed $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, killed $vgpr26_vgpr27, 0, 0, 0, implicit $exec
     $vgpr26_vgpr27_vgpr28_vgpr29 = V_WMMA_BF16F32_16X16X32_BF16_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec
     $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63 = V_SWMMAC_F32_16X16X128_FP8_FP8_w32_twoaddr killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55, killed $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, killed $vgpr26_vgpr27, 0, 0, 0, implicit $exec
 ...
@@ -197,6 +333,19 @@ body: |
     ; GFX1250-NEXT: V_NOP_e32 implicit $exec
     ; GFX1250-NEXT: V_NOP_e32 implicit $exec
     ; GFX1250-NEXT: early-clobber $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49 = V_WMMA_F32_16X16X64_FP8_FP8_w32_twoaddr killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41, 8, killed $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49, 0, 0, 0, 0, implicit $exec
+    ;
+    ; GFX1251-LABEL: name: test_wmma_f32_16x16x64_fp8_fp8_D0_overlaps_A1
+    ; GFX1251: early-clobber $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X64_FP8_FP8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: early-clobber $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49 = V_WMMA_F32_16X16X64_FP8_FP8_w32_twoaddr killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41, 8, killed $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49, 0, 0, 0, 0, implicit $exec
     $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X64_FP8_FP8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec
     $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49 = V_WMMA_F32_16X16X64_FP8_FP8_w32_twoaddr killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41, 8, killed $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49, 0, 0, 0, 0, implicit $exec
 ...
@@ -213,6 +362,19 @@ body: |
     ; GFX1250-NEXT: V_NOP_e32 implicit $exec
     ; GFX1250-NEXT: V_NOP_e32 implicit $exec
     ; GFX1250-NEXT: early-clobber $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49 = V_WMMA_F32_16X16X64_FP8_FP8_w32_twoaddr killed $vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 8, killed $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49, 0, 0, 0, 0, implicit $exec
+    ;
+    ; GFX1251-LABEL: name: test_wmma_f32_16x16x64_fp8_fp8_D0_overlaps_B1
+    ; GFX1251: early-clobber $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X64_FP8_FP8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: early-clobber $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49 = V_WMMA_F32_16X16X64_FP8_FP8_w32_twoaddr killed $vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 8, killed $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49, 0, 0, 0, 0, implicit $exec
     $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X64_FP8_FP8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec
     $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49 = V_WMMA_F32_16X16X64_FP8_FP8_w32_twoaddr killed $vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 8, killed $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49, 0, 0, 0, 0, implicit $exec
 ...
@@ -229,6 +391,19 @@ body: |
     ; GFX1250-NEXT: V_NOP_e32 implicit $exec
     ; GFX1250-NEXT: V_NOP_e32 implicit $exec
     ; GFX1250-NEXT: early-clobber $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63 = V_SWMMAC_F32_16X16X128_FP8_FP8_w32_twoaddr killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55, killed $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, killed $vgpr16_vgpr17, 0, 0, 0, implicit $exec
+    ;
+    ; GFX1251-LABEL: name: test_wmma_f32_16x16x64_fp8_fp8_D0_overlaps_Index1
+    ; GFX1251: early-clobber $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X64_FP8_FP8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: early-clobber $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63 = V_SWMMAC_F32_16X16X128_FP8_FP8_w32_twoaddr killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55, killed $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, killed $vgpr16_vgpr17, 0, 0, 0, implicit $exec
     $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X64_FP8_FP8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec
     $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63 = V_SWMMAC_F32_16X16X128_FP8_FP8_w32_twoaddr killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55, killed $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, killed $vgpr16_vgpr17, 0, 0, 0, implicit $exec
 ...
@@ -245,6 +420,19 @@ body: |
     ; GFX1250-NEXT: V_NOP_e32 implicit $exec
     ; GFX1250-NEXT: V_NOP_e32 implicit $exec
     ; GFX1250-NEXT: early-clobber $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49 = V_WMMA_F32_16X16X64_FP8_BF8_w32_twoaddr killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41, 8, killed $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49, 0, 0, 0, 0, implicit $exec
+    ;
+    ; GFX1251-LABEL: name: test_wmma_f32_16x16x64_fp8_bf8_D0_overlaps_A1
+    ; GFX1251: early-clobber $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X64_FP8_BF8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: early-clobber $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49 = V_WMMA_F32_16X16X64_FP8_BF8_w32_twoaddr killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41, 8, killed $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49, 0, 0, 0, 0, implicit $exec
     $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X64_FP8_BF8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec
     $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49 = V_WMMA_F32_16X16X64_FP8_BF8_w32_twoaddr killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41, 8, killed $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49, 0, 0, 0, 0, implicit $exec
 ...
@@ -261,6 +449,19 @@ body: |
     ; GFX1250-NEXT: V_NOP_e32 implicit $exec
     ; GFX1250-NEXT: V_NOP_e32 implicit $exec
     ; GFX1250-NEXT: early-clobber $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49 = V_WMMA_F32_16X16X64_FP8_BF8_w32_twoaddr killed $vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 8, killed $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49, 0, 0, 0, 0, implicit $exec
+    ;
+    ; GFX1251-LABEL: name: test_wmma_f32_16x16x64_fp8_bf8_D0_overlaps_B1
+    ; GFX1251: early-clobber $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X64_FP8_BF8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: early-clobber $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49 = V_WMMA_F32_16X16X64_FP8_BF8_w32_twoaddr killed $vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 8, killed $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49, 0, 0, 0, 0, implicit $exec
     $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X64_FP8_BF8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec
     $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49 = V_WMMA_F32_16X16X64_FP8_BF8_w32_twoaddr killed $vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 8, killed $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49, 0, 0, 0, 0, implicit $exec
 ...
@@ -277,6 +478,19 @@ body: |
     ; GFX1250-NEXT: V_NOP_e32 implicit $exec
     ; GFX1250-NEXT: V_NOP_e32 implicit $exec
     ; GFX1250-NEXT: early-clobber $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63 = V_SWMMAC_F32_16X16X128_FP8_FP8_w32_twoaddr killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55, killed $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, killed $vgpr16_vgpr17, 0, 0, 0, implicit $exec
+    ;
+    ; GFX1251-LABEL: name: test_wmma_f32_16x16x64_fp8_bf8_D0_overlaps_Index1
+    ; GFX1251: early-clobber $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X64_FP8_BF8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: early-clobber $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63 = V_SWMMAC_F32_16X16X128_FP8_FP8_w32_twoaddr killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55, killed $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, killed $vgpr16_vgpr17, 0, 0, 0, implicit $exec
     $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X64_FP8_BF8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec
     $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63 = V_SWMMAC_F32_16X16X128_FP8_FP8_w32_twoaddr killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55, killed $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, killed $vgpr16_vgpr17, 0, 0, 0, implicit $exec
 ...
@@ -293,6 +507,19 @@ body: |
     ; GFX1250-NEXT: V_NOP_e32 implicit $exec
     ; GFX1250-NEXT: V_NOP_e32 implicit $exec
     ; GFX1250-NEXT: early-clobber $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49 = V_WMMA_F32_16X16X64_BF8_FP8_w32_twoaddr killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41, 8, killed $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49, 0, 0, 0, 0, implicit $exec
+    ;
+    ; GFX1251-LABEL: name: test_wmma_f32_16x16x64_bf8_fp8_D0_overlaps_A1
+    ; GFX1251: early-clobber $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X64_BF8_FP8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: early-clobber $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49 = V_WMMA_F32_16X16X64_BF8_FP8_w32_twoaddr killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41, 8, killed $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49, 0, 0, 0, 0, implicit $exec
     $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X64_BF8_FP8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec
     $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49 = V_WMMA_F32_16X16X64_BF8_FP8_w32_twoaddr killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41, 8, killed $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49, 0, 0, 0, 0, implicit $exec
 ...
@@ -309,6 +536,19 @@ body: |
     ; GFX1250-NEXT: V_NOP_e32 implicit $exec
     ; GFX1250-NEXT: V_NOP_e32 implicit $exec
     ; GFX1250-NEXT: early-clobber $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49 = V_WMMA_F32_16X16X64_BF8_FP8_w32_twoaddr killed $vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 8, killed $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49, 0, 0, 0, 0, implicit $exec
+    ;
+    ; GFX1251-LABEL: name: test_wmma_f32_16x16x64_bf8_fp8_D0_overlaps_B1
+    ; GFX1251: early-clobber $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X64_BF8_FP8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: early-clobber $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49 = V_WMMA_F32_16X16X64_BF8_FP8_w32_twoaddr killed $vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 8, killed $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49, 0, 0, 0, 0, implicit $exec
     $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X64_BF8_FP8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec
     $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49 = V_WMMA_F32_16X16X64_BF8_FP8_w32_twoaddr killed $vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 8, killed $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49, 0, 0, 0, 0, implicit $exec
 ...
@@ -325,6 +565,19 @@ body: |
     ; GFX1250-NEXT: V_NOP_e32 implicit $exec
     ; GFX1250-NEXT: V_NOP_e32 implicit $exec
     ; GFX1250-NEXT: early-clobber $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63 = V_SWMMAC_F32_16X16X128_FP8_FP8_w32_twoaddr killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55, killed $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, killed $vgpr16_vgpr17, 0, 0, 0, implicit $exec
+    ;
+    ; GFX1251-LABEL: name: test_wmma_f32_16x16x64_bf8_fp8_D0_overlaps_Index1
+    ; GFX1251: early-clobber $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X64_BF8_FP8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: early-clobber $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63 = V_SWMMAC_F32_16X16X128_FP8_FP8_w32_twoaddr killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55, killed $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, killed $vgpr16_vgpr17, 0, 0, 0, implicit $exec
     $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X64_BF8_FP8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec
     $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63 = V_SWMMAC_F32_16X16X128_FP8_FP8_w32_twoaddr killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55, killed $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, killed $vgpr16_vgpr17, 0, 0, 0, implicit $exec
 ...
@@ -341,6 +594,19 @@ body: |
     ; GFX1250-NEXT: V_NOP_e32 implicit $exec
     ; GFX1250-NEXT: V_NOP_e32 implicit $exec
     ; GFX1250-NEXT: early-clobber $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49 = V_WMMA_F32_16X16X64_BF8_BF8_w32_twoaddr killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41, 8, killed $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49, 0, 0, 0, 0, implicit $exec
+    ;
+    ; GFX1251-LABEL: name: test_wmma_f32_16x16x64_bf8_bf8_D0_overlaps_A1
+    ; GFX1251: early-clobber $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X64_BF8_BF8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: early-clobber $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49 = V_WMMA_F32_16X16X64_BF8_BF8_w32_twoaddr killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41, 8, killed $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49, 0, 0, 0, 0, implicit $exec
     $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X64_BF8_BF8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec
     $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49 = V_WMMA_F32_16X16X64_BF8_BF8_w32_twoaddr killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41, 8, killed $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49, 0, 0, 0, 0, implicit $exec
 ...
@@ -357,6 +623,19 @@ body: |
     ; GFX1250-NEXT: V_NOP_e32 implicit $exec
     ; GFX1250-NEXT: V_NOP_e32 implicit $exec
     ; GFX1250-NEXT: early-clobber $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49 = V_WMMA_F32_16X16X64_BF8_BF8_w32_twoaddr killed $vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 8, killed $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49, 0, 0, 0, 0, implicit $exec
+    ;
+    ; GFX1251-LABEL: name: test_wmma_f32_16x16x64_bf8_bf8_D0_overlaps_B1
+    ; GFX1251: early-clobber $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X64_BF8_BF8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: early-clobber $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49 = V_WMMA_F32_16X16X64_BF8_BF8_w32_twoaddr killed $vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 8, killed $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49, 0, 0, 0, 0, implicit $exec
     $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X64_BF8_BF8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec
     $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49 = V_WMMA_F32_16X16X64_BF8_BF8_w32_twoaddr killed $vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 8, killed $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49, 0, 0, 0, 0, implicit $exec
 ...
@@ -373,6 +652,19 @@ body: |
     ; GFX1250-NEXT: V_NOP_e32 implicit $exec
     ; GFX1250-NEXT: V_NOP_e32 implicit $exec
     ; GFX1250-NEXT: early-clobber $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63 = V_SWMMAC_F32_16X16X128_FP8_FP8_w32_twoaddr killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55, killed $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, killed $vgpr16_vgpr17, 0, 0, 0, implicit $exec
+    ;
+    ; GFX1251-LABEL: name: test_wmma_f32_16x16x64_bf8_bf8_D0_overlaps_Index1
+    ; GFX1251: early-clobber $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X64_BF8_BF8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: early-clobber $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63 = V_SWMMAC_F32_16X16X128_FP8_FP8_w32_twoaddr killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55, killed $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, killed $vgpr16_vgpr17, 0, 0, 0, implicit $exec
     $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X64_BF8_BF8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec
     $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63 = V_SWMMAC_F32_16X16X128_FP8_FP8_w32_twoaddr killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55, killed $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, killed $vgpr16_vgpr17, 0, 0, 0, implicit $exec
 ...
@@ -389,6 +681,19 @@ body: |
     ; GFX1250-NEXT: V_NOP_e32 implicit $exec
     ; GFX1250-NEXT: V_NOP_e32 implicit $exec
     ; GFX1250-NEXT: early-clobber $vgpr38_vgpr39_vgpr40_vgpr41 = V_WMMA_F16_16X16X64_FP8_FP8_w32_twoaddr killed $vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29, killed $vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37, 8, killed $vgpr38_vgpr39_vgpr40_vgpr41, 0, 0, 0, 0, implicit $exec
+    ;
+    ; GFX1251-LABEL: name: test_wmma_f16_16x16x64_fp8_fp8_D0_overlaps_A1
+    ; GFX1251: early-clobber $vgpr22_vgpr23_vgpr24_vgpr25 = V_WMMA_F16_16X16X64_FP8_FP8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr22_vgpr23_vgpr24_vgpr25, 0, 0, 0, 0, implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: early-clobber $vgpr38_vgpr39_vgpr40_vgpr41 = V_WMMA_F16_16X16X64_FP8_FP8_w32_twoaddr killed $vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29, killed $vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37, 8, killed $vgpr38_vgpr39_vgpr40_vgpr41, 0, 0, 0, 0, implicit $exec
     $vgpr22_vgpr23_vgpr24_vgpr25 = V_WMMA_F16_16X16X64_FP8_FP8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr22_vgpr23_vgpr24_vgpr25, 0, 0, 0, 0, implicit $exec
     $vgpr38_vgpr39_vgpr40_vgpr41 = V_WMMA_F16_16X16X64_FP8_FP8_w32_twoaddr killed $vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29, killed $vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37, 8, killed $vgpr38_vgpr39_vgpr40_vgpr41, 0, 0, 0, 0, implicit $exec
 ...
@@ -405,6 +710,19 @@ body: |
     ; GFX1250-NEXT: V_NOP_e32 implicit $exec
     ; GFX1250-NEXT: V_NOP_e32 implicit $exec
     ; GFX1250-NEXT: early-clobber $vgpr38_vgpr39_vgpr40_vgpr41 = V_WMMA_F16_16X16X64_FP8_FP8_w32_twoaddr killed $vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29, killed $vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37, 8, killed $vgpr38_vgpr39_vgpr40_vgpr41, 0, 0, 0, 0, implicit $exec
+    ;
+    ; GFX1251-LABEL: name: test_wmma_f16_16x16x64_fp8_fp8_D0_overlaps_B1
+    ; GFX1251: early-clobber $vgpr30_vgpr31_vgpr32_vgpr33 = V_WMMA_F16_16X16X64_FP8_FP8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr30_vgpr31_vgpr32_vgpr33, 0, 0, 0, 0, implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: early-clobber $vgpr38_vgpr39_vgpr40_vgpr41 = V_WMMA_F16_16X16X64_FP8_FP8_w32_twoaddr killed $vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29, killed $vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37, 8, killed $vgpr38_vgpr39_vgpr40_vgpr41, 0, 0, 0, 0, implicit $exec
     $vgpr30_vgpr31_vgpr32_vgpr33 = V_WMMA_F16_16X16X64_FP8_FP8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr30_vgpr31_vgpr32_vgpr33, 0, 0, 0, 0, implicit $exec
     $vgpr38_vgpr39_vgpr40_vgpr41 = V_WMMA_F16_16X16X64_FP8_FP8_w32_twoaddr killed $vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29, killed $vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37, 8, killed $vgpr38_vgpr39_vgpr40_vgpr41, 0, 0, 0, 0, implicit $exec
 ...
@@ -421,6 +739,19 @@ body: |
     ; GFX1250-NEXT: V_NOP_e32 implicit $exec
     ; GFX1250-NEXT: V_NOP_e32 implicit $exec
     ; GFX1250-NEXT: early-clobber $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63 = V_SWMMAC_F32_16X16X128_FP8_FP8_w32_twoaddr killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55, killed $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, killed $vgpr22_vgpr23, 0, 0, 0, implicit $exec
+    ;
+    ; GFX1251-LABEL: name: test_wmma_f16_16x16x64_fp8_fp8_D0_overlaps_Index1
+    ; GFX1251: early-clobber $vgpr22_vgpr23_vgpr24_vgpr25 = V_WMMA_F16_16X16X64_FP8_FP8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr22_vgpr23_vgpr24_vgpr25, 0, 0, 0, 0, implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: early-clobber $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63 = V_SWMMAC_F32_16X16X128_FP8_FP8_w32_twoaddr killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55, killed $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, killed $vgpr22_vgpr23, 0, 0, 0, implicit $exec
     $vgpr22_vgpr23_vgpr24_vgpr25 = V_WMMA_F16_16X16X64_FP8_FP8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr22_vgpr23_vgpr24_vgpr25, 0, 0, 0, 0, implicit $exec
     $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63 = V_SWMMAC_F32_16X16X128_FP8_FP8_w32_twoaddr killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55, killed $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, killed $vgpr22_vgpr23, 0, 0, 0, implicit $exec
 ...
@@ -437,6 +768,19 @@ body: |
     ; GFX1250-NEXT: V_NOP_e32 implicit $exec
     ; GFX1250-NEXT: V_NOP_e32 implicit $exec
     ; GFX1250-NEXT: early-clobber $vgpr38_vgpr39_vgpr40_vgpr41 = V_WMMA_F16_16X16X64_FP8_BF8_w32_twoaddr killed $vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29, killed $vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37, 8, killed $vgpr38_vgpr39_vgpr40_vgpr41, 0, 0, 0, 0, implicit $exec
+    ;
+    ; GFX1251-LABEL: name: test_wmma_f16_16x16x64_fp8_bf8_D0_overlaps_A1
+    ; GFX1251: early-clobber $vgpr22_vgpr23_vgpr24_vgpr25 = V_WMMA_F16_16X16X64_FP8_BF8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr22_vgpr23_vgpr24_vgpr25, 0, 0, 0, 0, implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: early-clobber $vgpr38_vgpr39_vgpr40_vgpr41 = V_WMMA_F16_16X16X64_FP8_BF8_w32_twoaddr killed $vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29, killed $vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37, 8, killed $vgpr38_vgpr39_vgpr40_vgpr41, 0, 0, 0, 0, implicit $exec
     $vgpr22_vgpr23_vgpr24_vgpr25 = V_WMMA_F16_16X16X64_FP8_BF8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr22_vgpr23_vgpr24_vgpr25, 0, 0, 0, 0, implicit $exec
     $vgpr38_vgpr39_vgpr40_vgpr41 = V_WMMA_F16_16X16X64_FP8_BF8_w32_twoaddr killed $vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29, killed $vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37, 8, killed $vgpr38_vgpr39_vgpr40_vgpr41, 0, 0, 0, 0, implicit $exec
 ...
@@ -453,6 +797,19 @@ body: |
     ; GFX1250-NEXT: V_NOP_e32 implicit $exec
     ; GFX1250-NEXT: V_NOP_e32 implicit $exec
     ; GFX1250-NEXT: early-clobber $vgpr38_vgpr39_vgpr40_vgpr41 = V_WMMA_F16_16X16X64_FP8_BF8_w32_twoaddr killed $vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29, killed $vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37, 8, killed $vgpr38_vgpr39_vgpr40_vgpr41, 0, 0, 0, 0, implicit $exec
+    ;
+    ; GFX1251-LABEL: name: test_wmma_f16_16x16x64_fp8_bf8_D0_overlaps_B1
+    ; GFX1251: early-clobber $vgpr30_vgpr31_vgpr32_vgpr33 = V_WMMA_F16_16X16X64_FP8_BF8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr30_vgpr31_vgpr32_vgpr33, 0, 0, 0, 0, implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: early-clobber $vgpr38_vgpr39_vgpr40_vgpr41 = V_WMMA_F16_16X16X64_FP8_BF8_w32_twoaddr killed $vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29, killed $vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37, 8, killed $vgpr38_vgpr39_vgpr40_vgpr41, 0, 0, 0, 0, implicit $exec
     $vgpr30_vgpr31_vgpr32_vgpr33 = V_WMMA_F16_16X16X64_FP8_BF8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr30_vgpr31_vgpr32_vgpr33, 0, 0, 0, 0, implicit $exec
     $vgpr38_vgpr39_vgpr40_vgpr41 = V_WMMA_F16_16X16X64_FP8_BF8_w32_twoaddr killed $vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29, killed $vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37, 8, killed $vgpr38_vgpr39_vgpr40_vgpr41, 0, 0, 0, 0, implicit $exec
 ...
@@ -469,6 +826,19 @@ body: |
     ; GFX1250-NEXT: V_NOP_e32 implicit $exec
     ; GFX1250-NEXT: V_NOP_e32 implicit $exec
     ; GFX1250-NEXT: early-clobber $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63 = V_SWMMAC_F32_16X16X128_FP8_FP8_w32_twoaddr killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55, killed $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, killed $vgpr22_vgpr23, 0, 0, 0, implicit $exec
+    ;
+    ; GFX1251-LABEL: name: test_wmma_f16_16x16x64_fp8_bf8_D0_overlaps_Index1
+    ; GFX1251: early-clobber $vgpr22_vgpr23_vgpr24_vgpr25 = V_WMMA_F16_16X16X64_FP8_BF8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr22_vgpr23_vgpr24_vgpr25, 0, 0, 0, 0, implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: early-clobber $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63 = V_SWMMAC_F32_16X16X128_FP8_FP8_w32_twoaddr killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55, killed $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, killed $vgpr22_vgpr23, 0, 0, 0, implicit $exec
     $vgpr22_vgpr23_vgpr24_vgpr25 = V_WMMA_F16_16X16X64_FP8_BF8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr22_vgpr23_vgpr24_vgpr25, 0, 0, 0, 0, implicit $exec
     $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63 = V_SWMMAC_F32_16X16X128_FP8_FP8_w32_twoaddr killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55, killed $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, killed $vgpr22_vgpr23, 0, 0, 0, implicit $exec
 ...
@@ -485,6 +855,19 @@ body: |
     ; GFX1250-NEXT: V_NOP_e32 implicit $exec
     ; GFX1250-NEXT: V_NOP_e32 implicit $exec
     ; GFX1250-NEXT: early-clobber $vgpr38_vgpr39_vgpr40_vgpr41 = V_WMMA_F16_16X16X64_BF8_FP8_w32_twoaddr killed $vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29, killed $vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37, 8, killed $vgpr38_vgpr39_vgpr40_vgpr41, 0, 0, 0, 0, implicit $exec
+    ;
+    ; GFX1251-LABEL: name: test_wmma_f16_16x16x64_fb8_fp8_D0_overlaps_A1
+    ; GFX1251: early-clobber $vgpr22_vgpr23_vgpr24_vgpr25 = V_WMMA_F16_16X16X64_BF8_FP8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr22_vgpr23_vgpr24_vgpr25, 0, 0, 0, 0, implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: early-clobber $vgpr38_vgpr39_vgpr40_vgpr41 = V_WMMA_F16_16X16X64_BF8_FP8_w32_twoaddr killed $vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29, killed $vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37, 8, killed $vgpr38_vgpr39_vgpr40_vgpr41, 0, 0, 0, 0, implicit $exec
     $vgpr22_vgpr23_vgpr24_vgpr25 = V_WMMA_F16_16X16X64_BF8_FP8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr22_vgpr23_vgpr24_vgpr25, 0, 0, 0, 0, implicit $exec
     $vgpr38_vgpr39_vgpr40_vgpr41 = V_WMMA_F16_16X16X64_BF8_FP8_w32_twoaddr killed $vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29, killed $vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37, 8, killed $vgpr38_vgpr39_vgpr40_vgpr41, 0, 0, 0, 0, implicit $exec
 ...
@@ -501,6 +884,19 @@ body: |
     ; GFX1250-NEXT: V_NOP_e32 implicit $exec
     ; GFX1250-NEXT: V_NOP_e32 implicit $exec
     ; GFX1250-NEXT: early-clobber $vgpr38_vgpr39_vgpr40_vgpr41 = V_WMMA_F16_16X16X64_BF8_FP8_w32_twoaddr killed $vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29, killed $vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37, 8, killed $vgpr38_vgpr39_vgpr40_vgpr41, 0, 0, 0, 0, implicit $exec
+    ;
+    ; GFX1251-LABEL: name: test_wmma_f16_16x16x64_bf8_fp8_D0_overlaps_B1
+    ; GFX1251: early-clobber $vgpr30_vgpr31_vgpr32_vgpr33 = V_WMMA_F16_16X16X64_BF8_FP8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr30_vgpr31_vgpr32_vgpr33, 0, 0, 0, 0, implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: early-clobber $vgpr38_vgpr39_vgpr40_vgpr41 = V_WMMA_F16_16X16X64_BF8_FP8_w32_twoaddr killed $vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29, killed $vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37, 8, killed $vgpr38_vgpr39_vgpr40_vgpr41, 0, 0, 0, 0, implicit $exec
     $vgpr30_vgpr31_vgpr32_vgpr33 = V_WMMA_F16_16X16X64_BF8_FP8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr30_vgpr31_vgpr32_vgpr33, 0, 0, 0, 0, implicit $exec
     $vgpr38_vgpr39_vgpr40_vgpr41 = V_WMMA_F16_16X16X64_BF8_FP8_w32_twoaddr killed $vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29, killed $vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37, 8, killed $vgpr38_vgpr39_vgpr40_vgpr41, 0, 0, 0, 0, implicit $exec
 ...
@@ -517,6 +913,19 @@ body: |
     ; GFX1250-NEXT: V_NOP_e32 implicit $exec
     ; GFX1250-NEXT: V_NOP_e32 implicit $exec
     ; GFX1250-NEXT: early-clobber $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63 = V_SWMMAC_F32_16X16X128_FP8_FP8_w32_twoaddr killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55, killed $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, killed $vgpr22_vgpr23, 0, 0, 0, implicit $exec
+    ;
+    ; GFX1251-LABEL: name: test_wmma_f16_16x16x64_fb8_fp8_D0_overlaps_Index1
+    ; GFX1251: early-clobber $vgpr22_vgpr23_vgpr24_vgpr25 = V_WMMA_F16_16X16X64_BF8_FP8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr22_vgpr23_vgpr24_vgpr25, 0, 0, 0, 0, implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: early-clobber $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63 = V_SWMMAC_F32_16X16X128_FP8_FP8_w32_twoaddr killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55, killed $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, killed $vgpr22_vgpr23, 0, 0, 0, implicit $exec
     $vgpr22_vgpr23_vgpr24_vgpr25 = V_WMMA_F16_16X16X64_BF8_FP8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr22_vgpr23_vgpr24_vgpr25, 0, 0, 0, 0, implicit $exec
     $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63 = V_SWMMAC_F32_16X16X128_FP8_FP8_w32_twoaddr killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55, killed $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, killed $vgpr22_vgpr23, 0, 0, 0, implicit $exec
 ...
@@ -533,6 +942,19 @@ body: |
     ; GFX1250-NEXT: V_NOP_e32 implicit $exec
     ; GFX1250-NEXT: V_NOP_e32 implicit $exec
     ; GFX1250-NEXT: early-clobber $vgpr38_vgpr39_vgpr40_vgpr41 = V_WMMA_F16_16X16X64_BF8_BF8_w32_twoaddr killed $vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29, killed $vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37, 8, killed $vgpr38_vgpr39_vgpr40_vgpr41, 0, 0, 0, 0, implicit $exec
+    ;
+    ; GFX1251-LABEL: name: test_wmma_f16_16x16x64_bf8_bf8_D0_overlaps_A1
+    ; GFX1251: early-clobber $vgpr22_vgpr23_vgpr24_vgpr25 = V_WMMA_F16_16X16X64_BF8_BF8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr22_vgpr23_vgpr24_vgpr25, 0, 0, 0, 0, implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: early-clobber $vgpr38_vgpr39_vgpr40_vgpr41 = V_WMMA_F16_16X16X64_BF8_BF8_w32_twoaddr killed $vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29, killed $vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37, 8, killed $vgpr38_vgpr39_vgpr40_vgpr41, 0, 0, 0, 0, implicit $exec
     $vgpr22_vgpr23_vgpr24_vgpr25 = V_WMMA_F16_16X16X64_BF8_BF8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr22_vgpr23_vgpr24_vgpr25, 0, 0, 0, 0, implicit $exec
     $vgpr38_vgpr39_vgpr40_vgpr41 = V_WMMA_F16_16X16X64_BF8_BF8_w32_twoaddr killed $vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29, killed $vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37, 8, killed $vgpr38_vgpr39_vgpr40_vgpr41, 0, 0, 0, 0, implicit $exec
 ...
@@ -549,6 +971,19 @@ body: |
     ; GFX1250-NEXT: V_NOP_e32 implicit $exec
     ; GFX1250-NEXT: V_NOP_e32 implicit $exec
     ; GFX1250-NEXT: early-clobber $vgpr38_vgpr39_vgpr40_vgpr41 = V_WMMA_F16_16X16X64_BF8_BF8_w32_twoaddr killed $vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29, killed $vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37, 8, killed $vgpr38_vgpr39_vgpr40_vgpr41, 0, 0, 0, 0, implicit $exec
+    ;
+    ; GFX1251-LABEL: name: test_wmma_f16_16x16x64_bf8_bf8_D0_overlaps_B1
+    ; GFX1251: early-clobber $vgpr30_vgpr31_vgpr32_vgpr33 = V_WMMA_F16_16X16X64_BF8_BF8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr30_vgpr31_vgpr32_vgpr33, 0, 0, 0, 0, implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: early-clobber $vgpr38_vgpr39_vgpr40_vgpr41 = V_WMMA_F16_16X16X64_BF8_BF8_w32_twoaddr killed $vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29, killed $vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37, 8, killed $vgpr38_vgpr39_vgpr40_vgpr41, 0, 0, 0, 0, implicit $exec
     $vgpr30_vgpr31_vgpr32_vgpr33 = V_WMMA_F16_16X16X64_BF8_BF8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr30_vgpr31_vgpr32_vgpr33, 0, 0, 0, 0, implicit $exec
     $vgpr38_vgpr39_vgpr40_vgpr41 = V_WMMA_F16_16X16X64_BF8_BF8_w32_twoaddr killed $vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29, killed $vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37, 8, killed $vgpr38_vgpr39_vgpr40_vgpr41, 0, 0, 0, 0, implicit $exec
 ...
@@ -565,6 +1000,19 @@ body: |
     ; GFX1250-NEXT: V_NOP_e32 implicit $exec
     ; GFX1250-NEXT: V_NOP_e32 implicit $exec
     ; GFX1250-NEXT: early-clobber $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63 = V_SWMMAC_F32_16X16X128_FP8_FP8_w32_twoaddr killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55, killed $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, killed $vgpr22_vgpr23, 0, 0, 0, implicit $exec
+    ;
+    ; GFX1251-LABEL: name: test_wmma_f16_16x16x64_bf8_bf8_D0_overlaps_Index1
+    ; GFX1251: early-clobber $vgpr22_vgpr23_vgpr24_vgpr25 = V_WMMA_F16_16X16X64_BF8_BF8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr22_vgpr23_vgpr24_vgpr25, 0, 0, 0, 0, implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: early-clobber $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63 = V_SWMMAC_F32_16X16X128_FP8_FP8_w32_twoaddr killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55, killed $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, killed $vgpr22_vgpr23, 0, 0, 0, implicit $exec
     $vgpr22_vgpr23_vgpr24_vgpr25 = V_WMMA_F16_16X16X64_BF8_BF8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr22_vgpr23_vgpr24_vgpr25, 0, 0, 0, 0, implicit $exec
     $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63 = V_SWMMAC_F32_16X16X128_FP8_FP8_w32_twoaddr killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55, killed $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, killed $vgpr22_vgpr23, 0, 0, 0, implicit $exec
 ...
@@ -585,6 +1033,27 @@ body: |
     ; GFX1250-NEXT: V_NOP_e32 implicit $exec
     ; GFX1250-NEXT: V_NOP_e32 implicit $exec
     ; GFX1250-NEXT: early-clobber $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49 = V_WMMA_I32_16X16X64_IU8_w32_twoaddr 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 8, killed $vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41, killed $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49, 0, 0, 0, 0, implicit $exec
+    ;
+    ; GFX1251-LABEL: name: test_wmma_I32_16x16x64_IU8_D0_overlaps_A1
+    ; GFX1251: early-clobber $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_I32_16X16X64_IU8_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: early-clobber $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49 = V_WMMA_I32_16X16X64_IU8_w32_twoaddr 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 8, killed $vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41, killed $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49, 0, 0, 0, 0, implicit $exec
     $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_I32_16X16X64_IU8_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec
     $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49 = V_WMMA_I32_16X16X64_IU8_w32_twoaddr 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 8, killed $vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41, killed $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49, 0, 0, 0, 0, implicit $exec
 ...
@@ -605,6 +1074,27 @@ body: |
     ; GFX1250-NEXT: V_NOP_e32 implicit $exec
     ; GFX1250-NEXT: V_NOP_e32 implicit $exec
     ; GFX1250-NEXT: early-clobber $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49 = V_WMMA_I32_16X16X64_IU8_w32_twoaddr 8, killed $vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49, 0, 0, 0, 0, implicit $exec
+    ;
+    ; GFX1251-LABEL: name: test_wmma_I32_16x16x64_IU8_D0_overlaps_B1
+    ; GFX1251: early-clobber $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_I32_16X16X64_IU8_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: early-clobber $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49 = V_WMMA_I32_16X16X64_IU8_w32_twoaddr 8, killed $vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49, 0, 0, 0, 0, implicit $exec
     $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_I32_16X16X64_IU8_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec
     $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49 = V_WMMA_I32_16X16X64_IU8_w32_twoaddr 8, killed $vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49, 0, 0, 0, 0, implicit $exec
 ...
@@ -625,6 +1115,27 @@ body: |
     ; GFX1250-NEXT: V_NOP_e32 implicit $exec
     ; GFX1250-NEXT: V_NOP_e32 implicit $exec
     ; GFX1250-NEXT: early-clobber $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63 = V_SWMMAC_F32_16X16X128_FP8_FP8_w32_twoaddr killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55, killed $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, killed $vgpr16_vgpr17, 0, 0, 0, implicit $exec
+    ;
+    ; GFX1251-LABEL: name: test_wmma_I32_16x16x64_IU8_D0_overlaps_Index1
+    ; GFX1251: early-clobber $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_I32_16X16X64_IU8_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: early-clobber $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63 = V_SWMMAC_F32_16X16X128_FP8_FP8_w32_twoaddr killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55, killed $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, killed $vgpr16_vgpr17, 0, 0, 0, implicit $exec
     $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_I32_16X16X64_IU8_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec
     $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63 = V_SWMMAC_F32_16X16X128_FP8_FP8_w32_twoaddr killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55, killed $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, killed $vgpr16_vgpr17, 0, 0, 0, implicit $exec
 ...
@@ -641,6 +1152,19 @@ body: |
     ; GFX1250-NEXT: V_NOP_e32 implicit $exec
     ; GFX1250-NEXT: V_NOP_e32 implicit $exec
     ; GFX1250-NEXT: early-clobber $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49 = V_WMMA_F32_16X16X32_F16_w32_twoaddr killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41, 8, killed $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49, 0, 0, 0, 0, implicit $exec
+    ;
+    ; GFX1251-LABEL: name: test_wmma_f32_16x16x32_f16_D0_overlaps_A1
+    ; GFX1251: early-clobber $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X32_F16_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: early-clobber $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49 = V_WMMA_F32_16X16X32_F16_w32_twoaddr killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41, 8, killed $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49, 0, 0, 0, 0, implicit $exec
     $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X32_F16_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec
     $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49 = V_WMMA_F32_16X16X32_F16_w32_twoaddr killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41, 8, killed $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49, 0, 0, 0, 0, implicit $exec
 ...
@@ -657,6 +1181,19 @@ body: |
     ; GFX1250-NEXT: V_NOP_e32 implicit $exec
     ; GFX1250-NEXT: V_NOP_e32 implicit $exec
     ; GFX1250-NEXT: early-clobber $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49 = V_WMMA_F32_16X16X32_F16_w32_twoaddr killed $vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 8, killed $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49, 0, 0, 0, 0, implicit $exec
+    ;
+    ; GFX1251-LABEL: name: test_wmma_f32_16x16x32_f16_D0_overlaps_B1
+    ; GFX1251: early-clobber $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X32_F16_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: early-clobber $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49 = V_WMMA_F32_16X16X32_F16_w32_twoaddr killed $vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 8, killed $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49, 0, 0, 0, 0, implicit $exec
     $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X32_F16_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec
     $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49 = V_WMMA_F32_16X16X32_F16_w32_twoaddr killed $vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 8, killed $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49, 0, 0, 0, 0, implicit $exec
 ...
@@ -673,6 +1210,19 @@ body: |
     ; GFX1250-NEXT: V_NOP_e32 implicit $exec
     ; GFX1250-NEXT: V_NOP_e32 implicit $exec
     ; GFX1250-NEXT: early-clobber $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63 = V_SWMMAC_F32_16X16X128_FP8_FP8_w32_twoaddr killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55, killed $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, killed $vgpr16_vgpr17, 0, 0, 0, implicit $exec
+    ;
+    ; GFX1251-LABEL: name: test_wmma_f32_16x16x32_f16_D0_overlaps_Index1
+    ; GFX1251: early-clobber $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X32_F16_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: early-clobber $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63 = V_SWMMAC_F32_16X16X128_FP8_FP8_w32_twoaddr killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55, killed $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, killed $vgpr16_vgpr17, 0, 0, 0, implicit $exec
     $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X32_F16_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec
     $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63 = V_SWMMAC_F32_16X16X128_FP8_FP8_w32_twoaddr killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55, killed $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, killed $vgpr16_vgpr17, 0, 0, 0, implicit $exec
 ...
@@ -689,6 +1239,19 @@ body: |
     ; GFX1250-NEXT: V_NOP_e32 implicit $exec
     ; GFX1250-NEXT: V_NOP_e32 implicit $exec
     ; GFX1250-NEXT: early-clobber $vgpr38_vgpr39_vgpr40_vgpr41 = V_WMMA_F16_16X16X32_F16_w32_twoaddr killed $vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29, killed $vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37, 8, killed $vgpr38_vgpr39_vgpr40_vgpr41, 0, 0, 0, 0, implicit $exec
+    ;
+    ; GFX1251-LABEL: name: test_wmma_f16_16x16x32_f16_D0_overlaps_A1
+    ; GFX1251: early-clobber $vgpr22_vgpr23_vgpr24_vgpr25 = V_WMMA_F16_16X16X32_F16_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr22_vgpr23_vgpr24_vgpr25, 0, 0, 0, 0, implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: early-clobber $vgpr38_vgpr39_vgpr40_vgpr41 = V_WMMA_F16_16X16X32_F16_w32_twoaddr killed $vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29, killed $vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37, 8, killed $vgpr38_vgpr39_vgpr40_vgpr41, 0, 0, 0, 0, implicit $exec
     $vgpr22_vgpr23_vgpr24_vgpr25 = V_WMMA_F16_16X16X32_F16_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr22_vgpr23_vgpr24_vgpr25, 0, 0, 0, 0, implicit $exec
     $vgpr38_vgpr39_vgpr40_vgpr41 = V_WMMA_F16_16X16X32_F16_w32_twoaddr killed $vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29, killed $vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37, 8, killed $vgpr38_vgpr39_vgpr40_vgpr41, 0, 0, 0, 0, implicit $exec
 ...
@@ -705,6 +1268,19 @@ body: |
     ; GFX1250-NEXT: V_NOP_e32 implicit $exec
     ; GFX1250-NEXT: V_NOP_e32 implicit $exec
     ; GFX1250-NEXT: early-clobber $vgpr38_vgpr39_vgpr40_vgpr41 = V_WMMA_F16_16X16X32_F16_w32_twoaddr killed $vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29, killed $vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37, 8, killed $vgpr38_vgpr39_vgpr40_vgpr41, 0, 0, 0, 0, implicit $exec
+    ;
+    ; GFX1251-LABEL: name: test_wmma_f16_16x16x32_f16_D0_overlaps_B1
+    ; GFX1251: early-clobber $vgpr30_vgpr31_vgpr32_vgpr33 = V_WMMA_F16_16X16X32_F16_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr30_vgpr31_vgpr32_vgpr33, 0, 0, 0, 0, implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: early-clobber $vgpr38_vgpr39_vgpr40_vgpr41 = V_WMMA_F16_16X16X32_F16_w32_twoaddr killed $vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29, killed $vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37, 8, killed $vgpr38_vgpr39_vgpr40_vgpr41, 0, 0, 0, 0, implicit $exec
     $vgpr30_vgpr31_vgpr32_vgpr33 = V_WMMA_F16_16X16X32_F16_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr30_vgpr31_vgpr32_vgpr33, 0, 0, 0, 0, implicit $exec
     $vgpr38_vgpr39_vgpr40_vgpr41 = V_WMMA_F16_16X16X32_F16_w32_twoaddr killed $vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29, killed $vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37, 8, killed $vgpr38_vgpr39_vgpr40_vgpr41, 0, 0, 0, 0, implicit $exec
 ...
@@ -721,6 +1297,19 @@ body: |
     ; GFX1250-NEXT: V_NOP_e32 implicit $exec
     ; GFX1250-NEXT: V_NOP_e32 implicit $exec
     ; GFX1250-NEXT: early-clobber $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63 = V_SWMMAC_F32_16X16X128_FP8_FP8_w32_twoaddr killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55, killed $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, killed $vgpr22_vgpr23, 0, 0, 0, implicit $exec
+    ;
+    ; GFX1251-LABEL: name: test_wmma_f16_16x16x32_f16_D0_overlaps_Index1
+    ; GFX1251: early-clobber $vgpr22_vgpr23_vgpr24_vgpr25 = V_WMMA_F16_16X16X32_F16_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr22_vgpr23_vgpr24_vgpr25, 0, 0, 0, 0, implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: early-clobber $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63 = V_SWMMAC_F32_16X16X128_FP8_FP8_w32_twoaddr killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55, killed $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, killed $vgpr22_vgpr23, 0, 0, 0, implicit $exec
     $vgpr22_vgpr23_vgpr24_vgpr25 = V_WMMA_F16_16X16X32_F16_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr22_vgpr23_vgpr24_vgpr25, 0, 0, 0, 0, implicit $exec
     $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63 = V_SWMMAC_F32_16X16X128_FP8_FP8_w32_twoaddr killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55, killed $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, killed $vgpr22_vgpr23, 0, 0, 0, implicit $exec
 ...
@@ -741,6 +1330,27 @@ body: |
     ; GFX1250-NEXT: V_NOP_e32 implicit $exec
     ; GFX1250-NEXT: V_NOP_e32 implicit $exec
     ; GFX1250-NEXT: early-clobber $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71 = V_WMMA_F32_16X16X128_F8F6F4_f8_f8_w32_twoaddr killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, killed $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, 8, killed $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71, 1, 2, 0, 0, implicit $exec
+    ;
+    ; GFX1251-LABEL: name: test_wmma_F32_16x16x128_F8F6F4_F8_D0_overlaps_A1
+    ; GFX1251: early-clobber $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_WMMA_F32_16X16X128_F8F6F4_f8_f8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, 0, 0, 0, 0, implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: early-clobber $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71 = V_WMMA_F32_16X16X128_F8F6F4_f8_f8_w32_twoaddr killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, killed $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, 8, killed $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71, 1, 2, 0, 0, implicit $exec
     $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_WMMA_F32_16X16X128_F8F6F4_f8_f8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, 0, 0, 0, 0, implicit $exec
     $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71 = V_WMMA_F32_16X16X128_F8F6F4_f8_f8_w32_twoaddr killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, killed $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, 8, killed $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71, 1, 2, 0, 0, implicit $exec
 ...
@@ -761,6 +1371,27 @@ body: |
     ; GFX1250-NEXT: V_NOP_e32 implicit $exec
     ; GFX1250-NEXT: V_NOP_e32 implicit $exec
     ; GFX1250-NEXT: early-clobber $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71 = V_WMMA_F32_16X16X128_F8F6F4_f8_f8_w32_twoaddr killed $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, 8, killed $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71, 1, 2, 0, 0, implicit $exec
+    ;
+    ; GFX1251-LABEL: name: test_wmma_F32_16x16x128_F8F6F4_F8_D0_overlaps_B1
+    ; GFX1251: early-clobber $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_WMMA_F32_16X16X128_F8F6F4_f8_f8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, 0, 0, 0, 0, implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: early-clobber $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71 = V_WMMA_F32_16X16X128_F8F6F4_f8_f8_w32_twoaddr killed $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, 8, killed $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71, 1, 2, 0, 0, implicit $exec
     $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_WMMA_F32_16X16X128_F8F6F4_f8_f8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, 0, 0, 0, 0, implicit $exec
     $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71 = V_WMMA_F32_16X16X128_F8F6F4_f8_f8_w32_twoaddr killed $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, 8, killed $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71, 1, 2, 0, 0, implicit $exec
 ...
@@ -781,6 +1412,27 @@ body: |
     ; GFX1250-NEXT: V_NOP_e32 implicit $exec
     ; GFX1250-NEXT: V_NOP_e32 implicit $exec
     ; GFX1250-NEXT: early-clobber $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63 = V_SWMMAC_F32_16X16X128_FP8_FP8_w32_twoaddr killed $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71, killed $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55, killed $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, killed $vgpr32_vgpr33, 0, 0, 0, implicit $exec
+    ;
+    ; GFX1251-LABEL: name: test_wmma_F32_16x16x128_F8F6F4_F8_D0_overlaps_Index1
+    ; GFX1251: early-clobber $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_WMMA_F32_16X16X128_F8F6F4_f8_f8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, 0, 0, 0, 0, implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: early-clobber $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63 = V_SWMMAC_F32_16X16X128_FP8_FP8_w32_twoaddr killed $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71, killed $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55, killed $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, killed $vgpr32_vgpr33, 0, 0, 0, implicit $exec
     $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_WMMA_F32_16X16X128_F8F6F4_f8_f8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, 0, 0, 0, 0, implicit $exec
     $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63 = V_SWMMAC_F32_16X16X128_FP8_FP8_w32_twoaddr killed $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71, killed $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55, killed $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, killed $vgpr32_vgpr33, 0, 0, 0, implicit $exec
 ...
@@ -797,6 +1449,27 @@ body: |
     ; GFX1250-NEXT: V_NOP_e32 implicit $exec
     ; GFX1250-NEXT: V_NOP_e32 implicit $exec
     ; GFX1250-NEXT: early-clobber $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71 = V_WMMA_F32_16X16X128_F8F6F4_f8_f8_w32_twoaddr killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, killed $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, 8, killed $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71, 1, 2, 0, 0, implicit $exec
+    ;
+    ; GFX1251-LABEL: name: test_wmma_F32_16x16x128_F8F6F4_F6F4_D0_overlaps_A1
+    ; GFX1251: early-clobber $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_WMMA_F32_16X16X128_F8F6F4_f8_f8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, 2, 2, 0, 0, implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: early-clobber $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71 = V_WMMA_F32_16X16X128_F8F6F4_f8_f8_w32_twoaddr killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, killed $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, 8, killed $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71, 1, 2, 0, 0, implicit $exec
     $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_WMMA_F32_16X16X128_F8F6F4_f8_f8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, 2, 2, 0, 0, implicit $exec
     $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71 = V_WMMA_F32_16X16X128_F8F6F4_f8_f8_w32_twoaddr killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, killed $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, 8, killed $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71, 1, 2, 0, 0, implicit $exec
 ...
@@ -813,6 +1486,27 @@ body: |
     ; GFX1250-NEXT: V_NOP_e32 implicit $exec
     ; GFX1250-NEXT: V_NOP_e32 implicit $exec
     ; GFX1250-NEXT: early-clobber $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71 = V_WMMA_F32_16X16X128_F8F6F4_f8_f8_w32_twoaddr killed $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, 8, killed $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71, 1, 2, 0, 0, implicit $exec
+    ;
+    ; GFX1251-LABEL: name: test_wmma_F32_16x16x128_F8F6F4_F6F4_D0_overlaps_B1
+    ; GFX1251: early-clobber $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_WMMA_F32_16X16X128_F8F6F4_f8_f8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, 2, 2, 0, 0, implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: early-clobber $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71 = V_WMMA_F32_16X16X128_F8F6F4_f8_f8_w32_twoaddr killed $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, 8, killed $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71, 1, 2, 0, 0, implicit $exec
     $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_WMMA_F32_16X16X128_F8F6F4_f8_f8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, 2, 2, 0, 0, implicit $exec
     $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71 = V_WMMA_F32_16X16X128_F8F6F4_f8_f8_w32_twoaddr killed $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, 8, killed $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71, 1, 2, 0, 0, implicit $exec
 ...
@@ -829,6 +1523,27 @@ body: |
     ; GFX1250-NEXT: V_NOP_e32 implicit $exec
     ; GFX1250-NEXT: V_NOP_e32 implicit $exec
     ; GFX1250-NEXT: early-clobber $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63 = V_SWMMAC_F32_16X16X128_FP8_FP8_w32_twoaddr killed $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71, killed $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55, killed $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, killed $vgpr32_vgpr33, 0, 0, 0, implicit $exec
+    ;
+    ; GFX1251-LABEL: name: test_wmma_F32_16x16x128_F8F6F4_F6F4_D0_overlaps_Index1
+    ; GFX1251: early-clobber $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_WMMA_F32_16X16X128_F8F6F4_f8_f8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, 2, 2, 0, 0, implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: early-clobber $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63 = V_SWMMAC_F32_16X16X128_FP8_FP8_w32_twoaddr killed $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71, killed $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55, killed $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, killed $vgpr32_vgpr33, 0, 0, 0, implicit $exec
     $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_WMMA_F32_16X16X128_F8F6F4_f8_f8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, 2, 2, 0, 0, implicit $exec
     $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63 = V_SWMMAC_F32_16X16X128_FP8_FP8_w32_twoaddr killed $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71, killed $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55, killed $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, killed $vgpr32_vgpr33, 0, 0, 0, implicit $exec
 ...
@@ -849,6 +1564,27 @@ body: |
     ; GFX1250-NEXT: V_NOP_e32 implicit $exec
     ; GFX1250-NEXT: V_NOP_e32 implicit $exec
     ; GFX1250-NEXT: early-clobber $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71 = V_WMMA_F32_16X16X128_F8F6F4_f8_f8_w32_twoaddr killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, killed $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, 8, killed $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71, 1, 2, 0, 0, implicit $exec
+    ;
+    ; GFX1251-LABEL: name: test_wmma_scale_F32_16x16x128_F8F6F4_F8_D0_overlaps_A1
+    ; GFX1251: early-clobber $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_WMMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr40, killed $vgpr41, 1, 2, 0, 0, 1, 1, 0, 0, 0, 0, implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: early-clobber $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71 = V_WMMA_F32_16X16X128_F8F6F4_f8_f8_w32_twoaddr killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, killed $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, 8, killed $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71, 1, 2, 0, 0, implicit $exec
     $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_WMMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr40, killed $vgpr41, 1, 2, 0, 0, 1, 1, 0, 0, 0, 0, implicit $exec
     $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71 = V_WMMA_F32_16X16X128_F8F6F4_f8_f8_w32_twoaddr killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, killed $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, 8, killed $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71, 1, 2, 0, 0, implicit $exec
 ...
@@ -869,6 +1605,27 @@ body: |
     ; GFX1250-NEXT: V_NOP_e32 implicit $exec
     ; GFX1250-NEXT: V_NOP_e32 implicit $exec
     ; GFX1250-NEXT: early-clobber $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71 = V_WMMA_F32_16X16X128_F8F6F4_f8_f8_w32_twoaddr killed $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, 8, killed $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71, 1, 2, 0, 0, implicit $exec
+    ;
+    ; GFX1251-LABEL: name: test_wmma_scale_F32_16x16x128_F8F6F4_F8_D0_overlaps_B1
+    ; GFX1251: early-clobber $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_WMMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr40, killed $vgpr41, 1, 2, 0, 0, 1, 1, 0, 0, 0, 0, implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: early-clobber $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71 = V_WMMA_F32_16X16X128_F8F6F4_f8_f8_w32_twoaddr killed $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, 8, killed $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71, 1, 2, 0, 0, implicit $exec
     $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_WMMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr40, killed $vgpr41, 1, 2, 0, 0, 1, 1, 0, 0, 0, 0, implicit $exec
     $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71 = V_WMMA_F32_16X16X128_F8F6F4_f8_f8_w32_twoaddr killed $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, 8, killed $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71, 1, 2, 0, 0, implicit $exec
 ...
@@ -889,6 +1646,27 @@ body: |
     ; GFX1250-NEXT: V_NOP_e32 implicit $exec
     ; GFX1250-NEXT: V_NOP_e32 implicit $exec
     ; GFX1250-NEXT: early-clobber $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63 = V_SWMMAC_F32_16X16X128_FP8_FP8_w32_twoaddr killed $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71, killed $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55, killed $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, killed $vgpr32_vgpr33, 0, 0, 0, implicit $exec
+    ;
+    ; GFX1251-LABEL: name: test_wmma_scale_F32_16x16x128_F8F6F4_F8_D0_overlaps_Index1
+    ; GFX1251: early-clobber $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_WMMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr40, killed $vgpr81, 1, 2, 0, 0, 1, 1, 0, 0, 0, 0, implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: early-clobber $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63 = V_SWMMAC_F32_16X16X128_FP8_FP8_w32_twoaddr killed $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71, killed $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55, killed $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, killed $vgpr32_vgpr33, 0, 0, 0, implicit $exec
     $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_WMMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr40, killed $vgpr81, 1, 2, 0, 0, 1, 1, 0, 0, 0, 0, implicit $exec
     $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63 = V_SWMMAC_F32_16X16X128_FP8_FP8_w32_twoaddr killed $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71, killed $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55, killed $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, killed $vgpr32_vgpr33, 0, 0, 0, implicit $exec
 ...
@@ -905,6 +1683,27 @@ body: |
     ; GFX1250-NEXT: V_NOP_e32 implicit $exec
     ; GFX1250-NEXT: V_NOP_e32 implicit $exec
     ; GFX1250-NEXT: early-clobber $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71 = V_WMMA_F32_16X16X128_F8F6F4_f8_f8_w32_twoaddr killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, killed $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, 8, killed $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71, 1, 2, 0, 0, implicit $exec
+    ;
+    ; GFX1251-LABEL: name: test_wmma_scale_F32_16x16x128_F8F6F4_F6F4_D0_overlaps_A1
+    ; GFX1251: early-clobber $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_WMMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr40, killed $vgpr41, 2, 2, 0, 0, 1, 1, 0, 0, 0, 0, implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: early-clobber $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71 = V_WMMA_F32_16X16X128_F8F6F4_f8_f8_w32_twoaddr killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, killed $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, 8, killed $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71, 1, 2, 0, 0, implicit $exec
     $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_WMMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr40, killed $vgpr41, 2, 2, 0, 0, 1, 1, 0, 0, 0, 0, implicit $exec
     $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71 = V_WMMA_F32_16X16X128_F8F6F4_f8_f8_w32_twoaddr killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, killed $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, 8, killed $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71, 1, 2, 0, 0, implicit $exec
 ...
@@ -921,6 +1720,27 @@ body: |
     ; GFX1250-NEXT: V_NOP_e32 implicit $exec
     ; GFX1250-NEXT: V_NOP_e32 implicit $exec
     ; GFX1250-NEXT: early-clobber $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71 = V_WMMA_F32_16X16X128_F8F6F4_f8_f8_w32_twoaddr killed $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, 8, killed $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71, 1, 2, 0, 0, implicit $exec
+    ;
+    ; GFX1251-LABEL: name: test_wmma_scale_F32_16x16x128_F8F6F4_F6F4_D0_overlaps_B1
+    ; GFX1251: early-clobber $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_WMMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr40, killed $vgpr41, 2, 2, 0, 0, 1, 1, 0, 0, 0, 0, implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: early-clobber $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71 = V_WMMA_F32_16X16X128_F8F6F4_f8_f8_w32_twoaddr killed $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, 8, killed $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71, 1, 2, 0, 0, implicit $exec
     $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_WMMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr40, killed $vgpr41, 2, 2, 0, 0, 1, 1, 0, 0, 0, 0, implicit $exec
     $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71 = V_WMMA_F32_16X16X128_F8F6F4_f8_f8_w32_twoaddr killed $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, 8, killed $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71, 1, 2, 0, 0, implicit $exec
 ...
@@ -937,6 +1757,27 @@ body: |
     ; GFX1250-NEXT: V_NOP_e32 implicit $exec
     ; GFX1250-NEXT: V_NOP_e32 implicit $exec
     ; GFX1250-NEXT: early-clobber $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63 = V_SWMMAC_F32_16X16X128_FP8_FP8_w32_twoaddr killed $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71, killed $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55, killed $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, killed $vgpr32_vgpr33, 0, 0, 0, implicit $exec
+    ;
+    ; GFX1251-LABEL: name: test_wmma_scale_F32_16x16x128_F8F6F4_F6F4_D0_overlaps_Index1
+    ; GFX1251: early-clobber $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_WMMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr40, killed $vgpr81, 2, 2, 0, 0, 1, 1, 0, 0, 0, 0, implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: early-clobber $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63 = V_SWMMAC_F32_16X16X128_FP8_FP8_w32_twoaddr killed $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71, killed $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55, killed $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, killed $vgpr32_vgpr33, 0, 0, 0, implicit $exec
     $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_WMMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr40, killed $vgpr81, 2, 2, 0, 0, 1, 1, 0, 0, 0, 0, implicit $exec
     $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63 = V_SWMMAC_F32_16X16X128_FP8_FP8_w32_twoaddr killed $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71, killed $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55, killed $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, killed $vgpr32_vgpr33, 0, 0, 0, implicit $exec
 ...
@@ -957,6 +1798,27 @@ body: |
     ; GFX1250-NEXT: V_NOP_e32 implicit $exec
     ; GFX1250-NEXT: V_NOP_e32 implicit $exec
     ; GFX1250-NEXT: early-clobber $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71 = V_WMMA_F32_16X16X128_F8F6F4_f8_f8_w32_twoaddr killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, killed $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, 8, killed $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71, 1, 2, 0, 0, implicit $exec
+    ;
+    ; GFX1251-LABEL: name: test_wmma_scale16_F3216_16x16x128_F8F6F4_F8_D0_overlaps_A1
+    ; GFX1251: early-clobber $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_WMMA_SCALE16_F32_16X16X128_F8F6F4_f8_f8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr40_vgpr41, killed $vgpr42_vgpr43, 1, 2, 0, 0, 1, 1, 0, 0, 0, 0, implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: early-clobber $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71 = V_WMMA_F32_16X16X128_F8F6F4_f8_f8_w32_twoaddr killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, killed $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, 8, killed $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71, 1, 2, 0, 0, implicit $exec
     $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_WMMA_SCALE16_F32_16X16X128_F8F6F4_f8_f8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr40_vgpr41, killed $vgpr42_vgpr43, 1, 2, 0, 0, 1, 1, 0, 0, 0, 0, implicit $exec
     $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71 = V_WMMA_F32_16X16X128_F8F6F4_f8_f8_w32_twoaddr killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, killed $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, 8, killed $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71, 1, 2, 0, 0, implicit $exec
 ...
@@ -977,6 +1839,27 @@ body: |
     ; GFX1250-NEXT: V_NOP_e32 implicit $exec
     ; GFX1250-NEXT: V_NOP_e32 implicit $exec
     ; GFX1250-NEXT: early-clobber $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71 = V_WMMA_F32_16X16X128_F8F6F4_f8_f8_w32_twoaddr killed $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, 8, killed $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71, 1, 2, 0, 0, implicit $exec
+    ;
+    ; GFX1251-LABEL: name: test_wmma_scale16_F32_16x16x128_F8F6F4_F8_D0_overlaps_B1
+    ; GFX1251: early-clobber $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_WMMA_SCALE16_F32_16X16X128_F8F6F4_f8_f8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr40_vgpr41, killed $vgpr42_vgpr43, 1, 2, 0, 0, 1, 1, 0, 0, 0, 0, implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: early-clobber $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71 = V_WMMA_F32_16X16X128_F8F6F4_f8_f8_w32_twoaddr killed $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, 8, killed $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71, 1, 2, 0, 0, implicit $exec
     $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_WMMA_SCALE16_F32_16X16X128_F8F6F4_f8_f8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr40_vgpr41, killed $vgpr42_vgpr43, 1, 2, 0, 0, 1, 1, 0, 0, 0, 0, implicit $exec
     $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71 = V_WMMA_F32_16X16X128_F8F6F4_f8_f8_w32_twoaddr killed $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, 8, killed $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71, 1, 2, 0, 0, implicit $exec
 ...
@@ -997,6 +1880,27 @@ body: |
     ; GFX1250-NEXT: V_NOP_e32 implicit $exec
     ; GFX1250-NEXT: V_NOP_e32 implicit $exec
     ; GFX1250-NEXT: early-clobber $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63 = V_SWMMAC_F32_16X16X128_FP8_FP8_w32_twoaddr killed $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71, killed $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55, killed $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, killed $vgpr32_vgpr33, 0, 0, 0, implicit $exec
+    ;
+    ; GFX1251-LABEL: name: test_wmma_scale16_F32_16x16x128_F8F6F4_F8_D0_overlaps_Index1
+    ; GFX1251: early-clobber $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_WMMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr40, killed $vgpr41, 1, 2, 0, 0, 1, 1, 0, 0, 0, 0, implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: early-clobber $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63 = V_SWMMAC_F32_16X16X128_FP8_FP8_w32_twoaddr killed $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71, killed $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55, killed $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, killed $vgpr32_vgpr33, 0, 0, 0, implicit $exec
     $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_WMMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr40, killed $vgpr41, 1, 2, 0, 0, 1, 1, 0, 0, 0, 0, implicit $exec
     $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63 = V_SWMMAC_F32_16X16X128_FP8_FP8_w32_twoaddr killed $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71, killed $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55, killed $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, killed $vgpr32_vgpr33, 0, 0, 0, implicit $exec
 ...
@@ -1013,6 +1917,27 @@ body: |
     ; GFX1250-NEXT: V_NOP_e32 implicit $exec
     ; GFX1250-NEXT: V_NOP_e32 implicit $exec
     ; GFX1250-NEXT: early-clobber $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71 = V_WMMA_F32_16X16X128_F8F6F4_f8_f8_w32_twoaddr killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, killed $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, 8, killed $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71, 1, 2, 0, 0, implicit $exec
+    ;
+    ; GFX1251-LABEL: name: test_wmma_scale16_F3216_16x16x128_F8F6F4_F6f4_D0_overlaps_A1
+    ; GFX1251: early-clobber $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_WMMA_SCALE16_F32_16X16X128_F8F6F4_f8_f8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr40_vgpr41, killed $vgpr42_vgpr43, 2, 2, 0, 0, 1, 1, 0, 0, 0, 0, implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: early-clobber $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71 = V_WMMA_F32_16X16X128_F8F6F4_f8_f8_w32_twoaddr killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, killed $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, 8, killed $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71, 1, 2, 0, 0, implicit $exec
     $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_WMMA_SCALE16_F32_16X16X128_F8F6F4_f8_f8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr40_vgpr41, killed $vgpr42_vgpr43, 2, 2, 0, 0, 1, 1, 0, 0, 0, 0, implicit $exec
     $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71 = V_WMMA_F32_16X16X128_F8F6F4_f8_f8_w32_twoaddr killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, killed $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, 8, killed $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71, 1, 2, 0, 0, implicit $exec
 ...
@@ -1029,6 +1954,27 @@ body: |
     ; GFX1250-NEXT: V_NOP_e32 implicit $exec
     ; GFX1250-NEXT: V_NOP_e32 implicit $exec
     ; GFX1250-NEXT: early-clobber $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71 = V_WMMA_F32_16X16X128_F8F6F4_f8_f8_w32_twoaddr killed $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, 8, killed $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71, 1, 2, 0, 0, implicit $exec
+    ;
+    ; GFX1251-LABEL: name: test_wmma_scale16_F32_16x16x128_F8F6F4_F6f4_D0_overlaps_B1
+    ; GFX1251: early-clobber $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_WMMA_SCALE16_F32_16X16X128_F8F6F4_f8_f8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr40_vgpr41, killed $vgpr42_vgpr43, 2, 2, 0, 0, 1, 1, 0, 0, 0, 0, implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: early-clobber $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71 = V_WMMA_F32_16X16X128_F8F6F4_f8_f8_w32_twoaddr killed $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, 8, killed $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71, 1, 2, 0, 0, implicit $exec
     $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_WMMA_SCALE16_F32_16X16X128_F8F6F4_f8_f8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr40_vgpr41, killed $vgpr42_vgpr43, 2, 2, 0, 0, 1, 1, 0, 0, 0, 0, implicit $exec
     $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71 = V_WMMA_F32_16X16X128_F8F6F4_f8_f8_w32_twoaddr killed $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, 8, killed $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71, 1, 2, 0, 0, implicit $exec
 ...
@@ -1045,6 +1991,27 @@ body: |
     ; GFX1250-NEXT: V_NOP_e32 implicit $exec
     ; GFX1250-NEXT: V_NOP_e32 implicit $exec
     ; GFX1250-NEXT: early-clobber $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63 = V_SWMMAC_F32_16X16X128_FP8_FP8_w32_twoaddr killed $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71, killed $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55, killed $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, killed $vgpr32_vgpr33, 0, 0, 0, implicit $exec
+    ;
+    ; GFX1251-LABEL: name: test_wmma_scale16_F32_16x16x128_F8F6F4_F6f4_D0_overlaps_Index1
+    ; GFX1251: early-clobber $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_WMMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr40, killed $vgpr41, 2, 2, 0, 0, 1, 1, 0, 0, 0, 0, implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: early-clobber $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63 = V_SWMMAC_F32_16X16X128_FP8_FP8_w32_twoaddr killed $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71, killed $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55, killed $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, killed $vgpr32_vgpr33, 0, 0, 0, implicit $exec
     $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_WMMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr40, killed $vgpr41, 2, 2, 0, 0, 1, 1, 0, 0, 0, 0, implicit $exec
     $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63 = V_SWMMAC_F32_16X16X128_FP8_FP8_w32_twoaddr killed $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71, killed $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55, killed $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, killed $vgpr32_vgpr33, 0, 0, 0, implicit $exec
 ...
@@ -1059,6 +2026,19 @@ body: |
     ; GFX1250-NEXT: V_NOP_e32 implicit $exec
     ; GFX1250-NEXT: V_NOP_e32 implicit $exec
     ; GFX1250-NEXT: early-clobber $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71 = V_SWMMAC_F32_16X16X64_BF16_w32_twoaddr 8, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71, killed $vgpr88, 0, 0, 0, 0, 0, implicit $exec
+    ;
+    ; GFX1251-LABEL: name: test_swmmac_f32_16x16x64_bf16_D0_overlaps_A1
+    ; GFX1251: early-clobber $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = V_SWMMAC_F32_16X16X64_BF16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr88, 0, 0, 0, 0, 0, implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: early-clobber $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71 = V_SWMMAC_F32_16X16X64_BF16_w32_twoaddr 8, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71, killed $vgpr88, 0, 0, 0, 0, 0, implicit $exec
     $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = V_SWMMAC_F32_16X16X64_BF16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr88, 0, 0, 0, 0, 0, implicit $exec
     $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71 = V_SWMMAC_F32_16X16X64_BF16_w32_twoaddr 8, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71, killed $vgpr88, 0, 0, 0, 0, 0, implicit $exec
 ...
@@ -1073,6 +2053,19 @@ body: |
     ; GFX1250-NEXT: V_NOP_e32 implicit $exec
     ; GFX1250-NEXT: V_NOP_e32 implicit $exec
     ; GFX1250-NEXT: early-clobber $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71 = V_SWMMAC_F32_16X16X64_BF16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71, killed $vgpr88, 0, 0, 0, 0, 0, implicit $exec
+    ;
+    ; GFX1251-LABEL: name: test_swmmac_f32_16x16x64_bf16_D0_overlaps_B1
+    ; GFX1251: early-clobber $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = V_SWMMAC_F32_16X16X64_BF16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr88, 0, 0, 0, 0, 0, implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: early-clobber $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71 = V_SWMMAC_F32_16X16X64_BF16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71, killed $vgpr88, 0, 0, 0, 0, 0, implicit $exec
     $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = V_SWMMAC_F32_16X16X64_BF16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr88, 0, 0, 0, 0, 0, implicit $exec
     $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71 = V_SWMMAC_F32_16X16X64_BF16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71, killed $vgpr88, 0, 0, 0, 0, 0, implicit $exec
 ...
@@ -1087,6 +2080,19 @@ body: |
     ; GFX1250-NEXT: V_NOP_e32 implicit $exec
     ; GFX1250-NEXT: V_NOP_e32 implicit $exec
     ; GFX1250-NEXT: early-clobber $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71 = V_SWMMAC_F32_16X16X64_BF16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71, killed $vgpr24, 0, 0, 0, 0, 0, implicit $exec
+    ;
+    ; GFX1251-LABEL: name: test_swmmac_f32_16x16x64_bf16_D0_overlaps_Index1
+    ; GFX1251: early-clobber $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = V_SWMMAC_F32_16X16X64_BF16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr88, 0, 0, 0, 0, 0, implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: early-clobber $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71 = V_SWMMAC_F32_16X16X64_BF16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71, killed $vgpr24, 0, 0, 0, 0, 0, implicit $exec
     $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = V_SWMMAC_F32_16X16X64_BF16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr88, 0, 0, 0, 0, 0, implicit $exec
     $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71 = V_SWMMAC_F32_16X16X64_BF16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71, killed $vgpr24, 0, 0, 0, 0, 0, implicit $exec
 ...
@@ -1101,6 +2107,19 @@ body: |
     ; GFX1250-NEXT: V_NOP_e32 implicit $exec
     ; GFX1250-NEXT: V_NOP_e32 implicit $exec
     ; GFX1250-NEXT: early-clobber $vgpr32_vgpr33_vgpr34_vgpr35 = V_SWMMAC_BF16_16X16X64_BF16_w32_twoaddr 8, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr32_vgpr33_vgpr34_vgpr35, killed $vgpr88, 0, 0, 0, 0, 0, implicit $exec
+    ;
+    ; GFX1251-LABEL: name: test_swmmac_bf16_16x16x64_bf16_D0_overlaps_A1
+    ; GFX1251: early-clobber $vgpr24_vgpr25_vgpr26_vgpr27 = V_SWMMAC_BF16_16X16X64_BF16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27, killed $vgpr88, 0, 0, 0, 0, 0, implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: early-clobber $vgpr32_vgpr33_vgpr34_vgpr35 = V_SWMMAC_BF16_16X16X64_BF16_w32_twoaddr 8, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr32_vgpr33_vgpr34_vgpr35, killed $vgpr88, 0, 0, 0, 0, 0, implicit $exec
     $vgpr24_vgpr25_vgpr26_vgpr27 = V_SWMMAC_BF16_16X16X64_BF16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27, killed $vgpr88, 0, 0, 0, 0, 0, implicit $exec
     $vgpr32_vgpr33_vgpr34_vgpr35 = V_SWMMAC_BF16_16X16X64_BF16_w32_twoaddr 8, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr32_vgpr33_vgpr34_vgpr35, killed $vgpr88, 0, 0, 0, 0, 0, implicit $exec
 ...
@@ -1115,6 +2134,19 @@ body: |
     ; GFX1250-NEXT: V_NOP_e32 implicit $exec
     ; GFX1250-NEXT: V_NOP_e32 implicit $exec
     ; GFX1250-NEXT: early-clobber $vgpr32_vgpr33_vgpr34_vgpr35 = V_SWMMAC_BF16_16X16X64_BF16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27, killed $vgpr32_vgpr33_vgpr34_vgpr35, killed $vgpr88, 0, 0, 0, 0, 0, implicit $exec
+    ;
+    ; GFX1251-LABEL: name: test_swmmac_bf16_16x16x64_bf16_D0_overlaps_B1
+    ; GFX1251: early-clobber $vgpr24_vgpr25_vgpr26_vgpr27 = V_SWMMAC_BF16_16X16X64_BF16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27, killed $vgpr88, 0, 0, 0, 0, 0, implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: early-clobber $vgpr32_vgpr33_vgpr34_vgpr35 = V_SWMMAC_BF16_16X16X64_BF16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27, killed $vgpr32_vgpr33_vgpr34_vgpr35, killed $vgpr88, 0, 0, 0, 0, 0, implicit $exec
     $vgpr24_vgpr25_vgpr26_vgpr27 = V_SWMMAC_BF16_16X16X64_BF16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27, killed $vgpr88, 0, 0, 0, 0, 0, implicit $exec
     $vgpr32_vgpr33_vgpr34_vgpr35 = V_SWMMAC_BF16_16X16X64_BF16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27, killed $vgpr32_vgpr33_vgpr34_vgpr35, killed $vgpr88, 0, 0, 0, 0, 0, implicit $exec
 ...
@@ -1129,6 +2161,19 @@ body: |
     ; GFX1250-NEXT: V_NOP_e32 implicit $exec
     ; GFX1250-NEXT: V_NOP_e32 implicit $exec
     ; GFX1250-NEXT: early-clobber $vgpr32_vgpr33_vgpr34_vgpr35 = V_SWMMAC_BF16_16X16X64_BF16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr32_vgpr33_vgpr34_vgpr35, killed $vgpr24, 0, 0, 0, 0, 0, implicit $exec
+    ;
+    ; GFX1251-LABEL: name: test_swmmac_bf16_16x16x64_bf16_D0_overlaps_Index1
+    ; GFX1251: early-clobber $vgpr24_vgpr25_vgpr26_vgpr27 = V_SWMMAC_BF16_16X16X64_BF16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27, killed $vgpr88, 0, 0, 0, 0, 0, implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: early-clobber $vgpr32_vgpr33_vgpr34_vgpr35 = V_SWMMAC_BF16_16X16X64_BF16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr32_vgpr33_vgpr34_vgpr35, killed $vgpr24, 0, 0, 0, 0, 0, implicit $exec
     $vgpr24_vgpr25_vgpr26_vgpr27 = V_SWMMAC_BF16_16X16X64_BF16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27, killed $vgpr88, 0, 0, 0, 0, 0, implicit $exec
     $vgpr32_vgpr33_vgpr34_vgpr35 = V_SWMMAC_BF16_16X16X64_BF16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr32_vgpr33_vgpr34_vgpr35, killed $vgpr24, 0, 0, 0, 0, 0, implicit $exec
 ...
@@ -1143,6 +2188,19 @@ body: |
     ; GFX1250-NEXT: V_NOP_e32 implicit $exec
     ; GFX1250-NEXT: V_NOP_e32 implicit $exec
     ; GFX1250-NEXT: early-clobber $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55 = V_SWMMAC_BF16F32_16X16X64_BF16_w32_twoaddr 8, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55, killed $vgpr88, 0, 0, 0, 0, 0, implicit $exec
+    ;
+    ; GFX1251-LABEL: name: test_swmmac_bf16f32_16x16x64_bf16_D0_overlaps_A1
+    ; GFX1251: early-clobber $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = V_SWMMAC_BF16F32_16X16X64_BF16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr88, 0, 0, 0, 0, 0, implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: early-clobber $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55 = V_SWMMAC_BF16F32_16X16X64_BF16_w32_twoaddr 8, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55, killed $vgpr88, 0, 0, 0, 0, 0, implicit $exec
     $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = V_SWMMAC_BF16F32_16X16X64_BF16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr88, 0, 0, 0, 0, 0, implicit $exec
     $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55 = V_SWMMAC_BF16F32_16X16X64_BF16_w32_twoaddr 8, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55, killed $vgpr88, 0, 0, 0, 0, 0, implicit $exec
 ...
@@ -1157,6 +2215,19 @@ body: |
     ; GFX1250-NEXT: V_NOP_e32 implicit $exec
     ; GFX1250-NEXT: V_NOP_e32 implicit $exec
     ; GFX1250-NEXT: early-clobber $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55 = V_SWMMAC_BF16F32_16X16X64_BF16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55, killed $vgpr88, 0, 0, 0, 0, 0, implicit $exec
+    ;
+    ; GFX1251-LABEL: name: test_swmmac_bf16f32_16x16x64_bf16_D0_overlaps_B1
+    ; GFX1251: early-clobber $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = V_SWMMAC_BF16F32_16X16X64_BF16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr88, 0, 0, 0, 0, 0, implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: early-clobber $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55 = V_SWMMAC_BF16F32_16X16X64_BF16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55, killed $vgpr88, 0, 0, 0, 0, 0, implicit $exec
     $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = V_SWMMAC_BF16F32_16X16X64_BF16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr88, 0, 0, 0, 0, 0, implicit $exec
     $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55 = V_SWMMAC_BF16F32_16X16X64_BF16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55, killed $vgpr88, 0, 0, 0, 0, 0, implicit $exec
 ...
@@ -1171,6 +2242,19 @@ body: |
     ; GFX1250-NEXT: V_NOP_e32 implicit $exec
     ; GFX1250-NEXT: V_NOP_e32 implicit $exec
     ; GFX1250-NEXT: early-clobber $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55 = V_SWMMAC_BF16F32_16X16X64_BF16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55, killed $vgpr24, 0, 0, 0, 0, 0, implicit $exec
+    ;
+    ; GFX1251-LABEL: name: test_swmmac_bf16f32_16x16x64_bf16_D0_overlaps_Index1
+    ; GFX1251: early-clobber $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = V_SWMMAC_BF16F32_16X16X64_BF16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr88, 0, 0, 0, 0, 0, implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: early-clobber $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55 = V_SWMMAC_BF16F32_16X16X64_BF16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55, killed $vgpr24, 0, 0, 0, 0, 0, implicit $exec
     $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = V_SWMMAC_BF16F32_16X16X64_BF16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr88, 0, 0, 0, 0, 0, implicit $exec
     $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55 = V_SWMMAC_BF16F32_16X16X64_BF16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55, killed $vgpr24, 0, 0, 0, 0, 0, implicit $exec
 ...
@@ -1185,6 +2269,19 @@ body: |
     ; GFX1250-NEXT: V_NOP_e32 implicit $exec
     ; GFX1250-NEXT: V_NOP_e32 implicit $exec
     ; GFX1250-NEXT: early-clobber $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_SWMMAC_F32_16X16X128_FP8_FP8_w32_twoaddr killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr90_vgpr91, 0, 0, 0, implicit $exec
+    ;
+    ; GFX1251-LABEL: name: test_swmmac_f32_16x16x128_fp8_fp8_D0_overlaps_A1
+    ; GFX1251: early-clobber $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = V_SWMMAC_F32_16X16X128_FP8_FP8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr88_vgpr89, 0, 0, 0, implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: early-clobber $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_SWMMAC_F32_16X16X128_FP8_FP8_w32_twoaddr killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr90_vgpr91, 0, 0, 0, implicit $exec
     $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = V_SWMMAC_F32_16X16X128_FP8_FP8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr88_vgpr89, 0, 0, 0, implicit $exec
     $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_SWMMAC_F32_16X16X128_FP8_FP8_w32_twoaddr killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr90_vgpr91, 0, 0, 0, implicit $exec
 ...
@@ -1199,6 +2296,19 @@ body: |
     ; GFX1250-NEXT: V_NOP_e32 implicit $exec
     ; GFX1250-NEXT: V_NOP_e32 implicit $exec
     ; GFX1250-NEXT: early-clobber $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_SWMMAC_F32_16X16X128_FP8_FP8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr90_vgpr91, 0, 0, 0, implicit $exec
+    ;
+    ; GFX1251-LABEL: name: test_swmmac_f32_16x16x128_fp8_fp8_D0_overlaps_B1
+    ; GFX1251: early-clobber $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = V_SWMMAC_F32_16X16X128_FP8_FP8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr88_vgpr89, 0, 0, 0, implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: early-clobber $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_SWMMAC_F32_16X16X128_FP8_FP8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr90_vgpr91, 0, 0, 0, implicit $exec
     $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = V_SWMMAC_F32_16X16X128_FP8_FP8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr88_vgpr89, 0, 0, 0, implicit $exec
     $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_SWMMAC_F32_16X16X128_FP8_FP8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr90_vgpr91, 0, 0, 0, implicit $exec
 ...
@@ -1213,6 +2323,19 @@ body: |
     ; GFX1250-NEXT: V_NOP_e32 implicit $exec
     ; GFX1250-NEXT: V_NOP_e32 implicit $exec
     ; GFX1250-NEXT: early-clobber $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_SWMMAC_F32_16X16X128_FP8_FP8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr24_vgpr25, 0, 0, 0, implicit $exec
+    ;
+    ; GFX1251-LABEL: name: test_swmmac_f32_16x16x128_fp8_fp8_D0_overlaps_Index1
+    ; GFX1251: early-clobber $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = V_SWMMAC_F32_16X16X128_FP8_FP8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr88_vgpr89, 0, 0, 0, implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: early-clobber $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_SWMMAC_F32_16X16X128_FP8_FP8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr24_vgpr25, 0, 0, 0, implicit $exec
     $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = V_SWMMAC_F32_16X16X128_FP8_FP8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr88_vgpr89, 0, 0, 0, implicit $exec
     $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_SWMMAC_F32_16X16X128_FP8_FP8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr24_vgpr25, 0, 0, 0, implicit $exec
 ...
@@ -1227,6 +2350,19 @@ body: |
     ; GFX1250-NEXT: V_NOP_e32 implicit $exec
     ; GFX1250-NEXT: V_NOP_e32 implicit $exec
     ; GFX1250-NEXT: early-clobber $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_SWMMAC_F32_16X16X128_FP8_BF8_w32_twoaddr killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr90_vgpr91, 0, 0, 0, implicit $exec
+    ;
+    ; GFX1251-LABEL: name: test_swmmac_f32_16x16x128_fp8_bf8_D0_overlaps_A1
+    ; GFX1251: early-clobber $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = V_SWMMAC_F32_16X16X128_FP8_BF8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr88_vgpr89, 0, 0, 0, implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: early-clobber $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_SWMMAC_F32_16X16X128_FP8_BF8_w32_twoaddr killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr90_vgpr91, 0, 0, 0, implicit $exec
     $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = V_SWMMAC_F32_16X16X128_FP8_BF8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr88_vgpr89, 0, 0, 0, implicit $exec
     $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_SWMMAC_F32_16X16X128_FP8_BF8_w32_twoaddr killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr90_vgpr91, 0, 0, 0, implicit $exec
 ...
@@ -1241,6 +2377,19 @@ body: |
     ; GFX1250-NEXT: V_NOP_e32 implicit $exec
     ; GFX1250-NEXT: V_NOP_e32 implicit $exec
     ; GFX1250-NEXT: early-clobber $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_SWMMAC_F32_16X16X128_FP8_BF8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr90_vgpr91, 0, 0, 0, implicit $exec
+    ;
+    ; GFX1251-LABEL: name: test_swmmac_f32_16x16x128_fp8_bf8_D0_overlaps_B1
+    ; GFX1251: early-clobber $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = V_SWMMAC_F32_16X16X128_FP8_BF8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr88_vgpr89, 0, 0, 0, implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: early-clobber $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_SWMMAC_F32_16X16X128_FP8_BF8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr90_vgpr91, 0, 0, 0, implicit $exec
     $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = V_SWMMAC_F32_16X16X128_FP8_BF8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr88_vgpr89, 0, 0, 0, implicit $exec
     $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_SWMMAC_F32_16X16X128_FP8_BF8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr90_vgpr91, 0, 0, 0, implicit $exec
 ...
@@ -1255,6 +2404,19 @@ body: |
     ; GFX1250-NEXT: V_NOP_e32 implicit $exec
     ; GFX1250-NEXT: V_NOP_e32 implicit $exec
     ; GFX1250-NEXT: early-clobber $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_SWMMAC_F32_16X16X128_FP8_BF8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr24_vgpr25, 0, 0, 0, implicit $exec
+    ;
+    ; GFX1251-LABEL: name: test_swmmac_f32_16x16x128_fp8_bf8_D0_overlaps_Index1
+    ; GFX1251: early-clobber $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = V_SWMMAC_F32_16X16X128_FP8_BF8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr88_vgpr89, 0, 0, 0, implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: early-clobber $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_SWMMAC_F32_16X16X128_FP8_BF8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr24_vgpr25, 0, 0, 0, implicit $exec
     $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = V_SWMMAC_F32_16X16X128_FP8_BF8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr88_vgpr89, 0, 0, 0, implicit $exec
     $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_SWMMAC_F32_16X16X128_FP8_BF8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr24_vgpr25, 0, 0, 0, implicit $exec
 ...
@@ -1269,6 +2431,19 @@ body: |
     ; GFX1250-NEXT: V_NOP_e32 implicit $exec
     ; GFX1250-NEXT: V_NOP_e32 implicit $exec
     ; GFX1250-NEXT: early-clobber $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_SWMMAC_F32_16X16X128_BF8_FP8_w32_twoaddr killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr90_vgpr91, 0, 0, 0, implicit $exec
+    ;
+    ; GFX1251-LABEL: name: test_swmmac_f32_16x16x128_bf8_fp8_D0_overlaps_A1
+    ; GFX1251: early-clobber $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = V_SWMMAC_F32_16X16X128_BF8_FP8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr88_vgpr89, 0, 0, 0, implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: early-clobber $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_SWMMAC_F32_16X16X128_BF8_FP8_w32_twoaddr killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr90_vgpr91, 0, 0, 0, implicit $exec
     $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = V_SWMMAC_F32_16X16X128_BF8_FP8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr88_vgpr89, 0, 0, 0, implicit $exec
     $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_SWMMAC_F32_16X16X128_BF8_FP8_w32_twoaddr killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr90_vgpr91, 0, 0, 0, implicit $exec
 ...
@@ -1283,6 +2458,19 @@ body: |
     ; GFX1250-NEXT: V_NOP_e32 implicit $exec
     ; GFX1250-NEXT: V_NOP_e32 implicit $exec
     ; GFX1250-NEXT: early-clobber $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_SWMMAC_F32_16X16X128_BF8_FP8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr90_vgpr91, 0, 0, 0, implicit $exec
+    ;
+    ; GFX1251-LABEL: name: test_swmmac_f32_16x16x128_bf8_fp8_D0_overlaps_B1
+    ; GFX1251: early-clobber $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = V_SWMMAC_F32_16X16X128_BF8_FP8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr88_vgpr89, 0, 0, 0, implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: early-clobber $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_SWMMAC_F32_16X16X128_BF8_FP8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr90_vgpr91, 0, 0, 0, implicit $exec
     $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = V_SWMMAC_F32_16X16X128_BF8_FP8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr88_vgpr89, 0, 0, 0, implicit $exec
     $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_SWMMAC_F32_16X16X128_BF8_FP8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr90_vgpr91, 0, 0, 0, implicit $exec
 ...
@@ -1297,6 +2485,19 @@ body: |
     ; GFX1250-NEXT: V_NOP_e32 implicit $exec
     ; GFX1250-NEXT: V_NOP_e32 implicit $exec
     ; GFX1250-NEXT: early-clobber $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_SWMMAC_F32_16X16X128_BF8_FP8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr24_vgpr25, 0, 0, 0, implicit $exec
+    ;
+    ; GFX1251-LABEL: name: test_swmmac_f32_16x16x128_bf8_fp8_D0_overlaps_Index1
+    ; GFX1251: early-clobber $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = V_SWMMAC_F32_16X16X128_BF8_FP8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr88_vgpr89, 0, 0, 0, implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: early-clobber $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_SWMMAC_F32_16X16X128_BF8_FP8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr24_vgpr25, 0, 0, 0, implicit $exec
     $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = V_SWMMAC_F32_16X16X128_BF8_FP8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr88_vgpr89, 0, 0, 0, implicit $exec
     $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_SWMMAC_F32_16X16X128_BF8_FP8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr24_vgpr25, 0, 0, 0, implicit $exec
 ...
@@ -1311,6 +2512,19 @@ body: |
     ; GFX1250-NEXT: V_NOP_e32 implicit $exec
     ; GFX1250-NEXT: V_NOP_e32 implicit $exec
     ; GFX1250-NEXT: early-clobber $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_SWMMAC_F32_16X16X128_BF8_BF8_w32_twoaddr killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr90_vgpr91, 0, 0, 0, implicit $exec
+    ;
+    ; GFX1251-LABEL: name: test_swmmac_f32_16x16x128_bf8_bf8_D0_overlaps_A1
+    ; GFX1251: early-clobber $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = V_SWMMAC_F32_16X16X128_BF8_BF8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr88_vgpr89, 0, 0, 0, implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: early-clobber $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_SWMMAC_F32_16X16X128_BF8_BF8_w32_twoaddr killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr90_vgpr91, 0, 0, 0, implicit $exec
     $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = V_SWMMAC_F32_16X16X128_BF8_BF8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr88_vgpr89, 0, 0, 0, implicit $exec
     $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_SWMMAC_F32_16X16X128_BF8_BF8_w32_twoaddr killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr90_vgpr91, 0, 0, 0, implicit $exec
 ...
@@ -1325,6 +2539,19 @@ body: |
     ; GFX1250-NEXT: V_NOP_e32 implicit $exec
     ; GFX1250-NEXT: V_NOP_e32 implicit $exec
     ; GFX1250-NEXT: early-clobber $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_SWMMAC_F32_16X16X128_BF8_BF8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr90_vgpr91, 0, 0, 0, implicit $exec
+    ;
+    ; GFX1251-LABEL: name: test_swmmac_f32_16x16x128_bf8_bf8_D0_overlaps_B1
+    ; GFX1251: early-clobber $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = V_SWMMAC_F32_16X16X128_BF8_BF8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr88_vgpr89, 0, 0, 0, implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: early-clobber $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_SWMMAC_F32_16X16X128_BF8_BF8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr90_vgpr91, 0, 0, 0, implicit $exec
     $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = V_SWMMAC_F32_16X16X128_BF8_BF8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr88_vgpr89, 0, 0, 0, implicit $exec
     $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_SWMMAC_F32_16X16X128_BF8_BF8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr90_vgpr91, 0, 0, 0, implicit $exec
 ...
@@ -1339,6 +2566,19 @@ body: |
     ; GFX1250-NEXT: V_NOP_e32 implicit $exec
     ; GFX1250-NEXT: V_NOP_e32 implicit $exec
     ; GFX1250-NEXT: early-clobber $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_SWMMAC_F32_16X16X128_BF8_BF8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr24_vgpr25, 0, 0, 0, implicit $exec
+    ;
+    ; GFX1251-LABEL: name: test_swmmac_f32_16x16x128_bf8_bf8_D0_overlaps_Index1
+    ; GFX1251: early-clobber $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = V_SWMMAC_F32_16X16X128_BF8_BF8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr88_vgpr89, 0, 0, 0, implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: early-clobber $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_SWMMAC_F32_16X16X128_BF8_BF8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr24_vgpr25, 0, 0, 0, implicit $exec
     $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = V_SWMMAC_F32_16X16X128_BF8_BF8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr88_vgpr89, 0, 0, 0, implicit $exec
     $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_SWMMAC_F32_16X16X128_BF8_BF8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr24_vgpr25, 0, 0, 0, implicit $exec
 ...
@@ -1353,6 +2593,19 @@ body: |
     ; GFX1250-NEXT: V_NOP_e32 implicit $exec
     ; GFX1250-NEXT: V_NOP_e32 implicit $exec
     ; GFX1250-NEXT: early-clobber $vgpr48_vgpr49_vgpr50_vgpr51 = V_SWMMAC_F16_16X16X128_FP8_FP8_w32_twoaddr killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, killed $vgpr48_vgpr49_vgpr50_vgpr51, killed $vgpr90_vgpr91, 0, 0, 0, implicit $exec
+    ;
+    ; GFX1251-LABEL: name: test_swmmac_f16_16x16x128_fp8_fp8_D0_overlaps_A1
+    ; GFX1251: early-clobber $vgpr24_vgpr25_vgpr26_vgpr27 = V_SWMMAC_F16_16X16X128_FP8_FP8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27, killed $vgpr88_vgpr89, 0, 0, 0, implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: early-clobber $vgpr48_vgpr49_vgpr50_vgpr51 = V_SWMMAC_F16_16X16X128_FP8_FP8_w32_twoaddr killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, killed $vgpr48_vgpr49_vgpr50_vgpr51, killed $vgpr90_vgpr91, 0, 0, 0, implicit $exec
     $vgpr24_vgpr25_vgpr26_vgpr27 = V_SWMMAC_F16_16X16X128_FP8_FP8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27, killed $vgpr88_vgpr89, 0, 0, 0, implicit $exec
     $vgpr48_vgpr49_vgpr50_vgpr51 = V_SWMMAC_F16_16X16X128_FP8_FP8_w32_twoaddr killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, killed $vgpr48_vgpr49_vgpr50_vgpr51, killed $vgpr90_vgpr91, 0, 0, 0, implicit $exec
 ...
@@ -1367,6 +2620,19 @@ body: |
     ; GFX1250-NEXT: V_NOP_e32 implicit $exec
     ; GFX1250-NEXT: V_NOP_e32 implicit $exec
     ; GFX1250-NEXT: early-clobber $vgpr48_vgpr49_vgpr50_vgpr51 = V_SWMMAC_F16_16X16X128_FP8_FP8_w32_twoaddr killed $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr48_vgpr49_vgpr50_vgpr51, killed $vgpr90_vgpr91, 0, 0, 0, implicit $exec
+    ;
+    ; GFX1251-LABEL: name: test_swmmac_f16_16x16x128_fp8_fp8_D0_overlaps_B1
+    ; GFX1251: early-clobber $vgpr24_vgpr25_vgpr26_vgpr27 = V_SWMMAC_F16_16X16X128_FP8_FP8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27, killed $vgpr88_vgpr89, 0, 0, 0, implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: early-clobber $vgpr48_vgpr49_vgpr50_vgpr51 = V_SWMMAC_F16_16X16X128_FP8_FP8_w32_twoaddr killed $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr48_vgpr49_vgpr50_vgpr51, killed $vgpr90_vgpr91, 0, 0, 0, implicit $exec
     $vgpr24_vgpr25_vgpr26_vgpr27 = V_SWMMAC_F16_16X16X128_FP8_FP8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27, killed $vgpr88_vgpr89, 0, 0, 0, implicit $exec
 
     $vgpr48_vgpr49_vgpr50_vgpr51 = V_SWMMAC_F16_16X16X128_FP8_FP8_w32_twoaddr killed $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr48_vgpr49_vgpr50_vgpr51, killed $vgpr90_vgpr91, 0, 0, 0, implicit $exec
@@ -1382,6 +2648,19 @@ body: |
     ; GFX1250-NEXT: V_NOP_e32 implicit $exec
     ; GFX1250-NEXT: V_NOP_e32 implicit $exec
     ; GFX1250-NEXT: early-clobber $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63 = V_SWMMAC_F32_16X16X128_FP8_FP8_w32_twoaddr killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55, killed $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, killed $vgpr24_vgpr25, 0, 0, 0, implicit $exec
+    ;
+    ; GFX1251-LABEL: name: test_swmmac_f16_16x16x128_fp8_fp8_D0_overlaps_Index1
+    ; GFX1251: early-clobber $vgpr24_vgpr25_vgpr26_vgpr27 = V_SWMMAC_F16_16X16X128_FP8_FP8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27, killed $vgpr88_vgpr89, 0, 0, 0, implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: early-clobber $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63 = V_SWMMAC_F32_16X16X128_FP8_FP8_w32_twoaddr killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55, killed $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, killed $vgpr24_vgpr25, 0, 0, 0, implicit $exec
     $vgpr24_vgpr25_vgpr26_vgpr27 = V_SWMMAC_F16_16X16X128_FP8_FP8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27, killed $vgpr88_vgpr89, 0, 0, 0, implicit $exec
    $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63 = V_SWMMAC_F32_16X16X128_FP8_FP8_w32_twoaddr killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55, killed $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, killed $vgpr24_vgpr25, 0, 0, 0, implicit $exec
 ...
@@ -1396,6 +2675,19 @@ body: |
     ; GFX1250-NEXT: V_NOP_e32 implicit $exec
     ; GFX1250-NEXT: V_NOP_e32 implicit $exec
     ; GFX1250-NEXT: early-clobber $vgpr48_vgpr49_vgpr50_vgpr51 = V_SWMMAC_F16_16X16X128_FP8_BF8_w32_twoaddr killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, killed $vgpr48_vgpr49_vgpr50_vgpr51, killed $vgpr90_vgpr91, 0, 0, 0, implicit $exec
+    ;
+    ; GFX1251-LABEL: name: test_swmmac_f16_16x16x128_fp8_bf8_D0_overlaps_A1
+    ; GFX1251: early-clobber $vgpr24_vgpr25_vgpr26_vgpr27 = V_SWMMAC_F16_16X16X128_FP8_BF8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27, killed $vgpr88_vgpr89, 0, 0, 0, implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: early-clobber $vgpr48_vgpr49_vgpr50_vgpr51 = V_SWMMAC_F16_16X16X128_FP8_BF8_w32_twoaddr killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, killed $vgpr48_vgpr49_vgpr50_vgpr51, killed $vgpr90_vgpr91, 0, 0, 0, implicit $exec
     $vgpr24_vgpr25_vgpr26_vgpr27 = V_SWMMAC_F16_16X16X128_FP8_BF8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27, killed $vgpr88_vgpr89, 0, 0, 0, implicit $exec
     $vgpr48_vgpr49_vgpr50_vgpr51 = V_SWMMAC_F16_16X16X128_FP8_BF8_w32_twoaddr killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, killed $vgpr48_vgpr49_vgpr50_vgpr51, killed $vgpr90_vgpr91, 0, 0, 0, implicit $exec
 ...
@@ -1410,6 +2702,19 @@ body: |
     ; GFX1250-NEXT: V_NOP_e32 implicit $exec
     ; GFX1250-NEXT: V_NOP_e32 implicit $exec
     ; GFX1250-NEXT: early-clobber $vgpr48_vgpr49_vgpr50_vgpr51 = V_SWMMAC_F16_16X16X128_FP8_BF8_w32_twoaddr killed $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr48_vgpr49_vgpr50_vgpr51, killed $vgpr90_vgpr91, 0, 0, 0, implicit $exec
+    ;
+    ; GFX1251-LABEL: name: test_swmmac_f16_16x16x128_fp8_bf8_D0_overlaps_B1
+    ; GFX1251: early-clobber $vgpr24_vgpr25_vgpr26_vgpr27 = V_SWMMAC_F16_16X16X128_FP8_BF8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27, killed $vgpr88_vgpr89, 0, 0, 0, implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: early-clobber $vgpr48_vgpr49_vgpr50_vgpr51 = V_SWMMAC_F16_16X16X128_FP8_BF8_w32_twoaddr killed $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr48_vgpr49_vgpr50_vgpr51, killed $vgpr90_vgpr91, 0, 0, 0, implicit $exec
     $vgpr24_vgpr25_vgpr26_vgpr27 = V_SWMMAC_F16_16X16X128_FP8_BF8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27, killed $vgpr88_vgpr89, 0, 0, 0, implicit $exec
 
     $vgpr48_vgpr49_vgpr50_vgpr51 = V_SWMMAC_F16_16X16X128_FP8_BF8_w32_twoaddr killed $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr48_vgpr49_vgpr50_vgpr51, killed $vgpr90_vgpr91, 0, 0, 0, implicit $exec
@@ -1425,6 +2730,19 @@ body: |
     ; GFX1250-NEXT: V_NOP_e32 implicit $exec
     ; GFX1250-NEXT: V_NOP_e32 implicit $exec
     ; GFX1250-NEXT: early-clobber $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63 = V_SWMMAC_F32_16X16X128_FP8_BF8_w32_twoaddr killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55, killed $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, killed $vgpr24_vgpr25, 0, 0, 0, implicit $exec
+    ;
+    ; GFX1251-LABEL: name: test_swmmac_f16_16x16x128_fp8_bf8_D0_overlaps_Index1
+    ; GFX1251: early-clobber $vgpr24_vgpr25_vgpr26_vgpr27 = V_SWMMAC_F16_16X16X128_FP8_BF8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27, killed $vgpr88_vgpr89, 0, 0, 0, implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: early-clobber $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63 = V_SWMMAC_F32_16X16X128_FP8_BF8_w32_twoaddr killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55, killed $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, killed $vgpr24_vgpr25, 0, 0, 0, implicit $exec
     $vgpr24_vgpr25_vgpr26_vgpr27 = V_SWMMAC_F16_16X16X128_FP8_BF8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27, killed $vgpr88_vgpr89, 0, 0, 0, implicit $exec
    $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63 = V_SWMMAC_F32_16X16X128_FP8_BF8_w32_twoaddr killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55, killed $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, killed $vgpr24_vgpr25, 0, 0, 0, implicit $exec
 ...
@@ -1439,6 +2757,19 @@ body: |
     ; GFX1250-NEXT: V_NOP_e32 implicit $exec
     ; GFX1250-NEXT: V_NOP_e32 implicit $exec
     ; GFX1250-NEXT: early-clobber $vgpr48_vgpr49_vgpr50_vgpr51 = V_SWMMAC_F16_16X16X128_BF8_FP8_w32_twoaddr killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, killed $vgpr48_vgpr49_vgpr50_vgpr51, killed $vgpr90_vgpr91, 0, 0, 0, implicit $exec
+    ;
+    ; GFX1251-LABEL: name: test_swmmac_f16_16x16x128_bf8_fp8_D0_overlaps_A1
+    ; GFX1251: early-clobber $vgpr24_vgpr25_vgpr26_vgpr27 = V_SWMMAC_F16_16X16X128_BF8_FP8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27, killed $vgpr88_vgpr89, 0, 0, 0, implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: early-clobber $vgpr48_vgpr49_vgpr50_vgpr51 = V_SWMMAC_F16_16X16X128_BF8_FP8_w32_twoaddr killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, killed $vgpr48_vgpr49_vgpr50_vgpr51, killed $vgpr90_vgpr91, 0, 0, 0, implicit $exec
     $vgpr24_vgpr25_vgpr26_vgpr27 = V_SWMMAC_F16_16X16X128_BF8_FP8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27, killed $vgpr88_vgpr89, 0, 0, 0, implicit $exec
     $vgpr48_vgpr49_vgpr50_vgpr51 = V_SWMMAC_F16_16X16X128_BF8_FP8_w32_twoaddr killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, killed $vgpr48_vgpr49_vgpr50_vgpr51, killed $vgpr90_vgpr91, 0, 0, 0, implicit $exec
 ...
@@ -1453,6 +2784,19 @@ body: |
     ; GFX1250-NEXT: V_NOP_e32 implicit $exec
     ; GFX1250-NEXT: V_NOP_e32 implicit $exec
     ; GFX1250-NEXT: early-clobber $vgpr48_vgpr49_vgpr50_vgpr51 = V_SWMMAC_F16_16X16X128_BF8_FP8_w32_twoaddr killed $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr48_vgpr49_vgpr50_vgpr51, killed $vgpr90_vgpr91, 0, 0, 0, implicit $exec
+    ;
+    ; GFX1251-LABEL: name: test_swmmac_f16_16x16x128_bf8_fp8_D0_overlaps_B1
+    ; GFX1251: early-clobber $vgpr24_vgpr25_vgpr26_vgpr27 = V_SWMMAC_F16_16X16X128_BF8_FP8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27, killed $vgpr88_vgpr89, 0, 0, 0, implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: early-clobber $vgpr48_vgpr49_vgpr50_vgpr51 = V_SWMMAC_F16_16X16X128_BF8_FP8_w32_twoaddr killed $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr48_vgpr49_vgpr50_vgpr51, killed $vgpr90_vgpr91, 0, 0, 0, implicit $exec
     $vgpr24_vgpr25_vgpr26_vgpr27 = V_SWMMAC_F16_16X16X128_BF8_FP8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27, killed $vgpr88_vgpr89, 0, 0, 0, implicit $exec
     $vgpr48_vgpr49_vgpr50_vgpr51 = V_SWMMAC_F16_16X16X128_BF8_FP8_w32_twoaddr killed $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr48_vgpr49_vgpr50_vgpr51, killed $vgpr90_vgpr91, 0, 0, 0, implicit $exec
 ...
@@ -1467,6 +2811,19 @@ body: |
     ; GFX1250-NEXT: V_NOP_e32 implicit $exec
     ; GFX1250-NEXT: V_NOP_e32 implicit $exec
     ; GFX1250-NEXT: early-clobber $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63 = V_SWMMAC_F32_16X16X128_BF8_FP8_w32_twoaddr killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55, killed $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, killed $vgpr24_vgpr25, 0, 0, 0, implicit $exec
+    ;
+    ; GFX1251-LABEL: name: test_swmmac_f16_16x16x128_bf8_fp8_D0_overlaps_Index1
+    ; GFX1251: early-clobber $vgpr24_vgpr25_vgpr26_vgpr27 = V_SWMMAC_F16_16X16X128_BF8_FP8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27, killed $vgpr88_vgpr89, 0, 0, 0, implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: early-clobber $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63 = V_SWMMAC_F32_16X16X128_BF8_FP8_w32_twoaddr killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55, killed $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, killed $vgpr24_vgpr25, 0, 0, 0, implicit $exec
     $vgpr24_vgpr25_vgpr26_vgpr27 = V_SWMMAC_F16_16X16X128_BF8_FP8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27, killed $vgpr88_vgpr89, 0, 0, 0, implicit $exec
    $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63 = V_SWMMAC_F32_16X16X128_BF8_FP8_w32_twoaddr killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55, killed $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, killed $vgpr24_vgpr25, 0, 0, 0, implicit $exec
 ...
@@ -1481,6 +2838,19 @@ body: |
     ; GFX1250-NEXT: V_NOP_e32 implicit $exec
     ; GFX1250-NEXT: V_NOP_e32 implicit $exec
     ; GFX1250-NEXT: early-clobber $vgpr48_vgpr49_vgpr50_vgpr51 = V_SWMMAC_F16_16X16X128_BF8_BF8_w32_twoaddr killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, killed $vgpr48_vgpr49_vgpr50_vgpr51, killed $vgpr90_vgpr91, 0, 0, 0, implicit $exec
+    ;
+    ; GFX1251-LABEL: name: test_swmmac_f16_16x16x128_bf8_bf8_D0_overlaps_A1
+    ; GFX1251: early-clobber $vgpr24_vgpr25_vgpr26_vgpr27 = V_SWMMAC_F16_16X16X128_BF8_BF8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27, killed $vgpr88_vgpr89, 0, 0, 0, implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: early-clobber $vgpr48_vgpr49_vgpr50_vgpr51 = V_SWMMAC_F16_16X16X128_BF8_BF8_w32_twoaddr killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, killed $vgpr48_vgpr49_vgpr50_vgpr51, killed $vgpr90_vgpr91, 0, 0, 0, implicit $exec
     $vgpr24_vgpr25_vgpr26_vgpr27 = V_SWMMAC_F16_16X16X128_BF8_BF8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27, killed $vgpr88_vgpr89, 0, 0, 0, implicit $exec
     $vgpr48_vgpr49_vgpr50_vgpr51 = V_SWMMAC_F16_16X16X128_BF8_BF8_w32_twoaddr killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, killed $vgpr48_vgpr49_vgpr50_vgpr51, killed $vgpr90_vgpr91, 0, 0, 0, implicit $exec
 ...
@@ -1495,6 +2865,19 @@ body: |
     ; GFX1250-NEXT: V_NOP_e32 implicit $exec
     ; GFX1250-NEXT: V_NOP_e32 implicit $exec
     ; GFX1250-NEXT: early-clobber $vgpr48_vgpr49_vgpr50_vgpr51 = V_SWMMAC_F16_16X16X128_BF8_BF8_w32_twoaddr killed $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr48_vgpr49_vgpr50_vgpr51, killed $vgpr90_vgpr91, 0, 0, 0, implicit $exec
+    ;
+    ; GFX1251-LABEL: name: test_swmmac_f16_16x16x128_bf8_bf8_D0_overlaps_B1
+    ; GFX1251: early-clobber $vgpr24_vgpr25_vgpr26_vgpr27 = V_SWMMAC_F16_16X16X128_BF8_BF8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27, killed $vgpr88_vgpr89, 0, 0, 0, implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: early-clobber $vgpr48_vgpr49_vgpr50_vgpr51 = V_SWMMAC_F16_16X16X128_BF8_BF8_w32_twoaddr killed $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr48_vgpr49_vgpr50_vgpr51, killed $vgpr90_vgpr91, 0, 0, 0, implicit $exec
     $vgpr24_vgpr25_vgpr26_vgpr27 = V_SWMMAC_F16_16X16X128_BF8_BF8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27, killed $vgpr88_vgpr89, 0, 0, 0, implicit $exec
     $vgpr48_vgpr49_vgpr50_vgpr51 = V_SWMMAC_F16_16X16X128_BF8_BF8_w32_twoaddr killed $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr48_vgpr49_vgpr50_vgpr51, killed $vgpr90_vgpr91, 0, 0, 0, implicit $exec
 ...
@@ -1509,6 +2892,19 @@ body: |
     ; GFX1250-NEXT: V_NOP_e32 implicit $exec
     ; GFX1250-NEXT: V_NOP_e32 implicit $exec
     ; GFX1250-NEXT: early-clobber $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63 = V_SWMMAC_F32_16X16X128_BF8_BF8_w32_twoaddr killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55, killed $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, killed $vgpr24_vgpr25, 0, 0, 0, implicit $exec
+    ;
+    ; GFX1251-LABEL: name: test_swmmac_f16_16x16x128_bf8_bf8_D0_overlaps_Index1
+    ; GFX1251: early-clobber $vgpr24_vgpr25_vgpr26_vgpr27 = V_SWMMAC_F16_16X16X128_BF8_BF8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27, killed $vgpr88_vgpr89, 0, 0, 0, implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: early-clobber $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63 = V_SWMMAC_F32_16X16X128_BF8_BF8_w32_twoaddr killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55, killed $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, killed $vgpr24_vgpr25, 0, 0, 0, implicit $exec
     $vgpr24_vgpr25_vgpr26_vgpr27 = V_SWMMAC_F16_16X16X128_BF8_BF8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27, killed $vgpr88_vgpr89, 0, 0, 0, implicit $exec
    $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63 = V_SWMMAC_F32_16X16X128_BF8_BF8_w32_twoaddr killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55, killed $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, killed $vgpr24_vgpr25, 0, 0, 0, implicit $exec
 ...
@@ -1525,6 +2921,27 @@ body: |
     ; GFX1250-NEXT: V_NOP_e32 implicit $exec
     ; GFX1250-NEXT: V_NOP_e32 implicit $exec
     ; GFX1250-NEXT: early-clobber $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55 = V_SWMMAC_I32_16X16X128_IU8_w32_twoaddr 8, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, killed $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55, killed $vgpr90_vgpr91, 0, 0, 0, 0, 0, implicit $exec
+    ;
+    ; GFX1251-LABEL: name: test_swmmac_i32_16x16x128_iu8_D0_overlaps_A1
+    ; GFX1251: early-clobber $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = V_SWMMAC_I32_16X16X128_IU8_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr88_vgpr89, 0, 0, 0, 0, 0, implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: early-clobber $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55 = V_SWMMAC_I32_16X16X128_IU8_w32_twoaddr 8, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, killed $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55, killed $vgpr90_vgpr91, 0, 0, 0, 0, 0, implicit $exec
     $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = V_SWMMAC_I32_16X16X128_IU8_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr88_vgpr89, 0, 0, 0, 0, 0, implicit $exec
     $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55 = V_SWMMAC_I32_16X16X128_IU8_w32_twoaddr 8, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, killed $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55, killed $vgpr90_vgpr91, 0, 0, 0, 0, 0, implicit $exec
 ...
@@ -1541,6 +2958,27 @@ body: |
     ; GFX1250-NEXT: V_NOP_e32 implicit $exec
     ; GFX1250-NEXT: V_NOP_e32 implicit $exec
     ; GFX1250-NEXT: early-clobber $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55 = V_SWMMAC_I32_16X16X128_IU8_w32_twoaddr 8, killed $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, 8, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55, killed $vgpr90_vgpr91, 0, 0, 0, 0, 0, implicit $exec
+    ;
+    ; GFX1251-LABEL: name: test_swmmac_i32_16x16x128_iu8_D0_overlaps_B1
+    ; GFX1251: early-clobber $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = V_SWMMAC_I32_16X16X128_IU8_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr88_vgpr89, 0, 0, 0, 0, 0, implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: early-clobber $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55 = V_SWMMAC_I32_16X16X128_IU8_w32_twoaddr 8, killed $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, 8, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55, killed $vgpr90_vgpr91, 0, 0, 0, 0, 0, implicit $exec
     $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = V_SWMMAC_I32_16X16X128_IU8_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr88_vgpr89, 0, 0, 0, 0, 0, implicit $exec
     $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55 = V_SWMMAC_I32_16X16X128_IU8_w32_twoaddr 8, killed $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, 8, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55, killed $vgpr90_vgpr91, 0, 0, 0, 0, 0, implicit $exec
 ...
@@ -1557,6 +2995,27 @@ body: |
     ; GFX1250-NEXT: V_NOP_e32 implicit $exec
     ; GFX1250-NEXT: V_NOP_e32 implicit $exec
     ; GFX1250-NEXT: early-clobber $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63 = V_SWMMAC_F32_16X16X128_FP8_FP8_w32_twoaddr killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55, killed $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, killed $vgpr24_vgpr25, 0, 0, 0, implicit $exec
+    ;
+    ; GFX1251-LABEL: name: test_swmmac_i32_16x16x128_iu8_D0_overlaps_Index1
+    ; GFX1251: early-clobber $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = V_SWMMAC_I32_16X16X128_IU8_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr88_vgpr89, 0, 0, 0, 0, 0, implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: early-clobber $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63 = V_SWMMAC_F32_16X16X128_FP8_FP8_w32_twoaddr killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55, killed $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, killed $vgpr24_vgpr25, 0, 0, 0, implicit $exec
     $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = V_SWMMAC_I32_16X16X128_IU8_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr88_vgpr89, 0, 0, 0, 0, 0, implicit $exec
     $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63 = V_SWMMAC_F32_16X16X128_FP8_FP8_w32_twoaddr killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55, killed $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, killed $vgpr24_vgpr25, 0, 0, 0, implicit $exec
 ...
@@ -1571,6 +3030,19 @@ body: |
     ; GFX1250-NEXT: V_NOP_e32 implicit $exec
     ; GFX1250-NEXT: V_NOP_e32 implicit $exec
     ; GFX1250-NEXT: early-clobber $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71 = V_SWMMAC_F32_16X16X64_F16_w32_twoaddr 8, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71, killed $vgpr88, 0, 0, 0, 0, 0, implicit $exec
+    ;
+    ; GFX1251-LABEL: name: test_swmmac_f32_16x16x64_f16_D0_overlaps_A1
+    ; GFX1251: early-clobber $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = V_SWMMAC_F32_16X16X64_F16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr88, 0, 0, 0, 0, 0, implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: early-clobber $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71 = V_SWMMAC_F32_16X16X64_F16_w32_twoaddr 8, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71, killed $vgpr88, 0, 0, 0, 0, 0, implicit $exec
     $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = V_SWMMAC_F32_16X16X64_F16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr88, 0, 0, 0, 0, 0, implicit $exec
     $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71 = V_SWMMAC_F32_16X16X64_F16_w32_twoaddr 8, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71, killed $vgpr88, 0, 0, 0, 0, 0, implicit $exec
 ...
@@ -1585,6 +3057,19 @@ body: |
     ; GFX1250-NEXT: V_NOP_e32 implicit $exec
     ; GFX1250-NEXT: V_NOP_e32 implicit $exec
     ; GFX1250-NEXT: early-clobber $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71 = V_SWMMAC_F32_16X16X64_F16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71, killed $vgpr88, 0, 0, 0, 0, 0, implicit $exec
+    ;
+    ; GFX1251-LABEL: name: test_swmmac_f32_16x16x64_f16_D0_overlaps_B1
+    ; GFX1251: early-clobber $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = V_SWMMAC_F32_16X16X64_F16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr88, 0, 0, 0, 0, 0, implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: early-clobber $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71 = V_SWMMAC_F32_16X16X64_F16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71, killed $vgpr88, 0, 0, 0, 0, 0, implicit $exec
     $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = V_SWMMAC_F32_16X16X64_F16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr88, 0, 0, 0, 0, 0, implicit $exec
     $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71 = V_SWMMAC_F32_16X16X64_F16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71, killed $vgpr88, 0, 0, 0, 0, 0, implicit $exec
 ...
@@ -1599,6 +3084,19 @@ body: |
     ; GFX1250-NEXT: V_NOP_e32 implicit $exec
     ; GFX1250-NEXT: V_NOP_e32 implicit $exec
     ; GFX1250-NEXT: early-clobber $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71 = V_SWMMAC_F32_16X16X64_F16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71, killed $vgpr24, 0, 0, 0, 0, 0, implicit $exec
+    ;
+    ; GFX1251-LABEL: name: test_swmmac_f32_16x16x64_f16_D0_overlaps_Index1
+    ; GFX1251: early-clobber $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = V_SWMMAC_F32_16X16X64_F16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr88, 0, 0, 0, 0, 0, implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: early-clobber $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71 = V_SWMMAC_F32_16X16X64_F16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71, killed $vgpr24, 0, 0, 0, 0, 0, implicit $exec
     $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = V_SWMMAC_F32_16X16X64_F16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr88, 0, 0, 0, 0, 0, implicit $exec
     $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71 = V_SWMMAC_F32_16X16X64_F16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71, killed $vgpr24, 0, 0, 0, 0, 0, implicit $exec
 ...
@@ -1613,6 +3111,19 @@ body:             |
     ; GFX1250-NEXT: V_NOP_e32 implicit $exec
     ; GFX1250-NEXT: V_NOP_e32 implicit $exec
     ; GFX1250-NEXT: early-clobber $vgpr64_vgpr65_vgpr66_vgpr67 = V_SWMMAC_F16_16X16X64_F16_w32_twoaddr 8, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr64_vgpr65_vgpr66_vgpr67, killed $vgpr88, 0, 0, 0, 0, 0, implicit $exec
+    ;
+    ; GFX1251-LABEL: name: test_swmmac_f16_16x16x64_f16_D0_overlaps_A1
+    ; GFX1251: early-clobber $vgpr24_vgpr25_vgpr26_vgpr27 = V_SWMMAC_F16_16X16X64_F16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27, killed $vgpr88, 0, 0, 0, 0, 0, implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: early-clobber $vgpr64_vgpr65_vgpr66_vgpr67 = V_SWMMAC_F16_16X16X64_F16_w32_twoaddr 8, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr64_vgpr65_vgpr66_vgpr67, killed $vgpr88, 0, 0, 0, 0, 0, implicit $exec
     $vgpr24_vgpr25_vgpr26_vgpr27 = V_SWMMAC_F16_16X16X64_F16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27, killed $vgpr88, 0, 0, 0, 0, 0, implicit $exec
     $vgpr64_vgpr65_vgpr66_vgpr67 = V_SWMMAC_F16_16X16X64_F16_w32_twoaddr 8, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr64_vgpr65_vgpr66_vgpr67, killed $vgpr88, 0, 0, 0, 0, 0, implicit $exec
 ...
@@ -1627,6 +3138,19 @@ body:             |
     ; GFX1250-NEXT: V_NOP_e32 implicit $exec
     ; GFX1250-NEXT: V_NOP_e32 implicit $exec
     ; GFX1250-NEXT: early-clobber $vgpr64_vgpr65_vgpr66_vgpr67 = V_SWMMAC_F16_16X16X64_F16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27, killed $vgpr64_vgpr65_vgpr66_vgpr67, killed $vgpr88, 0, 0, 0, 0, 0, implicit $exec
+    ;
+    ; GFX1251-LABEL: name: test_swmmac_f16_16x16x64_f16_D0_overlaps_B1
+    ; GFX1251: early-clobber $vgpr24_vgpr25_vgpr26_vgpr27 = V_SWMMAC_F16_16X16X64_F16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27, killed $vgpr88, 0, 0, 0, 0, 0, implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: early-clobber $vgpr64_vgpr65_vgpr66_vgpr67 = V_SWMMAC_F16_16X16X64_F16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27, killed $vgpr64_vgpr65_vgpr66_vgpr67, killed $vgpr88, 0, 0, 0, 0, 0, implicit $exec
     $vgpr24_vgpr25_vgpr26_vgpr27 = V_SWMMAC_F16_16X16X64_F16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27, killed $vgpr88, 0, 0, 0, 0, 0, implicit $exec
     $vgpr64_vgpr65_vgpr66_vgpr67 = V_SWMMAC_F16_16X16X64_F16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27, killed $vgpr64_vgpr65_vgpr66_vgpr67, killed $vgpr88, 0, 0, 0, 0, 0, implicit $exec
 ...
@@ -1641,6 +3165,19 @@ body:             |
     ; GFX1250-NEXT: V_NOP_e32 implicit $exec
     ; GFX1250-NEXT: V_NOP_e32 implicit $exec
     ; GFX1250-NEXT: early-clobber $vgpr64_vgpr65_vgpr66_vgpr67 = V_SWMMAC_F16_16X16X64_F16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr64_vgpr65_vgpr66_vgpr67, killed $vgpr24, 0, 0, 0, 0, 0, implicit $exec
+    ;
+    ; GFX1251-LABEL: name: test_swmmac_f16_16x16x64_f16_D0_overlaps_Index1
+    ; GFX1251: early-clobber $vgpr24_vgpr25_vgpr26_vgpr27 = V_SWMMAC_F16_16X16X64_F16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27, killed $vgpr88, 0, 0, 0, 0, 0, implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: early-clobber $vgpr64_vgpr65_vgpr66_vgpr67 = V_SWMMAC_F16_16X16X64_F16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr64_vgpr65_vgpr66_vgpr67, killed $vgpr24, 0, 0, 0, 0, 0, implicit $exec
     $vgpr24_vgpr25_vgpr26_vgpr27 = V_SWMMAC_F16_16X16X64_F16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27, killed $vgpr88, 0, 0, 0, 0, 0, implicit $exec
     $vgpr64_vgpr65_vgpr66_vgpr67 = V_SWMMAC_F16_16X16X64_F16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr64_vgpr65_vgpr66_vgpr67, killed $vgpr24, 0, 0, 0, 0, 0, implicit $exec
 ...
diff --git a/llvm/test/CodeGen/AMDGPU/wmma-hazards-gfx1251-w32.mir b/llvm/test/CodeGen/AMDGPU/wmma-hazards-gfx1251-w32.mir
new file mode 100644
index 0000000000000..64a568b7fe983
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/wmma-hazards-gfx1251-w32.mir
@@ -0,0 +1,42 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -mtriple=amdgcn -mcpu=gfx1251 -run-pass post-RA-hazard-rec %s -o - | FileCheck -check-prefix=GFX1251 %s
+
+# For two consecutive wmma instructions, we need to insert one V_NOP instruction between
+# them if matrix A, B or the index of the second wmma is the same as or overlaps with the
+# previous wmma instruction’s D-matrix.
+
+---
+name: test_wmma_f64_16x16x4_f64_D0_overlaps_A1
+body: |
+  bb.0:
+    ; GFX1251-LABEL: name: test_wmma_f64_16x16x4_f64_D0_overlaps_A1
+    ; GFX1251: early-clobber $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F64_16X16X4_F64_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 8, killed $vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: early-clobber $vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49 = V_WMMA_F64_16X16X4_F64_w32_twoaddr 8, killed $vgpr8_vgpr9_vgpr10_vgpr11, 8, killed $vgpr30_vgpr31_vgpr32_vgpr33, 8, killed $vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49, 0, 0, 0, 0, implicit $exec
+    $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F64_16X16X4_F64_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 8, killed $vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec
+    $vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49 = V_WMMA_F64_16X16X4_F64_w32_twoaddr 8, killed $vgpr8_vgpr9_vgpr10_vgpr11, 8, killed $vgpr30_vgpr31_vgpr32_vgpr33, 8, killed $vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49, 0, 0, 0, 0, implicit $exec
+...
+
+---
+name: test_wmma_f64_16x16x4_f64_D0_overlaps_B1
+body: |
+  bb.0:
+    ; GFX1251-LABEL: name: test_wmma_f64_16x16x4_f64_D0_overlaps_B1
+    ; GFX1251: early-clobber $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F64_16X16X4_F64_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 8, killed $vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: early-clobber $vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49 = V_WMMA_F64_16X16X4_F64_w32_twoaddr 8, killed $vgpr26_vgpr27_vgpr28_vgpr29, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11, 8, killed $vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49, 0, 0, 0, 0, implicit $exec
+    $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F64_16X16X4_F64_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 8, killed $vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec
+    $vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49 = V_WMMA_F64_16X16X4_F64_w32_twoaddr 8, killed $vgpr26_vgpr27_vgpr28_vgpr29, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11, 8, killed $vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49, 0, 0, 0, 0, implicit $exec
+...
+
+---
+name: test_wmma_f64_16x16x4_f64_D0_overlaps_Index1
+body: |
+  bb.0:
+    ; GFX1251-LABEL: name: test_wmma_f64_16x16x4_f64_D0_overlaps_Index1
+    ; GFX1251: early-clobber $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F64_16X16X4_F64_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 8, killed $vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec
+    ; GFX1251-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1251-NEXT: early-clobber $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63 = V_SWMMAC_F32_16X16X128_FP8_FP8_w32_twoaddr killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55, killed $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, killed $vgpr16_vgpr17, 0, 0, 0, implicit $exec
+    $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F64_16X16X4_F64_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 8, killed $vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec
+    $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63 = V_SWMMAC_F32_16X16X128_FP8_FP8_w32_twoaddr killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55, killed $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, killed $vgpr16_vgpr17, 0, 0, 0, implicit $exec
+...



More information about the llvm-branch-commits mailing list