[llvm] [AMDGPU] Use larger immediate values in S_NOP (PR #158990)

Jay Foad via llvm-commits llvm-commits at lists.llvm.org
Tue Sep 16 04:13:44 PDT 2025


https://github.com/jayfoad created https://github.com/llvm/llvm-project/pull/158990

The S_NOP instruction has an immediate operand which is one less than
the number of cycles to delay for. The maximum value that may be encoded
in this field was increased in GFX8 and again in GFX12.


>From f2015192a1a414d66c80cff89e337ff23d3592e6 Mon Sep 17 00:00:00 2001
From: Jay Foad <jay.foad at amd.com>
Date: Tue, 16 Sep 2025 12:05:30 +0100
Subject: [PATCH 1/2] Regenerate checks. NFC.

---
 .../CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx942.ll | 3684 ++++++++---------
 1 file changed, 1813 insertions(+), 1871 deletions(-)

diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx942.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx942.ll
index beda16c17a5c9..7fd27f5314833 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx942.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx942.ll
@@ -31,26 +31,26 @@ declare <16 x float> @llvm.amdgcn.smfmac.f32.32x32x32.fp8.bf8(<2 x i32>, <4 x i3
 declare <16 x float> @llvm.amdgcn.smfmac.f32.32x32x32.fp8.fp8(<2 x i32>, <4 x i32>, <16 x float>, i32, i32, i32)
 
 define amdgpu_kernel void @test_mfma_i32_16x16x32i8(ptr addrspace(1) %arg) #0 {
-; GFX942-VGPRCD-SDAG-LABEL: test_mfma_i32_16x16x32i8:
-; GFX942-VGPRCD-SDAG:       ; %bb.0: ; %bb
-; GFX942-VGPRCD-SDAG-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x24
-; GFX942-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v2, 2
-; GFX942-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v3, 1
-; GFX942-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v4, 4
-; GFX942-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v5, 3
-; GFX942-VGPRCD-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX942-VGPRCD-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GFX942-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v0, 0
-; GFX942-VGPRCD-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX942-VGPRCD-SDAG-NEXT:    v_accvgpr_write_b32 a0, s0
-; GFX942-VGPRCD-SDAG-NEXT:    v_accvgpr_write_b32 a1, s1
-; GFX942-VGPRCD-SDAG-NEXT:    v_accvgpr_write_b32 a2, s2
-; GFX942-VGPRCD-SDAG-NEXT:    v_accvgpr_write_b32 a3, s3
-; GFX942-VGPRCD-SDAG-NEXT:    s_nop 1
-; GFX942-VGPRCD-SDAG-NEXT:    v_mfma_i32_16x16x32_i8 a[0:3], v[2:3], v[4:5], a[0:3] cbsz:1 abid:2 blgp:3
-; GFX942-VGPRCD-SDAG-NEXT:    s_nop 6
-; GFX942-VGPRCD-SDAG-NEXT:    global_store_dwordx4 v0, a[0:3], s[6:7]
-; GFX942-VGPRCD-SDAG-NEXT:    s_endpgm
+; GFX942-SDAG-LABEL: test_mfma_i32_16x16x32i8:
+; GFX942-SDAG:       ; %bb.0: ; %bb
+; GFX942-SDAG-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x24
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v2, 2
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v3, 1
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v4, 4
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v5, 3
+; GFX942-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-SDAG-NEXT:    v_accvgpr_write_b32 a0, s0
+; GFX942-SDAG-NEXT:    v_accvgpr_write_b32 a1, s1
+; GFX942-SDAG-NEXT:    v_accvgpr_write_b32 a2, s2
+; GFX942-SDAG-NEXT:    v_accvgpr_write_b32 a3, s3
+; GFX942-SDAG-NEXT:    s_nop 1
+; GFX942-SDAG-NEXT:    v_mfma_i32_16x16x32_i8 a[0:3], v[2:3], v[4:5], a[0:3] cbsz:1 abid:2 blgp:3
+; GFX942-SDAG-NEXT:    s_nop 6
+; GFX942-SDAG-NEXT:    global_store_dwordx4 v0, a[0:3], s[6:7]
+; GFX942-SDAG-NEXT:    s_endpgm
 ;
 ; GFX942-GISEL-LABEL: test_mfma_i32_16x16x32i8:
 ; GFX942-GISEL:       ; %bb.0: ; %bb
@@ -73,47 +73,26 @@ define amdgpu_kernel void @test_mfma_i32_16x16x32i8(ptr addrspace(1) %arg) #0 {
 ; GFX942-GISEL-NEXT:    global_store_dwordx4 v0, a[0:3], s[6:7]
 ; GFX942-GISEL-NEXT:    s_endpgm
 ;
-; GFX942-AGPRCD-SDAG-LABEL: test_mfma_i32_16x16x32i8:
-; GFX942-AGPRCD-SDAG:       ; %bb.0: ; %bb
-; GFX942-AGPRCD-SDAG-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x24
-; GFX942-AGPRCD-SDAG-NEXT:    v_mov_b32_e32 v0, 2
-; GFX942-AGPRCD-SDAG-NEXT:    v_mov_b32_e32 v1, 1
-; GFX942-AGPRCD-SDAG-NEXT:    v_mov_b32_e32 v2, 4
-; GFX942-AGPRCD-SDAG-NEXT:    v_mov_b32_e32 v3, 3
-; GFX942-AGPRCD-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX942-AGPRCD-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GFX942-AGPRCD-SDAG-NEXT:    v_mov_b32_e32 v4, 0
-; GFX942-AGPRCD-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX942-AGPRCD-SDAG-NEXT:    v_accvgpr_write_b32 a0, s0
-; GFX942-AGPRCD-SDAG-NEXT:    v_accvgpr_write_b32 a1, s1
-; GFX942-AGPRCD-SDAG-NEXT:    v_accvgpr_write_b32 a2, s2
-; GFX942-AGPRCD-SDAG-NEXT:    v_accvgpr_write_b32 a3, s3
-; GFX942-AGPRCD-SDAG-NEXT:    s_nop 1
-; GFX942-AGPRCD-SDAG-NEXT:    v_mfma_i32_16x16x32_i8 a[0:3], v[0:1], v[2:3], a[0:3] cbsz:1 abid:2 blgp:3
-; GFX942-AGPRCD-SDAG-NEXT:    s_nop 6
-; GFX942-AGPRCD-SDAG-NEXT:    global_store_dwordx4 v4, a[0:3], s[6:7]
-; GFX942-AGPRCD-SDAG-NEXT:    s_endpgm
-;
-; GFX950-VGPRCD-SDAG-LABEL: test_mfma_i32_16x16x32i8:
-; GFX950-VGPRCD-SDAG:       ; %bb.0: ; %bb
-; GFX950-VGPRCD-SDAG-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x24
-; GFX950-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v2, 2
-; GFX950-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v3, 1
-; GFX950-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v4, 4
-; GFX950-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v5, 3
-; GFX950-VGPRCD-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX950-VGPRCD-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GFX950-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v0, 0
-; GFX950-VGPRCD-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX950-VGPRCD-SDAG-NEXT:    v_accvgpr_write_b32 a0, s0
-; GFX950-VGPRCD-SDAG-NEXT:    v_accvgpr_write_b32 a1, s1
-; GFX950-VGPRCD-SDAG-NEXT:    v_accvgpr_write_b32 a2, s2
-; GFX950-VGPRCD-SDAG-NEXT:    v_accvgpr_write_b32 a3, s3
-; GFX950-VGPRCD-SDAG-NEXT:    s_nop 1
-; GFX950-VGPRCD-SDAG-NEXT:    v_mfma_i32_16x16x32_i8 a[0:3], v[2:3], v[4:5], a[0:3] cbsz:1 abid:2 blgp:3
-; GFX950-VGPRCD-SDAG-NEXT:    s_nop 7
-; GFX950-VGPRCD-SDAG-NEXT:    global_store_dwordx4 v0, a[0:3], s[6:7]
-; GFX950-VGPRCD-SDAG-NEXT:    s_endpgm
+; GFX950-SDAG-LABEL: test_mfma_i32_16x16x32i8:
+; GFX950-SDAG:       ; %bb.0: ; %bb
+; GFX950-SDAG-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x24
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v2, 2
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v3, 1
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v4, 4
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v5, 3
+; GFX950-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX950-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v0, 0
+; GFX950-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX950-SDAG-NEXT:    v_accvgpr_write_b32 a0, s0
+; GFX950-SDAG-NEXT:    v_accvgpr_write_b32 a1, s1
+; GFX950-SDAG-NEXT:    v_accvgpr_write_b32 a2, s2
+; GFX950-SDAG-NEXT:    v_accvgpr_write_b32 a3, s3
+; GFX950-SDAG-NEXT:    s_nop 1
+; GFX950-SDAG-NEXT:    v_mfma_i32_16x16x32_i8 a[0:3], v[2:3], v[4:5], a[0:3] cbsz:1 abid:2 blgp:3
+; GFX950-SDAG-NEXT:    s_nop 7
+; GFX950-SDAG-NEXT:    global_store_dwordx4 v0, a[0:3], s[6:7]
+; GFX950-SDAG-NEXT:    s_endpgm
 ;
 ; GFX950-GISEL-LABEL: test_mfma_i32_16x16x32i8:
 ; GFX950-GISEL:       ; %bb.0: ; %bb
@@ -135,7 +114,26 @@ define amdgpu_kernel void @test_mfma_i32_16x16x32i8(ptr addrspace(1) %arg) #0 {
 ; GFX950-GISEL-NEXT:    s_nop 6
 ; GFX950-GISEL-NEXT:    global_store_dwordx4 v0, a[0:3], s[6:7]
 ; GFX950-GISEL-NEXT:    s_endpgm
-;
+; GFX942-AGPRCD-SDAG-LABEL: test_mfma_i32_16x16x32i8:
+; GFX942-AGPRCD-SDAG:       ; %bb.0: ; %bb
+; GFX942-AGPRCD-SDAG-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x24
+; GFX942-AGPRCD-SDAG-NEXT:    v_mov_b32_e32 v0, 2
+; GFX942-AGPRCD-SDAG-NEXT:    v_mov_b32_e32 v1, 1
+; GFX942-AGPRCD-SDAG-NEXT:    v_mov_b32_e32 v2, 4
+; GFX942-AGPRCD-SDAG-NEXT:    v_mov_b32_e32 v3, 3
+; GFX942-AGPRCD-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-AGPRCD-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX942-AGPRCD-SDAG-NEXT:    v_mov_b32_e32 v4, 0
+; GFX942-AGPRCD-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-AGPRCD-SDAG-NEXT:    v_accvgpr_write_b32 a0, s0
+; GFX942-AGPRCD-SDAG-NEXT:    v_accvgpr_write_b32 a1, s1
+; GFX942-AGPRCD-SDAG-NEXT:    v_accvgpr_write_b32 a2, s2
+; GFX942-AGPRCD-SDAG-NEXT:    v_accvgpr_write_b32 a3, s3
+; GFX942-AGPRCD-SDAG-NEXT:    s_nop 1
+; GFX942-AGPRCD-SDAG-NEXT:    v_mfma_i32_16x16x32_i8 a[0:3], v[0:1], v[2:3], a[0:3] cbsz:1 abid:2 blgp:3
+; GFX942-AGPRCD-SDAG-NEXT:    s_nop 6
+; GFX942-AGPRCD-SDAG-NEXT:    global_store_dwordx4 v4, a[0:3], s[6:7]
+; GFX942-AGPRCD-SDAG-NEXT:    s_endpgm
 ; GFX950-AGPRCD-SDAG-LABEL: test_mfma_i32_16x16x32i8:
 ; GFX950-AGPRCD-SDAG:       ; %bb.0: ; %bb
 ; GFX950-AGPRCD-SDAG-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x24
@@ -319,26 +317,26 @@ bb:
 }
 
 define amdgpu_kernel void @test_mfma_f32_16x16x32_bf8_bf8(ptr addrspace(1) %arg) #0 {
-; GFX942-VGPRCD-SDAG-LABEL: test_mfma_f32_16x16x32_bf8_bf8:
-; GFX942-VGPRCD-SDAG:       ; %bb.0: ; %bb
-; GFX942-VGPRCD-SDAG-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x24
-; GFX942-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v2, 2
-; GFX942-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v3, 1
-; GFX942-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v4, 4
-; GFX942-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v5, 3
-; GFX942-VGPRCD-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX942-VGPRCD-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GFX942-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v0, 0
-; GFX942-VGPRCD-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX942-VGPRCD-SDAG-NEXT:    v_accvgpr_write_b32 a0, s0
-; GFX942-VGPRCD-SDAG-NEXT:    v_accvgpr_write_b32 a1, s1
-; GFX942-VGPRCD-SDAG-NEXT:    v_accvgpr_write_b32 a2, s2
-; GFX942-VGPRCD-SDAG-NEXT:    v_accvgpr_write_b32 a3, s3
-; GFX942-VGPRCD-SDAG-NEXT:    s_nop 1
-; GFX942-VGPRCD-SDAG-NEXT:    v_mfma_f32_16x16x32_bf8_bf8 a[0:3], v[2:3], v[4:5], a[0:3] cbsz:1 abid:2 blgp:3
-; GFX942-VGPRCD-SDAG-NEXT:    s_nop 6
-; GFX942-VGPRCD-SDAG-NEXT:    global_store_dwordx4 v0, a[0:3], s[6:7]
-; GFX942-VGPRCD-SDAG-NEXT:    s_endpgm
+; GFX942-SDAG-LABEL: test_mfma_f32_16x16x32_bf8_bf8:
+; GFX942-SDAG:       ; %bb.0: ; %bb
+; GFX942-SDAG-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x24
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v2, 2
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v3, 1
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v4, 4
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v5, 3
+; GFX942-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-SDAG-NEXT:    v_accvgpr_write_b32 a0, s0
+; GFX942-SDAG-NEXT:    v_accvgpr_write_b32 a1, s1
+; GFX942-SDAG-NEXT:    v_accvgpr_write_b32 a2, s2
+; GFX942-SDAG-NEXT:    v_accvgpr_write_b32 a3, s3
+; GFX942-SDAG-NEXT:    s_nop 1
+; GFX942-SDAG-NEXT:    v_mfma_f32_16x16x32_bf8_bf8 a[0:3], v[2:3], v[4:5], a[0:3] cbsz:1 abid:2 blgp:3
+; GFX942-SDAG-NEXT:    s_nop 6
+; GFX942-SDAG-NEXT:    global_store_dwordx4 v0, a[0:3], s[6:7]
+; GFX942-SDAG-NEXT:    s_endpgm
 ;
 ; GFX942-GISEL-LABEL: test_mfma_f32_16x16x32_bf8_bf8:
 ; GFX942-GISEL:       ; %bb.0: ; %bb
@@ -361,47 +359,26 @@ define amdgpu_kernel void @test_mfma_f32_16x16x32_bf8_bf8(ptr addrspace(1) %arg)
 ; GFX942-GISEL-NEXT:    global_store_dwordx4 v0, a[0:3], s[6:7]
 ; GFX942-GISEL-NEXT:    s_endpgm
 ;
-; GFX942-AGPRCD-SDAG-LABEL: test_mfma_f32_16x16x32_bf8_bf8:
-; GFX942-AGPRCD-SDAG:       ; %bb.0: ; %bb
-; GFX942-AGPRCD-SDAG-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x24
-; GFX942-AGPRCD-SDAG-NEXT:    v_mov_b32_e32 v0, 2
-; GFX942-AGPRCD-SDAG-NEXT:    v_mov_b32_e32 v1, 1
-; GFX942-AGPRCD-SDAG-NEXT:    v_mov_b32_e32 v2, 4
-; GFX942-AGPRCD-SDAG-NEXT:    v_mov_b32_e32 v3, 3
-; GFX942-AGPRCD-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX942-AGPRCD-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GFX942-AGPRCD-SDAG-NEXT:    v_mov_b32_e32 v4, 0
-; GFX942-AGPRCD-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX942-AGPRCD-SDAG-NEXT:    v_accvgpr_write_b32 a0, s0
-; GFX942-AGPRCD-SDAG-NEXT:    v_accvgpr_write_b32 a1, s1
-; GFX942-AGPRCD-SDAG-NEXT:    v_accvgpr_write_b32 a2, s2
-; GFX942-AGPRCD-SDAG-NEXT:    v_accvgpr_write_b32 a3, s3
-; GFX942-AGPRCD-SDAG-NEXT:    s_nop 1
-; GFX942-AGPRCD-SDAG-NEXT:    v_mfma_f32_16x16x32_bf8_bf8 a[0:3], v[0:1], v[2:3], a[0:3] cbsz:1 abid:2 blgp:3
-; GFX942-AGPRCD-SDAG-NEXT:    s_nop 6
-; GFX942-AGPRCD-SDAG-NEXT:    global_store_dwordx4 v4, a[0:3], s[6:7]
-; GFX942-AGPRCD-SDAG-NEXT:    s_endpgm
-;
-; GFX950-VGPRCD-SDAG-LABEL: test_mfma_f32_16x16x32_bf8_bf8:
-; GFX950-VGPRCD-SDAG:       ; %bb.0: ; %bb
-; GFX950-VGPRCD-SDAG-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x24
-; GFX950-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v2, 2
-; GFX950-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v3, 1
-; GFX950-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v4, 4
-; GFX950-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v5, 3
-; GFX950-VGPRCD-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX950-VGPRCD-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GFX950-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v0, 0
-; GFX950-VGPRCD-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX950-VGPRCD-SDAG-NEXT:    v_accvgpr_write_b32 a0, s0
-; GFX950-VGPRCD-SDAG-NEXT:    v_accvgpr_write_b32 a1, s1
-; GFX950-VGPRCD-SDAG-NEXT:    v_accvgpr_write_b32 a2, s2
-; GFX950-VGPRCD-SDAG-NEXT:    v_accvgpr_write_b32 a3, s3
-; GFX950-VGPRCD-SDAG-NEXT:    s_nop 1
-; GFX950-VGPRCD-SDAG-NEXT:    v_mfma_f32_16x16x32_bf8_bf8 a[0:3], v[2:3], v[4:5], a[0:3] cbsz:1 abid:2 blgp:3
-; GFX950-VGPRCD-SDAG-NEXT:    s_nop 7
-; GFX950-VGPRCD-SDAG-NEXT:    global_store_dwordx4 v0, a[0:3], s[6:7]
-; GFX950-VGPRCD-SDAG-NEXT:    s_endpgm
+; GFX950-SDAG-LABEL: test_mfma_f32_16x16x32_bf8_bf8:
+; GFX950-SDAG:       ; %bb.0: ; %bb
+; GFX950-SDAG-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x24
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v2, 2
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v3, 1
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v4, 4
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v5, 3
+; GFX950-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX950-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v0, 0
+; GFX950-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX950-SDAG-NEXT:    v_accvgpr_write_b32 a0, s0
+; GFX950-SDAG-NEXT:    v_accvgpr_write_b32 a1, s1
+; GFX950-SDAG-NEXT:    v_accvgpr_write_b32 a2, s2
+; GFX950-SDAG-NEXT:    v_accvgpr_write_b32 a3, s3
+; GFX950-SDAG-NEXT:    s_nop 1
+; GFX950-SDAG-NEXT:    v_mfma_f32_16x16x32_bf8_bf8 a[0:3], v[2:3], v[4:5], a[0:3] cbsz:1 abid:2 blgp:3
+; GFX950-SDAG-NEXT:    s_nop 7
+; GFX950-SDAG-NEXT:    global_store_dwordx4 v0, a[0:3], s[6:7]
+; GFX950-SDAG-NEXT:    s_endpgm
 ;
 ; GFX950-GISEL-LABEL: test_mfma_f32_16x16x32_bf8_bf8:
 ; GFX950-GISEL:       ; %bb.0: ; %bb
@@ -423,7 +400,26 @@ define amdgpu_kernel void @test_mfma_f32_16x16x32_bf8_bf8(ptr addrspace(1) %arg)
 ; GFX950-GISEL-NEXT:    s_nop 6
 ; GFX950-GISEL-NEXT:    global_store_dwordx4 v0, a[0:3], s[6:7]
 ; GFX950-GISEL-NEXT:    s_endpgm
-;
+; GFX942-AGPRCD-SDAG-LABEL: test_mfma_f32_16x16x32_bf8_bf8:
+; GFX942-AGPRCD-SDAG:       ; %bb.0: ; %bb
+; GFX942-AGPRCD-SDAG-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x24
+; GFX942-AGPRCD-SDAG-NEXT:    v_mov_b32_e32 v0, 2
+; GFX942-AGPRCD-SDAG-NEXT:    v_mov_b32_e32 v1, 1
+; GFX942-AGPRCD-SDAG-NEXT:    v_mov_b32_e32 v2, 4
+; GFX942-AGPRCD-SDAG-NEXT:    v_mov_b32_e32 v3, 3
+; GFX942-AGPRCD-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-AGPRCD-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX942-AGPRCD-SDAG-NEXT:    v_mov_b32_e32 v4, 0
+; GFX942-AGPRCD-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-AGPRCD-SDAG-NEXT:    v_accvgpr_write_b32 a0, s0
+; GFX942-AGPRCD-SDAG-NEXT:    v_accvgpr_write_b32 a1, s1
+; GFX942-AGPRCD-SDAG-NEXT:    v_accvgpr_write_b32 a2, s2
+; GFX942-AGPRCD-SDAG-NEXT:    v_accvgpr_write_b32 a3, s3
+; GFX942-AGPRCD-SDAG-NEXT:    s_nop 1
+; GFX942-AGPRCD-SDAG-NEXT:    v_mfma_f32_16x16x32_bf8_bf8 a[0:3], v[0:1], v[2:3], a[0:3] cbsz:1 abid:2 blgp:3
+; GFX942-AGPRCD-SDAG-NEXT:    s_nop 6
+; GFX942-AGPRCD-SDAG-NEXT:    global_store_dwordx4 v4, a[0:3], s[6:7]
+; GFX942-AGPRCD-SDAG-NEXT:    s_endpgm
 ; GFX950-AGPRCD-SDAG-LABEL: test_mfma_f32_16x16x32_bf8_bf8:
 ; GFX950-AGPRCD-SDAG:       ; %bb.0: ; %bb
 ; GFX950-AGPRCD-SDAG-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x24
@@ -452,26 +448,26 @@ bb:
 }
 
 define amdgpu_kernel void @test_mfma_f32_16x16x32_bf8_fp8(ptr addrspace(1) %arg) #0 {
-; GFX942-VGPRCD-SDAG-LABEL: test_mfma_f32_16x16x32_bf8_fp8:
-; GFX942-VGPRCD-SDAG:       ; %bb.0: ; %bb
-; GFX942-VGPRCD-SDAG-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x24
-; GFX942-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v2, 2
-; GFX942-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v3, 1
-; GFX942-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v4, 4
-; GFX942-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v5, 3
-; GFX942-VGPRCD-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX942-VGPRCD-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GFX942-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v0, 0
-; GFX942-VGPRCD-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX942-VGPRCD-SDAG-NEXT:    v_accvgpr_write_b32 a0, s0
-; GFX942-VGPRCD-SDAG-NEXT:    v_accvgpr_write_b32 a1, s1
-; GFX942-VGPRCD-SDAG-NEXT:    v_accvgpr_write_b32 a2, s2
-; GFX942-VGPRCD-SDAG-NEXT:    v_accvgpr_write_b32 a3, s3
-; GFX942-VGPRCD-SDAG-NEXT:    s_nop 1
-; GFX942-VGPRCD-SDAG-NEXT:    v_mfma_f32_16x16x32_bf8_fp8 a[0:3], v[2:3], v[4:5], a[0:3] cbsz:1 abid:2 blgp:3
-; GFX942-VGPRCD-SDAG-NEXT:    s_nop 6
-; GFX942-VGPRCD-SDAG-NEXT:    global_store_dwordx4 v0, a[0:3], s[6:7]
-; GFX942-VGPRCD-SDAG-NEXT:    s_endpgm
+; GFX942-SDAG-LABEL: test_mfma_f32_16x16x32_bf8_fp8:
+; GFX942-SDAG:       ; %bb.0: ; %bb
+; GFX942-SDAG-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x24
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v2, 2
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v3, 1
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v4, 4
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v5, 3
+; GFX942-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-SDAG-NEXT:    v_accvgpr_write_b32 a0, s0
+; GFX942-SDAG-NEXT:    v_accvgpr_write_b32 a1, s1
+; GFX942-SDAG-NEXT:    v_accvgpr_write_b32 a2, s2
+; GFX942-SDAG-NEXT:    v_accvgpr_write_b32 a3, s3
+; GFX942-SDAG-NEXT:    s_nop 1
+; GFX942-SDAG-NEXT:    v_mfma_f32_16x16x32_bf8_fp8 a[0:3], v[2:3], v[4:5], a[0:3] cbsz:1 abid:2 blgp:3
+; GFX942-SDAG-NEXT:    s_nop 6
+; GFX942-SDAG-NEXT:    global_store_dwordx4 v0, a[0:3], s[6:7]
+; GFX942-SDAG-NEXT:    s_endpgm
 ;
 ; GFX942-GISEL-LABEL: test_mfma_f32_16x16x32_bf8_fp8:
 ; GFX942-GISEL:       ; %bb.0: ; %bb
@@ -494,47 +490,26 @@ define amdgpu_kernel void @test_mfma_f32_16x16x32_bf8_fp8(ptr addrspace(1) %arg)
 ; GFX942-GISEL-NEXT:    global_store_dwordx4 v0, a[0:3], s[6:7]
 ; GFX942-GISEL-NEXT:    s_endpgm
 ;
-; GFX942-AGPRCD-SDAG-LABEL: test_mfma_f32_16x16x32_bf8_fp8:
-; GFX942-AGPRCD-SDAG:       ; %bb.0: ; %bb
-; GFX942-AGPRCD-SDAG-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x24
-; GFX942-AGPRCD-SDAG-NEXT:    v_mov_b32_e32 v0, 2
-; GFX942-AGPRCD-SDAG-NEXT:    v_mov_b32_e32 v1, 1
-; GFX942-AGPRCD-SDAG-NEXT:    v_mov_b32_e32 v2, 4
-; GFX942-AGPRCD-SDAG-NEXT:    v_mov_b32_e32 v3, 3
-; GFX942-AGPRCD-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX942-AGPRCD-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GFX942-AGPRCD-SDAG-NEXT:    v_mov_b32_e32 v4, 0
-; GFX942-AGPRCD-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX942-AGPRCD-SDAG-NEXT:    v_accvgpr_write_b32 a0, s0
-; GFX942-AGPRCD-SDAG-NEXT:    v_accvgpr_write_b32 a1, s1
-; GFX942-AGPRCD-SDAG-NEXT:    v_accvgpr_write_b32 a2, s2
-; GFX942-AGPRCD-SDAG-NEXT:    v_accvgpr_write_b32 a3, s3
-; GFX942-AGPRCD-SDAG-NEXT:    s_nop 1
-; GFX942-AGPRCD-SDAG-NEXT:    v_mfma_f32_16x16x32_bf8_fp8 a[0:3], v[0:1], v[2:3], a[0:3] cbsz:1 abid:2 blgp:3
-; GFX942-AGPRCD-SDAG-NEXT:    s_nop 6
-; GFX942-AGPRCD-SDAG-NEXT:    global_store_dwordx4 v4, a[0:3], s[6:7]
-; GFX942-AGPRCD-SDAG-NEXT:    s_endpgm
-;
-; GFX950-VGPRCD-SDAG-LABEL: test_mfma_f32_16x16x32_bf8_fp8:
-; GFX950-VGPRCD-SDAG:       ; %bb.0: ; %bb
-; GFX950-VGPRCD-SDAG-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x24
-; GFX950-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v2, 2
-; GFX950-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v3, 1
-; GFX950-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v4, 4
-; GFX950-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v5, 3
-; GFX950-VGPRCD-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX950-VGPRCD-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GFX950-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v0, 0
-; GFX950-VGPRCD-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX950-VGPRCD-SDAG-NEXT:    v_accvgpr_write_b32 a0, s0
-; GFX950-VGPRCD-SDAG-NEXT:    v_accvgpr_write_b32 a1, s1
-; GFX950-VGPRCD-SDAG-NEXT:    v_accvgpr_write_b32 a2, s2
-; GFX950-VGPRCD-SDAG-NEXT:    v_accvgpr_write_b32 a3, s3
-; GFX950-VGPRCD-SDAG-NEXT:    s_nop 1
-; GFX950-VGPRCD-SDAG-NEXT:    v_mfma_f32_16x16x32_bf8_fp8 a[0:3], v[2:3], v[4:5], a[0:3] cbsz:1 abid:2 blgp:3
-; GFX950-VGPRCD-SDAG-NEXT:    s_nop 7
-; GFX950-VGPRCD-SDAG-NEXT:    global_store_dwordx4 v0, a[0:3], s[6:7]
-; GFX950-VGPRCD-SDAG-NEXT:    s_endpgm
+; GFX950-SDAG-LABEL: test_mfma_f32_16x16x32_bf8_fp8:
+; GFX950-SDAG:       ; %bb.0: ; %bb
+; GFX950-SDAG-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x24
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v2, 2
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v3, 1
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v4, 4
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v5, 3
+; GFX950-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX950-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v0, 0
+; GFX950-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX950-SDAG-NEXT:    v_accvgpr_write_b32 a0, s0
+; GFX950-SDAG-NEXT:    v_accvgpr_write_b32 a1, s1
+; GFX950-SDAG-NEXT:    v_accvgpr_write_b32 a2, s2
+; GFX950-SDAG-NEXT:    v_accvgpr_write_b32 a3, s3
+; GFX950-SDAG-NEXT:    s_nop 1
+; GFX950-SDAG-NEXT:    v_mfma_f32_16x16x32_bf8_fp8 a[0:3], v[2:3], v[4:5], a[0:3] cbsz:1 abid:2 blgp:3
+; GFX950-SDAG-NEXT:    s_nop 7
+; GFX950-SDAG-NEXT:    global_store_dwordx4 v0, a[0:3], s[6:7]
+; GFX950-SDAG-NEXT:    s_endpgm
 ;
 ; GFX950-GISEL-LABEL: test_mfma_f32_16x16x32_bf8_fp8:
 ; GFX950-GISEL:       ; %bb.0: ; %bb
@@ -556,7 +531,26 @@ define amdgpu_kernel void @test_mfma_f32_16x16x32_bf8_fp8(ptr addrspace(1) %arg)
 ; GFX950-GISEL-NEXT:    s_nop 6
 ; GFX950-GISEL-NEXT:    global_store_dwordx4 v0, a[0:3], s[6:7]
 ; GFX950-GISEL-NEXT:    s_endpgm
-;
+; GFX942-AGPRCD-SDAG-LABEL: test_mfma_f32_16x16x32_bf8_fp8:
+; GFX942-AGPRCD-SDAG:       ; %bb.0: ; %bb
+; GFX942-AGPRCD-SDAG-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x24
+; GFX942-AGPRCD-SDAG-NEXT:    v_mov_b32_e32 v0, 2
+; GFX942-AGPRCD-SDAG-NEXT:    v_mov_b32_e32 v1, 1
+; GFX942-AGPRCD-SDAG-NEXT:    v_mov_b32_e32 v2, 4
+; GFX942-AGPRCD-SDAG-NEXT:    v_mov_b32_e32 v3, 3
+; GFX942-AGPRCD-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-AGPRCD-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX942-AGPRCD-SDAG-NEXT:    v_mov_b32_e32 v4, 0
+; GFX942-AGPRCD-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-AGPRCD-SDAG-NEXT:    v_accvgpr_write_b32 a0, s0
+; GFX942-AGPRCD-SDAG-NEXT:    v_accvgpr_write_b32 a1, s1
+; GFX942-AGPRCD-SDAG-NEXT:    v_accvgpr_write_b32 a2, s2
+; GFX942-AGPRCD-SDAG-NEXT:    v_accvgpr_write_b32 a3, s3
+; GFX942-AGPRCD-SDAG-NEXT:    s_nop 1
+; GFX942-AGPRCD-SDAG-NEXT:    v_mfma_f32_16x16x32_bf8_fp8 a[0:3], v[0:1], v[2:3], a[0:3] cbsz:1 abid:2 blgp:3
+; GFX942-AGPRCD-SDAG-NEXT:    s_nop 6
+; GFX942-AGPRCD-SDAG-NEXT:    global_store_dwordx4 v4, a[0:3], s[6:7]
+; GFX942-AGPRCD-SDAG-NEXT:    s_endpgm
 ; GFX950-AGPRCD-SDAG-LABEL: test_mfma_f32_16x16x32_bf8_fp8:
 ; GFX950-AGPRCD-SDAG:       ; %bb.0: ; %bb
 ; GFX950-AGPRCD-SDAG-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x24
@@ -585,26 +579,26 @@ bb:
 }
 
 define amdgpu_kernel void @test_mfma_f32_16x16x32_fp8_bf8(ptr addrspace(1) %arg) #0 {
-; GFX942-VGPRCD-SDAG-LABEL: test_mfma_f32_16x16x32_fp8_bf8:
-; GFX942-VGPRCD-SDAG:       ; %bb.0: ; %bb
-; GFX942-VGPRCD-SDAG-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x24
-; GFX942-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v2, 2
-; GFX942-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v3, 1
-; GFX942-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v4, 4
-; GFX942-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v5, 3
-; GFX942-VGPRCD-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX942-VGPRCD-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GFX942-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v0, 0
-; GFX942-VGPRCD-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX942-VGPRCD-SDAG-NEXT:    v_accvgpr_write_b32 a0, s0
-; GFX942-VGPRCD-SDAG-NEXT:    v_accvgpr_write_b32 a1, s1
-; GFX942-VGPRCD-SDAG-NEXT:    v_accvgpr_write_b32 a2, s2
-; GFX942-VGPRCD-SDAG-NEXT:    v_accvgpr_write_b32 a3, s3
-; GFX942-VGPRCD-SDAG-NEXT:    s_nop 1
-; GFX942-VGPRCD-SDAG-NEXT:    v_mfma_f32_16x16x32_fp8_bf8 a[0:3], v[2:3], v[4:5], a[0:3] cbsz:1 abid:2 blgp:3
-; GFX942-VGPRCD-SDAG-NEXT:    s_nop 6
-; GFX942-VGPRCD-SDAG-NEXT:    global_store_dwordx4 v0, a[0:3], s[6:7]
-; GFX942-VGPRCD-SDAG-NEXT:    s_endpgm
+; GFX942-SDAG-LABEL: test_mfma_f32_16x16x32_fp8_bf8:
+; GFX942-SDAG:       ; %bb.0: ; %bb
+; GFX942-SDAG-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x24
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v2, 2
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v3, 1
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v4, 4
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v5, 3
+; GFX942-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-SDAG-NEXT:    v_accvgpr_write_b32 a0, s0
+; GFX942-SDAG-NEXT:    v_accvgpr_write_b32 a1, s1
+; GFX942-SDAG-NEXT:    v_accvgpr_write_b32 a2, s2
+; GFX942-SDAG-NEXT:    v_accvgpr_write_b32 a3, s3
+; GFX942-SDAG-NEXT:    s_nop 1
+; GFX942-SDAG-NEXT:    v_mfma_f32_16x16x32_fp8_bf8 a[0:3], v[2:3], v[4:5], a[0:3] cbsz:1 abid:2 blgp:3
+; GFX942-SDAG-NEXT:    s_nop 6
+; GFX942-SDAG-NEXT:    global_store_dwordx4 v0, a[0:3], s[6:7]
+; GFX942-SDAG-NEXT:    s_endpgm
 ;
 ; GFX942-GISEL-LABEL: test_mfma_f32_16x16x32_fp8_bf8:
 ; GFX942-GISEL:       ; %bb.0: ; %bb
@@ -627,47 +621,26 @@ define amdgpu_kernel void @test_mfma_f32_16x16x32_fp8_bf8(ptr addrspace(1) %arg)
 ; GFX942-GISEL-NEXT:    global_store_dwordx4 v0, a[0:3], s[6:7]
 ; GFX942-GISEL-NEXT:    s_endpgm
 ;
-; GFX942-AGPRCD-SDAG-LABEL: test_mfma_f32_16x16x32_fp8_bf8:
-; GFX942-AGPRCD-SDAG:       ; %bb.0: ; %bb
-; GFX942-AGPRCD-SDAG-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x24
-; GFX942-AGPRCD-SDAG-NEXT:    v_mov_b32_e32 v0, 2
-; GFX942-AGPRCD-SDAG-NEXT:    v_mov_b32_e32 v1, 1
-; GFX942-AGPRCD-SDAG-NEXT:    v_mov_b32_e32 v2, 4
-; GFX942-AGPRCD-SDAG-NEXT:    v_mov_b32_e32 v3, 3
-; GFX942-AGPRCD-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX942-AGPRCD-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GFX942-AGPRCD-SDAG-NEXT:    v_mov_b32_e32 v4, 0
-; GFX942-AGPRCD-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX942-AGPRCD-SDAG-NEXT:    v_accvgpr_write_b32 a0, s0
-; GFX942-AGPRCD-SDAG-NEXT:    v_accvgpr_write_b32 a1, s1
-; GFX942-AGPRCD-SDAG-NEXT:    v_accvgpr_write_b32 a2, s2
-; GFX942-AGPRCD-SDAG-NEXT:    v_accvgpr_write_b32 a3, s3
-; GFX942-AGPRCD-SDAG-NEXT:    s_nop 1
-; GFX942-AGPRCD-SDAG-NEXT:    v_mfma_f32_16x16x32_fp8_bf8 a[0:3], v[0:1], v[2:3], a[0:3] cbsz:1 abid:2 blgp:3
-; GFX942-AGPRCD-SDAG-NEXT:    s_nop 6
-; GFX942-AGPRCD-SDAG-NEXT:    global_store_dwordx4 v4, a[0:3], s[6:7]
-; GFX942-AGPRCD-SDAG-NEXT:    s_endpgm
-;
-; GFX950-VGPRCD-SDAG-LABEL: test_mfma_f32_16x16x32_fp8_bf8:
-; GFX950-VGPRCD-SDAG:       ; %bb.0: ; %bb
-; GFX950-VGPRCD-SDAG-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x24
-; GFX950-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v2, 2
-; GFX950-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v3, 1
-; GFX950-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v4, 4
-; GFX950-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v5, 3
-; GFX950-VGPRCD-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX950-VGPRCD-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GFX950-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v0, 0
-; GFX950-VGPRCD-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX950-VGPRCD-SDAG-NEXT:    v_accvgpr_write_b32 a0, s0
-; GFX950-VGPRCD-SDAG-NEXT:    v_accvgpr_write_b32 a1, s1
-; GFX950-VGPRCD-SDAG-NEXT:    v_accvgpr_write_b32 a2, s2
-; GFX950-VGPRCD-SDAG-NEXT:    v_accvgpr_write_b32 a3, s3
-; GFX950-VGPRCD-SDAG-NEXT:    s_nop 1
-; GFX950-VGPRCD-SDAG-NEXT:    v_mfma_f32_16x16x32_fp8_bf8 a[0:3], v[2:3], v[4:5], a[0:3] cbsz:1 abid:2 blgp:3
-; GFX950-VGPRCD-SDAG-NEXT:    s_nop 7
-; GFX950-VGPRCD-SDAG-NEXT:    global_store_dwordx4 v0, a[0:3], s[6:7]
-; GFX950-VGPRCD-SDAG-NEXT:    s_endpgm
+; GFX950-SDAG-LABEL: test_mfma_f32_16x16x32_fp8_bf8:
+; GFX950-SDAG:       ; %bb.0: ; %bb
+; GFX950-SDAG-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x24
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v2, 2
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v3, 1
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v4, 4
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v5, 3
+; GFX950-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX950-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v0, 0
+; GFX950-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX950-SDAG-NEXT:    v_accvgpr_write_b32 a0, s0
+; GFX950-SDAG-NEXT:    v_accvgpr_write_b32 a1, s1
+; GFX950-SDAG-NEXT:    v_accvgpr_write_b32 a2, s2
+; GFX950-SDAG-NEXT:    v_accvgpr_write_b32 a3, s3
+; GFX950-SDAG-NEXT:    s_nop 1
+; GFX950-SDAG-NEXT:    v_mfma_f32_16x16x32_fp8_bf8 a[0:3], v[2:3], v[4:5], a[0:3] cbsz:1 abid:2 blgp:3
+; GFX950-SDAG-NEXT:    s_nop 7
+; GFX950-SDAG-NEXT:    global_store_dwordx4 v0, a[0:3], s[6:7]
+; GFX950-SDAG-NEXT:    s_endpgm
 ;
 ; GFX950-GISEL-LABEL: test_mfma_f32_16x16x32_fp8_bf8:
 ; GFX950-GISEL:       ; %bb.0: ; %bb
@@ -689,7 +662,26 @@ define amdgpu_kernel void @test_mfma_f32_16x16x32_fp8_bf8(ptr addrspace(1) %arg)
 ; GFX950-GISEL-NEXT:    s_nop 6
 ; GFX950-GISEL-NEXT:    global_store_dwordx4 v0, a[0:3], s[6:7]
 ; GFX950-GISEL-NEXT:    s_endpgm
-;
+; GFX942-AGPRCD-SDAG-LABEL: test_mfma_f32_16x16x32_fp8_bf8:
+; GFX942-AGPRCD-SDAG:       ; %bb.0: ; %bb
+; GFX942-AGPRCD-SDAG-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x24
+; GFX942-AGPRCD-SDAG-NEXT:    v_mov_b32_e32 v0, 2
+; GFX942-AGPRCD-SDAG-NEXT:    v_mov_b32_e32 v1, 1
+; GFX942-AGPRCD-SDAG-NEXT:    v_mov_b32_e32 v2, 4
+; GFX942-AGPRCD-SDAG-NEXT:    v_mov_b32_e32 v3, 3
+; GFX942-AGPRCD-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-AGPRCD-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX942-AGPRCD-SDAG-NEXT:    v_mov_b32_e32 v4, 0
+; GFX942-AGPRCD-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-AGPRCD-SDAG-NEXT:    v_accvgpr_write_b32 a0, s0
+; GFX942-AGPRCD-SDAG-NEXT:    v_accvgpr_write_b32 a1, s1
+; GFX942-AGPRCD-SDAG-NEXT:    v_accvgpr_write_b32 a2, s2
+; GFX942-AGPRCD-SDAG-NEXT:    v_accvgpr_write_b32 a3, s3
+; GFX942-AGPRCD-SDAG-NEXT:    s_nop 1
+; GFX942-AGPRCD-SDAG-NEXT:    v_mfma_f32_16x16x32_fp8_bf8 a[0:3], v[0:1], v[2:3], a[0:3] cbsz:1 abid:2 blgp:3
+; GFX942-AGPRCD-SDAG-NEXT:    s_nop 6
+; GFX942-AGPRCD-SDAG-NEXT:    global_store_dwordx4 v4, a[0:3], s[6:7]
+; GFX942-AGPRCD-SDAG-NEXT:    s_endpgm
 ; GFX950-AGPRCD-SDAG-LABEL: test_mfma_f32_16x16x32_fp8_bf8:
 ; GFX950-AGPRCD-SDAG:       ; %bb.0: ; %bb
 ; GFX950-AGPRCD-SDAG-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x24
@@ -718,26 +710,26 @@ bb:
 }
 
 define amdgpu_kernel void @test_mfma_f32_16x16x32_fp8_fp8(ptr addrspace(1) %arg) #0 {
-; GFX942-VGPRCD-SDAG-LABEL: test_mfma_f32_16x16x32_fp8_fp8:
-; GFX942-VGPRCD-SDAG:       ; %bb.0: ; %bb
-; GFX942-VGPRCD-SDAG-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x24
-; GFX942-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v2, 2
-; GFX942-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v3, 1
-; GFX942-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v4, 4
-; GFX942-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v5, 3
-; GFX942-VGPRCD-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX942-VGPRCD-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GFX942-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v0, 0
-; GFX942-VGPRCD-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX942-VGPRCD-SDAG-NEXT:    v_accvgpr_write_b32 a0, s0
-; GFX942-VGPRCD-SDAG-NEXT:    v_accvgpr_write_b32 a1, s1
-; GFX942-VGPRCD-SDAG-NEXT:    v_accvgpr_write_b32 a2, s2
-; GFX942-VGPRCD-SDAG-NEXT:    v_accvgpr_write_b32 a3, s3
-; GFX942-VGPRCD-SDAG-NEXT:    s_nop 1
-; GFX942-VGPRCD-SDAG-NEXT:    v_mfma_f32_16x16x32_fp8_fp8 a[0:3], v[2:3], v[4:5], a[0:3] cbsz:1 abid:2 blgp:3
-; GFX942-VGPRCD-SDAG-NEXT:    s_nop 6
-; GFX942-VGPRCD-SDAG-NEXT:    global_store_dwordx4 v0, a[0:3], s[6:7]
-; GFX942-VGPRCD-SDAG-NEXT:    s_endpgm
+; GFX942-SDAG-LABEL: test_mfma_f32_16x16x32_fp8_fp8:
+; GFX942-SDAG:       ; %bb.0: ; %bb
+; GFX942-SDAG-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x24
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v2, 2
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v3, 1
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v4, 4
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v5, 3
+; GFX942-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-SDAG-NEXT:    v_accvgpr_write_b32 a0, s0
+; GFX942-SDAG-NEXT:    v_accvgpr_write_b32 a1, s1
+; GFX942-SDAG-NEXT:    v_accvgpr_write_b32 a2, s2
+; GFX942-SDAG-NEXT:    v_accvgpr_write_b32 a3, s3
+; GFX942-SDAG-NEXT:    s_nop 1
+; GFX942-SDAG-NEXT:    v_mfma_f32_16x16x32_fp8_fp8 a[0:3], v[2:3], v[4:5], a[0:3] cbsz:1 abid:2 blgp:3
+; GFX942-SDAG-NEXT:    s_nop 6
+; GFX942-SDAG-NEXT:    global_store_dwordx4 v0, a[0:3], s[6:7]
+; GFX942-SDAG-NEXT:    s_endpgm
 ;
 ; GFX942-GISEL-LABEL: test_mfma_f32_16x16x32_fp8_fp8:
 ; GFX942-GISEL:       ; %bb.0: ; %bb
@@ -748,59 +740,38 @@ define amdgpu_kernel void @test_mfma_f32_16x16x32_fp8_fp8(ptr addrspace(1) %arg)
 ; GFX942-GISEL-NEXT:    v_mov_b32_e32 v3, 3
 ; GFX942-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX942-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GFX942-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX942-GISEL-NEXT:    v_accvgpr_write_b32 a0, s0
-; GFX942-GISEL-NEXT:    v_accvgpr_write_b32 a1, s1
-; GFX942-GISEL-NEXT:    v_accvgpr_write_b32 a2, s2
-; GFX942-GISEL-NEXT:    v_accvgpr_write_b32 a3, s3
-; GFX942-GISEL-NEXT:    s_nop 1
-; GFX942-GISEL-NEXT:    v_mfma_f32_16x16x32_fp8_fp8 a[0:3], v[0:1], v[2:3], a[0:3] cbsz:1 abid:2 blgp:3
-; GFX942-GISEL-NEXT:    v_mov_b32_e32 v0, 0
-; GFX942-GISEL-NEXT:    s_nop 5
-; GFX942-GISEL-NEXT:    global_store_dwordx4 v0, a[0:3], s[6:7]
-; GFX942-GISEL-NEXT:    s_endpgm
-;
-; GFX942-AGPRCD-SDAG-LABEL: test_mfma_f32_16x16x32_fp8_fp8:
-; GFX942-AGPRCD-SDAG:       ; %bb.0: ; %bb
-; GFX942-AGPRCD-SDAG-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x24
-; GFX942-AGPRCD-SDAG-NEXT:    v_mov_b32_e32 v0, 2
-; GFX942-AGPRCD-SDAG-NEXT:    v_mov_b32_e32 v1, 1
-; GFX942-AGPRCD-SDAG-NEXT:    v_mov_b32_e32 v2, 4
-; GFX942-AGPRCD-SDAG-NEXT:    v_mov_b32_e32 v3, 3
-; GFX942-AGPRCD-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX942-AGPRCD-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GFX942-AGPRCD-SDAG-NEXT:    v_mov_b32_e32 v4, 0
-; GFX942-AGPRCD-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX942-AGPRCD-SDAG-NEXT:    v_accvgpr_write_b32 a0, s0
-; GFX942-AGPRCD-SDAG-NEXT:    v_accvgpr_write_b32 a1, s1
-; GFX942-AGPRCD-SDAG-NEXT:    v_accvgpr_write_b32 a2, s2
-; GFX942-AGPRCD-SDAG-NEXT:    v_accvgpr_write_b32 a3, s3
-; GFX942-AGPRCD-SDAG-NEXT:    s_nop 1
-; GFX942-AGPRCD-SDAG-NEXT:    v_mfma_f32_16x16x32_fp8_fp8 a[0:3], v[0:1], v[2:3], a[0:3] cbsz:1 abid:2 blgp:3
-; GFX942-AGPRCD-SDAG-NEXT:    s_nop 6
-; GFX942-AGPRCD-SDAG-NEXT:    global_store_dwordx4 v4, a[0:3], s[6:7]
-; GFX942-AGPRCD-SDAG-NEXT:    s_endpgm
+; GFX942-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-GISEL-NEXT:    v_accvgpr_write_b32 a0, s0
+; GFX942-GISEL-NEXT:    v_accvgpr_write_b32 a1, s1
+; GFX942-GISEL-NEXT:    v_accvgpr_write_b32 a2, s2
+; GFX942-GISEL-NEXT:    v_accvgpr_write_b32 a3, s3
+; GFX942-GISEL-NEXT:    s_nop 1
+; GFX942-GISEL-NEXT:    v_mfma_f32_16x16x32_fp8_fp8 a[0:3], v[0:1], v[2:3], a[0:3] cbsz:1 abid:2 blgp:3
+; GFX942-GISEL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-GISEL-NEXT:    s_nop 5
+; GFX942-GISEL-NEXT:    global_store_dwordx4 v0, a[0:3], s[6:7]
+; GFX942-GISEL-NEXT:    s_endpgm
 ;
-; GFX950-VGPRCD-SDAG-LABEL: test_mfma_f32_16x16x32_fp8_fp8:
-; GFX950-VGPRCD-SDAG:       ; %bb.0: ; %bb
-; GFX950-VGPRCD-SDAG-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x24
-; GFX950-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v2, 2
-; GFX950-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v3, 1
-; GFX950-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v4, 4
-; GFX950-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v5, 3
-; GFX950-VGPRCD-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX950-VGPRCD-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GFX950-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v0, 0
-; GFX950-VGPRCD-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX950-VGPRCD-SDAG-NEXT:    v_accvgpr_write_b32 a0, s0
-; GFX950-VGPRCD-SDAG-NEXT:    v_accvgpr_write_b32 a1, s1
-; GFX950-VGPRCD-SDAG-NEXT:    v_accvgpr_write_b32 a2, s2
-; GFX950-VGPRCD-SDAG-NEXT:    v_accvgpr_write_b32 a3, s3
-; GFX950-VGPRCD-SDAG-NEXT:    s_nop 1
-; GFX950-VGPRCD-SDAG-NEXT:    v_mfma_f32_16x16x32_fp8_fp8 a[0:3], v[2:3], v[4:5], a[0:3] cbsz:1 abid:2 blgp:3
-; GFX950-VGPRCD-SDAG-NEXT:    s_nop 7
-; GFX950-VGPRCD-SDAG-NEXT:    global_store_dwordx4 v0, a[0:3], s[6:7]
-; GFX950-VGPRCD-SDAG-NEXT:    s_endpgm
+; GFX950-SDAG-LABEL: test_mfma_f32_16x16x32_fp8_fp8:
+; GFX950-SDAG:       ; %bb.0: ; %bb
+; GFX950-SDAG-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x24
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v2, 2
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v3, 1
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v4, 4
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v5, 3
+; GFX950-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX950-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v0, 0
+; GFX950-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX950-SDAG-NEXT:    v_accvgpr_write_b32 a0, s0
+; GFX950-SDAG-NEXT:    v_accvgpr_write_b32 a1, s1
+; GFX950-SDAG-NEXT:    v_accvgpr_write_b32 a2, s2
+; GFX950-SDAG-NEXT:    v_accvgpr_write_b32 a3, s3
+; GFX950-SDAG-NEXT:    s_nop 1
+; GFX950-SDAG-NEXT:    v_mfma_f32_16x16x32_fp8_fp8 a[0:3], v[2:3], v[4:5], a[0:3] cbsz:1 abid:2 blgp:3
+; GFX950-SDAG-NEXT:    s_nop 7
+; GFX950-SDAG-NEXT:    global_store_dwordx4 v0, a[0:3], s[6:7]
+; GFX950-SDAG-NEXT:    s_endpgm
 ;
 ; GFX950-GISEL-LABEL: test_mfma_f32_16x16x32_fp8_fp8:
 ; GFX950-GISEL:       ; %bb.0: ; %bb
@@ -822,7 +793,26 @@ define amdgpu_kernel void @test_mfma_f32_16x16x32_fp8_fp8(ptr addrspace(1) %arg)
 ; GFX950-GISEL-NEXT:    s_nop 6
 ; GFX950-GISEL-NEXT:    global_store_dwordx4 v0, a[0:3], s[6:7]
 ; GFX950-GISEL-NEXT:    s_endpgm
-;
+; GFX942-AGPRCD-SDAG-LABEL: test_mfma_f32_16x16x32_fp8_fp8:
+; GFX942-AGPRCD-SDAG:       ; %bb.0: ; %bb
+; GFX942-AGPRCD-SDAG-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x24
+; GFX942-AGPRCD-SDAG-NEXT:    v_mov_b32_e32 v0, 2
+; GFX942-AGPRCD-SDAG-NEXT:    v_mov_b32_e32 v1, 1
+; GFX942-AGPRCD-SDAG-NEXT:    v_mov_b32_e32 v2, 4
+; GFX942-AGPRCD-SDAG-NEXT:    v_mov_b32_e32 v3, 3
+; GFX942-AGPRCD-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-AGPRCD-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX942-AGPRCD-SDAG-NEXT:    v_mov_b32_e32 v4, 0
+; GFX942-AGPRCD-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-AGPRCD-SDAG-NEXT:    v_accvgpr_write_b32 a0, s0
+; GFX942-AGPRCD-SDAG-NEXT:    v_accvgpr_write_b32 a1, s1
+; GFX942-AGPRCD-SDAG-NEXT:    v_accvgpr_write_b32 a2, s2
+; GFX942-AGPRCD-SDAG-NEXT:    v_accvgpr_write_b32 a3, s3
+; GFX942-AGPRCD-SDAG-NEXT:    s_nop 1
+; GFX942-AGPRCD-SDAG-NEXT:    v_mfma_f32_16x16x32_fp8_fp8 a[0:3], v[0:1], v[2:3], a[0:3] cbsz:1 abid:2 blgp:3
+; GFX942-AGPRCD-SDAG-NEXT:    s_nop 6
+; GFX942-AGPRCD-SDAG-NEXT:    global_store_dwordx4 v4, a[0:3], s[6:7]
+; GFX942-AGPRCD-SDAG-NEXT:    s_endpgm
 ; GFX950-AGPRCD-SDAG-LABEL: test_mfma_f32_16x16x32_fp8_fp8:
 ; GFX950-AGPRCD-SDAG:       ; %bb.0: ; %bb
 ; GFX950-AGPRCD-SDAG-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x24
@@ -1471,46 +1461,85 @@ bb:
 }
 
 define amdgpu_kernel void @test_smfmac_f32_16x16x32_f16(ptr addrspace(1) %arg, <4 x half> %a, <8 x half> %b, i32 %idx) #0 {
-; GFX942-VGPRCD-SDAG-LABEL: test_smfmac_f32_16x16x32_f16:
-; GFX942-VGPRCD-SDAG:       ; %bb.0: ; %bb
-; GFX942-VGPRCD-SDAG-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x24
-; GFX942-VGPRCD-SDAG-NEXT:    s_load_dword s6, s[4:5], 0x44
-; GFX942-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v6, 0
-; GFX942-VGPRCD-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX942-VGPRCD-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
-; GFX942-VGPRCD-SDAG-NEXT:    v_mov_b64_e32 v[4:5], s[10:11]
-; GFX942-VGPRCD-SDAG-NEXT:    v_mov_b64_e32 v[0:1], s[12:13]
-; GFX942-VGPRCD-SDAG-NEXT:    v_mov_b64_e32 v[2:3], s[14:15]
-; GFX942-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v7, s6
-; GFX942-VGPRCD-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX942-VGPRCD-SDAG-NEXT:    v_mov_b64_e32 v[10:11], s[2:3]
-; GFX942-VGPRCD-SDAG-NEXT:    v_mov_b64_e32 v[8:9], s[0:1]
-; GFX942-VGPRCD-SDAG-NEXT:    s_nop 1
-; GFX942-VGPRCD-SDAG-NEXT:    v_smfmac_f32_16x16x32_f16 v[8:11], v[4:5], v[0:3], v7 cbsz:1 abid:2
-; GFX942-VGPRCD-SDAG-NEXT:    s_nop 6
-; GFX942-VGPRCD-SDAG-NEXT:    global_store_dwordx4 v6, v[8:11], s[8:9]
-; GFX942-VGPRCD-SDAG-NEXT:    s_endpgm
+; GFX942-SDAG-LABEL: test_smfmac_f32_16x16x32_f16:
+; GFX942-SDAG:       ; %bb.0: ; %bb
+; GFX942-SDAG-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x24
+; GFX942-SDAG-NEXT:    s_load_dword s6, s[4:5], 0x44
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v6, 0
+; GFX942-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GFX942-SDAG-NEXT:    v_mov_b64_e32 v[4:5], s[10:11]
+; GFX942-SDAG-NEXT:    v_mov_b64_e32 v[0:1], s[12:13]
+; GFX942-SDAG-NEXT:    v_mov_b64_e32 v[2:3], s[14:15]
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v7, s6
+; GFX942-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-SDAG-NEXT:    v_mov_b64_e32 v[10:11], s[2:3]
+; GFX942-SDAG-NEXT:    v_mov_b64_e32 v[8:9], s[0:1]
+; GFX942-SDAG-NEXT:    s_nop 1
+; GFX942-SDAG-NEXT:    v_smfmac_f32_16x16x32_f16 v[8:11], v[4:5], v[0:3], v7 cbsz:1 abid:2
+; GFX942-SDAG-NEXT:    s_nop 6
+; GFX942-SDAG-NEXT:    global_store_dwordx4 v6, v[8:11], s[8:9]
+; GFX942-SDAG-NEXT:    s_endpgm
 ;
-; GFX942-VGPRCD-GISEL-LABEL: test_smfmac_f32_16x16x32_f16:
-; GFX942-VGPRCD-GISEL:       ; %bb.0: ; %bb
-; GFX942-VGPRCD-GISEL-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x24
-; GFX942-VGPRCD-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX942-VGPRCD-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
-; GFX942-VGPRCD-GISEL-NEXT:    s_load_dword s6, s[4:5], 0x44
-; GFX942-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[4:5], s[10:11]
-; GFX942-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[0:1], s[12:13]
-; GFX942-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[2:3], s[14:15]
-; GFX942-VGPRCD-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX942-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[10:11], s[2:3]
-; GFX942-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[8:9], s[0:1]
-; GFX942-VGPRCD-GISEL-NEXT:    v_mov_b32_e32 v6, s6
-; GFX942-VGPRCD-GISEL-NEXT:    s_nop 1
-; GFX942-VGPRCD-GISEL-NEXT:    v_smfmac_f32_16x16x32_f16 v[8:11], v[4:5], v[0:3], v6 cbsz:1 abid:2
-; GFX942-VGPRCD-GISEL-NEXT:    v_mov_b32_e32 v0, 0
-; GFX942-VGPRCD-GISEL-NEXT:    s_nop 5
-; GFX942-VGPRCD-GISEL-NEXT:    global_store_dwordx4 v0, v[8:11], s[8:9]
-; GFX942-VGPRCD-GISEL-NEXT:    s_endpgm
+; GFX942-GISEL-LABEL: test_smfmac_f32_16x16x32_f16:
+; GFX942-GISEL:       ; %bb.0: ; %bb
+; GFX942-GISEL-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x24
+; GFX942-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GFX942-GISEL-NEXT:    s_load_dword s6, s[4:5], 0x44
+; GFX942-GISEL-NEXT:    v_mov_b64_e32 v[4:5], s[10:11]
+; GFX942-GISEL-NEXT:    v_mov_b64_e32 v[0:1], s[12:13]
+; GFX942-GISEL-NEXT:    v_mov_b64_e32 v[2:3], s[14:15]
+; GFX942-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-GISEL-NEXT:    v_mov_b64_e32 v[10:11], s[2:3]
+; GFX942-GISEL-NEXT:    v_mov_b64_e32 v[8:9], s[0:1]
+; GFX942-GISEL-NEXT:    v_mov_b32_e32 v6, s6
+; GFX942-GISEL-NEXT:    s_nop 1
+; GFX942-GISEL-NEXT:    v_smfmac_f32_16x16x32_f16 v[8:11], v[4:5], v[0:3], v6 cbsz:1 abid:2
+; GFX942-GISEL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-GISEL-NEXT:    s_nop 5
+; GFX942-GISEL-NEXT:    global_store_dwordx4 v0, v[8:11], s[8:9]
+; GFX942-GISEL-NEXT:    s_endpgm
+;
+; GFX950-SDAG-LABEL: test_smfmac_f32_16x16x32_f16:
+; GFX950-SDAG:       ; %bb.0: ; %bb
+; GFX950-SDAG-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x24
+; GFX950-SDAG-NEXT:    s_load_dword s6, s[4:5], 0x44
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v6, 0
+; GFX950-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX950-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GFX950-SDAG-NEXT:    v_mov_b64_e32 v[4:5], s[10:11]
+; GFX950-SDAG-NEXT:    v_mov_b64_e32 v[0:1], s[12:13]
+; GFX950-SDAG-NEXT:    v_mov_b64_e32 v[2:3], s[14:15]
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v7, s6
+; GFX950-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX950-SDAG-NEXT:    v_mov_b64_e32 v[10:11], s[2:3]
+; GFX950-SDAG-NEXT:    v_mov_b64_e32 v[8:9], s[0:1]
+; GFX950-SDAG-NEXT:    s_nop 1
+; GFX950-SDAG-NEXT:    v_smfmac_f32_16x16x32_f16 v[8:11], v[4:5], v[0:3], v7 cbsz:1 abid:2
+; GFX950-SDAG-NEXT:    s_nop 7
+; GFX950-SDAG-NEXT:    global_store_dwordx4 v6, v[8:11], s[8:9]
+; GFX950-SDAG-NEXT:    s_endpgm
 ;
+; GFX950-GISEL-LABEL: test_smfmac_f32_16x16x32_f16:
+; GFX950-GISEL:       ; %bb.0: ; %bb
+; GFX950-GISEL-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x24
+; GFX950-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX950-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GFX950-GISEL-NEXT:    s_load_dword s6, s[4:5], 0x44
+; GFX950-GISEL-NEXT:    v_mov_b64_e32 v[4:5], s[10:11]
+; GFX950-GISEL-NEXT:    v_mov_b64_e32 v[0:1], s[12:13]
+; GFX950-GISEL-NEXT:    v_mov_b64_e32 v[2:3], s[14:15]
+; GFX950-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX950-GISEL-NEXT:    v_mov_b64_e32 v[10:11], s[2:3]
+; GFX950-GISEL-NEXT:    v_mov_b64_e32 v[8:9], s[0:1]
+; GFX950-GISEL-NEXT:    v_mov_b32_e32 v6, s6
+; GFX950-GISEL-NEXT:    s_nop 1
+; GFX950-GISEL-NEXT:    v_smfmac_f32_16x16x32_f16 v[8:11], v[4:5], v[0:3], v6 cbsz:1 abid:2
+; GFX950-GISEL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX950-GISEL-NEXT:    s_nop 6
+; GFX950-GISEL-NEXT:    global_store_dwordx4 v0, v[8:11], s[8:9]
+; GFX950-GISEL-NEXT:    s_endpgm
 ; GFX942-AGPRCD-LABEL: test_smfmac_f32_16x16x32_f16:
 ; GFX942-AGPRCD:       ; %bb.0: ; %bb
 ; GFX942-AGPRCD-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x24
@@ -1533,47 +1562,6 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x32_f16(ptr addrspace(1) %arg, <
 ; GFX942-AGPRCD-NEXT:    s_nop 5
 ; GFX942-AGPRCD-NEXT:    global_store_dwordx4 v0, a[0:3], s[8:9]
 ; GFX942-AGPRCD-NEXT:    s_endpgm
-;
-; GFX950-VGPRCD-SDAG-LABEL: test_smfmac_f32_16x16x32_f16:
-; GFX950-VGPRCD-SDAG:       ; %bb.0: ; %bb
-; GFX950-VGPRCD-SDAG-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x24
-; GFX950-VGPRCD-SDAG-NEXT:    s_load_dword s6, s[4:5], 0x44
-; GFX950-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v6, 0
-; GFX950-VGPRCD-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX950-VGPRCD-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
-; GFX950-VGPRCD-SDAG-NEXT:    v_mov_b64_e32 v[4:5], s[10:11]
-; GFX950-VGPRCD-SDAG-NEXT:    v_mov_b64_e32 v[0:1], s[12:13]
-; GFX950-VGPRCD-SDAG-NEXT:    v_mov_b64_e32 v[2:3], s[14:15]
-; GFX950-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v7, s6
-; GFX950-VGPRCD-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX950-VGPRCD-SDAG-NEXT:    v_mov_b64_e32 v[10:11], s[2:3]
-; GFX950-VGPRCD-SDAG-NEXT:    v_mov_b64_e32 v[8:9], s[0:1]
-; GFX950-VGPRCD-SDAG-NEXT:    s_nop 1
-; GFX950-VGPRCD-SDAG-NEXT:    v_smfmac_f32_16x16x32_f16 v[8:11], v[4:5], v[0:3], v7 cbsz:1 abid:2
-; GFX950-VGPRCD-SDAG-NEXT:    s_nop 7
-; GFX950-VGPRCD-SDAG-NEXT:    global_store_dwordx4 v6, v[8:11], s[8:9]
-; GFX950-VGPRCD-SDAG-NEXT:    s_endpgm
-;
-; GFX950-VGPRCD-GISEL-LABEL: test_smfmac_f32_16x16x32_f16:
-; GFX950-VGPRCD-GISEL:       ; %bb.0: ; %bb
-; GFX950-VGPRCD-GISEL-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x24
-; GFX950-VGPRCD-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX950-VGPRCD-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
-; GFX950-VGPRCD-GISEL-NEXT:    s_load_dword s6, s[4:5], 0x44
-; GFX950-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[4:5], s[10:11]
-; GFX950-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[0:1], s[12:13]
-; GFX950-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[2:3], s[14:15]
-; GFX950-VGPRCD-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX950-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[10:11], s[2:3]
-; GFX950-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[8:9], s[0:1]
-; GFX950-VGPRCD-GISEL-NEXT:    v_mov_b32_e32 v6, s6
-; GFX950-VGPRCD-GISEL-NEXT:    s_nop 1
-; GFX950-VGPRCD-GISEL-NEXT:    v_smfmac_f32_16x16x32_f16 v[8:11], v[4:5], v[0:3], v6 cbsz:1 abid:2
-; GFX950-VGPRCD-GISEL-NEXT:    v_mov_b32_e32 v0, 0
-; GFX950-VGPRCD-GISEL-NEXT:    s_nop 6
-; GFX950-VGPRCD-GISEL-NEXT:    global_store_dwordx4 v0, v[8:11], s[8:9]
-; GFX950-VGPRCD-GISEL-NEXT:    s_endpgm
-;
 ; GFX950-AGPRCD-LABEL: test_smfmac_f32_16x16x32_f16:
 ; GFX950-AGPRCD:       ; %bb.0: ; %bb
 ; GFX950-AGPRCD-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x24
@@ -1604,66 +1592,125 @@ bb:
 }
 
 define amdgpu_kernel void @test_smfmac_f32_32x32x16_f16(ptr addrspace(1) %arg, <4 x half> %a, <8 x half> %b, i32 %idx) #0 {
-; GFX942-VGPRCD-SDAG-LABEL: test_smfmac_f32_32x32x16_f16:
-; GFX942-VGPRCD-SDAG:       ; %bb.0: ; %bb
-; GFX942-VGPRCD-SDAG-NEXT:    s_load_dwordx8 s[16:23], s[4:5], 0x24
-; GFX942-VGPRCD-SDAG-NEXT:    s_load_dword s24, s[4:5], 0x44
-; GFX942-VGPRCD-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX942-VGPRCD-SDAG-NEXT:    v_mov_b64_e32 v[20:21], s[18:19]
-; GFX942-VGPRCD-SDAG-NEXT:    s_load_dwordx16 s[0:15], s[16:17], 0x0
-; GFX942-VGPRCD-SDAG-NEXT:    v_mov_b64_e32 v[16:17], s[20:21]
-; GFX942-VGPRCD-SDAG-NEXT:    v_mov_b64_e32 v[18:19], s[22:23]
-; GFX942-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v22, s24
-; GFX942-VGPRCD-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX942-VGPRCD-SDAG-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX942-VGPRCD-SDAG-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
-; GFX942-VGPRCD-SDAG-NEXT:    v_mov_b64_e32 v[4:5], s[4:5]
-; GFX942-VGPRCD-SDAG-NEXT:    v_mov_b64_e32 v[6:7], s[6:7]
-; GFX942-VGPRCD-SDAG-NEXT:    v_mov_b64_e32 v[8:9], s[8:9]
-; GFX942-VGPRCD-SDAG-NEXT:    v_mov_b64_e32 v[10:11], s[10:11]
-; GFX942-VGPRCD-SDAG-NEXT:    v_mov_b64_e32 v[12:13], s[12:13]
-; GFX942-VGPRCD-SDAG-NEXT:    v_mov_b64_e32 v[14:15], s[14:15]
-; GFX942-VGPRCD-SDAG-NEXT:    s_nop 1
-; GFX942-VGPRCD-SDAG-NEXT:    v_smfmac_f32_32x32x16_f16 v[0:15], v[20:21], v[16:19], v22 cbsz:1 abid:2
-; GFX942-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v16, 0
-; GFX942-VGPRCD-SDAG-NEXT:    s_nop 7
-; GFX942-VGPRCD-SDAG-NEXT:    s_nop 1
-; GFX942-VGPRCD-SDAG-NEXT:    global_store_dwordx4 v16, v[12:15], s[16:17] offset:48
-; GFX942-VGPRCD-SDAG-NEXT:    global_store_dwordx4 v16, v[8:11], s[16:17] offset:32
-; GFX942-VGPRCD-SDAG-NEXT:    global_store_dwordx4 v16, v[4:7], s[16:17] offset:16
-; GFX942-VGPRCD-SDAG-NEXT:    global_store_dwordx4 v16, v[0:3], s[16:17]
-; GFX942-VGPRCD-SDAG-NEXT:    s_endpgm
+; GFX942-SDAG-LABEL: test_smfmac_f32_32x32x16_f16:
+; GFX942-SDAG:       ; %bb.0: ; %bb
+; GFX942-SDAG-NEXT:    s_load_dwordx8 s[16:23], s[4:5], 0x24
+; GFX942-SDAG-NEXT:    s_load_dword s24, s[4:5], 0x44
+; GFX942-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-SDAG-NEXT:    v_mov_b64_e32 v[20:21], s[18:19]
+; GFX942-SDAG-NEXT:    s_load_dwordx16 s[0:15], s[16:17], 0x0
+; GFX942-SDAG-NEXT:    v_mov_b64_e32 v[16:17], s[20:21]
+; GFX942-SDAG-NEXT:    v_mov_b64_e32 v[18:19], s[22:23]
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v22, s24
+; GFX942-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-SDAG-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
+; GFX942-SDAG-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
+; GFX942-SDAG-NEXT:    v_mov_b64_e32 v[4:5], s[4:5]
+; GFX942-SDAG-NEXT:    v_mov_b64_e32 v[6:7], s[6:7]
+; GFX942-SDAG-NEXT:    v_mov_b64_e32 v[8:9], s[8:9]
+; GFX942-SDAG-NEXT:    v_mov_b64_e32 v[10:11], s[10:11]
+; GFX942-SDAG-NEXT:    v_mov_b64_e32 v[12:13], s[12:13]
+; GFX942-SDAG-NEXT:    v_mov_b64_e32 v[14:15], s[14:15]
+; GFX942-SDAG-NEXT:    s_nop 1
+; GFX942-SDAG-NEXT:    v_smfmac_f32_32x32x16_f16 v[0:15], v[20:21], v[16:19], v22 cbsz:1 abid:2
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v16, 0
+; GFX942-SDAG-NEXT:    s_nop 7
+; GFX942-SDAG-NEXT:    s_nop 1
+; GFX942-SDAG-NEXT:    global_store_dwordx4 v16, v[12:15], s[16:17] offset:48
+; GFX942-SDAG-NEXT:    global_store_dwordx4 v16, v[8:11], s[16:17] offset:32
+; GFX942-SDAG-NEXT:    global_store_dwordx4 v16, v[4:7], s[16:17] offset:16
+; GFX942-SDAG-NEXT:    global_store_dwordx4 v16, v[0:3], s[16:17]
+; GFX942-SDAG-NEXT:    s_endpgm
 ;
-; GFX942-VGPRCD-GISEL-LABEL: test_smfmac_f32_32x32x16_f16:
-; GFX942-VGPRCD-GISEL:       ; %bb.0: ; %bb
-; GFX942-VGPRCD-GISEL-NEXT:    s_load_dwordx8 s[16:23], s[4:5], 0x24
-; GFX942-VGPRCD-GISEL-NEXT:    s_load_dword s24, s[4:5], 0x44
-; GFX942-VGPRCD-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX942-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[20:21], s[18:19]
-; GFX942-VGPRCD-GISEL-NEXT:    s_load_dwordx16 s[0:15], s[16:17], 0x0
-; GFX942-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[16:17], s[20:21]
-; GFX942-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[18:19], s[22:23]
-; GFX942-VGPRCD-GISEL-NEXT:    v_mov_b32_e32 v22, s24
-; GFX942-VGPRCD-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX942-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX942-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
-; GFX942-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[4:5], s[4:5]
-; GFX942-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[6:7], s[6:7]
-; GFX942-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[8:9], s[8:9]
-; GFX942-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[10:11], s[10:11]
-; GFX942-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[12:13], s[12:13]
-; GFX942-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[14:15], s[14:15]
-; GFX942-VGPRCD-GISEL-NEXT:    s_nop 1
-; GFX942-VGPRCD-GISEL-NEXT:    v_smfmac_f32_32x32x16_f16 v[0:15], v[20:21], v[16:19], v22 cbsz:1 abid:2
-; GFX942-VGPRCD-GISEL-NEXT:    v_mov_b32_e32 v16, 0
-; GFX942-VGPRCD-GISEL-NEXT:    s_nop 7
-; GFX942-VGPRCD-GISEL-NEXT:    s_nop 1
-; GFX942-VGPRCD-GISEL-NEXT:    global_store_dwordx4 v16, v[0:3], s[16:17]
-; GFX942-VGPRCD-GISEL-NEXT:    global_store_dwordx4 v16, v[4:7], s[16:17] offset:16
-; GFX942-VGPRCD-GISEL-NEXT:    global_store_dwordx4 v16, v[8:11], s[16:17] offset:32
-; GFX942-VGPRCD-GISEL-NEXT:    global_store_dwordx4 v16, v[12:15], s[16:17] offset:48
-; GFX942-VGPRCD-GISEL-NEXT:    s_endpgm
+; GFX942-GISEL-LABEL: test_smfmac_f32_32x32x16_f16:
+; GFX942-GISEL:       ; %bb.0: ; %bb
+; GFX942-GISEL-NEXT:    s_load_dwordx8 s[16:23], s[4:5], 0x24
+; GFX942-GISEL-NEXT:    s_load_dword s24, s[4:5], 0x44
+; GFX942-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-GISEL-NEXT:    v_mov_b64_e32 v[20:21], s[18:19]
+; GFX942-GISEL-NEXT:    s_load_dwordx16 s[0:15], s[16:17], 0x0
+; GFX942-GISEL-NEXT:    v_mov_b64_e32 v[16:17], s[20:21]
+; GFX942-GISEL-NEXT:    v_mov_b64_e32 v[18:19], s[22:23]
+; GFX942-GISEL-NEXT:    v_mov_b32_e32 v22, s24
+; GFX942-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-GISEL-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
+; GFX942-GISEL-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
+; GFX942-GISEL-NEXT:    v_mov_b64_e32 v[4:5], s[4:5]
+; GFX942-GISEL-NEXT:    v_mov_b64_e32 v[6:7], s[6:7]
+; GFX942-GISEL-NEXT:    v_mov_b64_e32 v[8:9], s[8:9]
+; GFX942-GISEL-NEXT:    v_mov_b64_e32 v[10:11], s[10:11]
+; GFX942-GISEL-NEXT:    v_mov_b64_e32 v[12:13], s[12:13]
+; GFX942-GISEL-NEXT:    v_mov_b64_e32 v[14:15], s[14:15]
+; GFX942-GISEL-NEXT:    s_nop 1
+; GFX942-GISEL-NEXT:    v_smfmac_f32_32x32x16_f16 v[0:15], v[20:21], v[16:19], v22 cbsz:1 abid:2
+; GFX942-GISEL-NEXT:    v_mov_b32_e32 v16, 0
+; GFX942-GISEL-NEXT:    s_nop 7
+; GFX942-GISEL-NEXT:    s_nop 1
+; GFX942-GISEL-NEXT:    global_store_dwordx4 v16, v[0:3], s[16:17]
+; GFX942-GISEL-NEXT:    global_store_dwordx4 v16, v[4:7], s[16:17] offset:16
+; GFX942-GISEL-NEXT:    global_store_dwordx4 v16, v[8:11], s[16:17] offset:32
+; GFX942-GISEL-NEXT:    global_store_dwordx4 v16, v[12:15], s[16:17] offset:48
+; GFX942-GISEL-NEXT:    s_endpgm
+;
+; GFX950-SDAG-LABEL: test_smfmac_f32_32x32x16_f16:
+; GFX950-SDAG:       ; %bb.0: ; %bb
+; GFX950-SDAG-NEXT:    s_load_dwordx8 s[16:23], s[4:5], 0x24
+; GFX950-SDAG-NEXT:    s_load_dword s24, s[4:5], 0x44
+; GFX950-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX950-SDAG-NEXT:    v_mov_b64_e32 v[20:21], s[18:19]
+; GFX950-SDAG-NEXT:    s_load_dwordx16 s[0:15], s[16:17], 0x0
+; GFX950-SDAG-NEXT:    v_mov_b64_e32 v[16:17], s[20:21]
+; GFX950-SDAG-NEXT:    v_mov_b64_e32 v[18:19], s[22:23]
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v22, s24
+; GFX950-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX950-SDAG-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
+; GFX950-SDAG-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
+; GFX950-SDAG-NEXT:    v_mov_b64_e32 v[4:5], s[4:5]
+; GFX950-SDAG-NEXT:    v_mov_b64_e32 v[6:7], s[6:7]
+; GFX950-SDAG-NEXT:    v_mov_b64_e32 v[8:9], s[8:9]
+; GFX950-SDAG-NEXT:    v_mov_b64_e32 v[10:11], s[10:11]
+; GFX950-SDAG-NEXT:    v_mov_b64_e32 v[12:13], s[12:13]
+; GFX950-SDAG-NEXT:    v_mov_b64_e32 v[14:15], s[14:15]
+; GFX950-SDAG-NEXT:    s_nop 1
+; GFX950-SDAG-NEXT:    v_smfmac_f32_32x32x16_f16 v[0:15], v[20:21], v[16:19], v22 cbsz:1 abid:2
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v16, 0
+; GFX950-SDAG-NEXT:    s_nop 7
+; GFX950-SDAG-NEXT:    s_nop 2
+; GFX950-SDAG-NEXT:    global_store_dwordx4 v16, v[12:15], s[16:17] offset:48
+; GFX950-SDAG-NEXT:    global_store_dwordx4 v16, v[8:11], s[16:17] offset:32
+; GFX950-SDAG-NEXT:    global_store_dwordx4 v16, v[4:7], s[16:17] offset:16
+; GFX950-SDAG-NEXT:    global_store_dwordx4 v16, v[0:3], s[16:17]
+; GFX950-SDAG-NEXT:    s_endpgm
 ;
+; GFX950-GISEL-LABEL: test_smfmac_f32_32x32x16_f16:
+; GFX950-GISEL:       ; %bb.0: ; %bb
+; GFX950-GISEL-NEXT:    s_load_dwordx8 s[16:23], s[4:5], 0x24
+; GFX950-GISEL-NEXT:    s_load_dword s24, s[4:5], 0x44
+; GFX950-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX950-GISEL-NEXT:    v_mov_b64_e32 v[20:21], s[18:19]
+; GFX950-GISEL-NEXT:    s_load_dwordx16 s[0:15], s[16:17], 0x0
+; GFX950-GISEL-NEXT:    v_mov_b64_e32 v[16:17], s[20:21]
+; GFX950-GISEL-NEXT:    v_mov_b64_e32 v[18:19], s[22:23]
+; GFX950-GISEL-NEXT:    v_mov_b32_e32 v22, s24
+; GFX950-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX950-GISEL-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
+; GFX950-GISEL-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
+; GFX950-GISEL-NEXT:    v_mov_b64_e32 v[4:5], s[4:5]
+; GFX950-GISEL-NEXT:    v_mov_b64_e32 v[6:7], s[6:7]
+; GFX950-GISEL-NEXT:    v_mov_b64_e32 v[8:9], s[8:9]
+; GFX950-GISEL-NEXT:    v_mov_b64_e32 v[10:11], s[10:11]
+; GFX950-GISEL-NEXT:    v_mov_b64_e32 v[12:13], s[12:13]
+; GFX950-GISEL-NEXT:    v_mov_b64_e32 v[14:15], s[14:15]
+; GFX950-GISEL-NEXT:    s_nop 1
+; GFX950-GISEL-NEXT:    v_smfmac_f32_32x32x16_f16 v[0:15], v[20:21], v[16:19], v22 cbsz:1 abid:2
+; GFX950-GISEL-NEXT:    v_mov_b32_e32 v16, 0
+; GFX950-GISEL-NEXT:    s_nop 7
+; GFX950-GISEL-NEXT:    s_nop 2
+; GFX950-GISEL-NEXT:    global_store_dwordx4 v16, v[0:3], s[16:17]
+; GFX950-GISEL-NEXT:    global_store_dwordx4 v16, v[4:7], s[16:17] offset:16
+; GFX950-GISEL-NEXT:    global_store_dwordx4 v16, v[8:11], s[16:17] offset:32
+; GFX950-GISEL-NEXT:    global_store_dwordx4 v16, v[12:15], s[16:17] offset:48
+; GFX950-GISEL-NEXT:    s_endpgm
 ; GFX942-AGPRCD-SDAG-LABEL: test_smfmac_f32_32x32x16_f16:
 ; GFX942-AGPRCD-SDAG:       ; %bb.0: ; %bb
 ; GFX942-AGPRCD-SDAG-NEXT:    s_load_dwordx8 s[24:31], s[4:5], 0x24
@@ -1701,7 +1748,6 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x16_f16(ptr addrspace(1) %arg, <
 ; GFX942-AGPRCD-SDAG-NEXT:    global_store_dwordx4 v0, a[4:7], s[24:25] offset:16
 ; GFX942-AGPRCD-SDAG-NEXT:    global_store_dwordx4 v0, a[0:3], s[24:25]
 ; GFX942-AGPRCD-SDAG-NEXT:    s_endpgm
-;
 ; GFX942-AGPRCD-GISEL-LABEL: test_smfmac_f32_32x32x16_f16:
 ; GFX942-AGPRCD-GISEL:       ; %bb.0: ; %bb
 ; GFX942-AGPRCD-GISEL-NEXT:    s_load_dwordx8 s[24:31], s[4:5], 0x24
@@ -1739,67 +1785,6 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x16_f16(ptr addrspace(1) %arg, <
 ; GFX942-AGPRCD-GISEL-NEXT:    global_store_dwordx4 v0, a[8:11], s[24:25] offset:32
 ; GFX942-AGPRCD-GISEL-NEXT:    global_store_dwordx4 v0, a[12:15], s[24:25] offset:48
 ; GFX942-AGPRCD-GISEL-NEXT:    s_endpgm
-;
-; GFX950-VGPRCD-SDAG-LABEL: test_smfmac_f32_32x32x16_f16:
-; GFX950-VGPRCD-SDAG:       ; %bb.0: ; %bb
-; GFX950-VGPRCD-SDAG-NEXT:    s_load_dwordx8 s[16:23], s[4:5], 0x24
-; GFX950-VGPRCD-SDAG-NEXT:    s_load_dword s24, s[4:5], 0x44
-; GFX950-VGPRCD-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX950-VGPRCD-SDAG-NEXT:    v_mov_b64_e32 v[20:21], s[18:19]
-; GFX950-VGPRCD-SDAG-NEXT:    s_load_dwordx16 s[0:15], s[16:17], 0x0
-; GFX950-VGPRCD-SDAG-NEXT:    v_mov_b64_e32 v[16:17], s[20:21]
-; GFX950-VGPRCD-SDAG-NEXT:    v_mov_b64_e32 v[18:19], s[22:23]
-; GFX950-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v22, s24
-; GFX950-VGPRCD-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX950-VGPRCD-SDAG-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX950-VGPRCD-SDAG-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
-; GFX950-VGPRCD-SDAG-NEXT:    v_mov_b64_e32 v[4:5], s[4:5]
-; GFX950-VGPRCD-SDAG-NEXT:    v_mov_b64_e32 v[6:7], s[6:7]
-; GFX950-VGPRCD-SDAG-NEXT:    v_mov_b64_e32 v[8:9], s[8:9]
-; GFX950-VGPRCD-SDAG-NEXT:    v_mov_b64_e32 v[10:11], s[10:11]
-; GFX950-VGPRCD-SDAG-NEXT:    v_mov_b64_e32 v[12:13], s[12:13]
-; GFX950-VGPRCD-SDAG-NEXT:    v_mov_b64_e32 v[14:15], s[14:15]
-; GFX950-VGPRCD-SDAG-NEXT:    s_nop 1
-; GFX950-VGPRCD-SDAG-NEXT:    v_smfmac_f32_32x32x16_f16 v[0:15], v[20:21], v[16:19], v22 cbsz:1 abid:2
-; GFX950-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v16, 0
-; GFX950-VGPRCD-SDAG-NEXT:    s_nop 7
-; GFX950-VGPRCD-SDAG-NEXT:    s_nop 2
-; GFX950-VGPRCD-SDAG-NEXT:    global_store_dwordx4 v16, v[12:15], s[16:17] offset:48
-; GFX950-VGPRCD-SDAG-NEXT:    global_store_dwordx4 v16, v[8:11], s[16:17] offset:32
-; GFX950-VGPRCD-SDAG-NEXT:    global_store_dwordx4 v16, v[4:7], s[16:17] offset:16
-; GFX950-VGPRCD-SDAG-NEXT:    global_store_dwordx4 v16, v[0:3], s[16:17]
-; GFX950-VGPRCD-SDAG-NEXT:    s_endpgm
-;
-; GFX950-VGPRCD-GISEL-LABEL: test_smfmac_f32_32x32x16_f16:
-; GFX950-VGPRCD-GISEL:       ; %bb.0: ; %bb
-; GFX950-VGPRCD-GISEL-NEXT:    s_load_dwordx8 s[16:23], s[4:5], 0x24
-; GFX950-VGPRCD-GISEL-NEXT:    s_load_dword s24, s[4:5], 0x44
-; GFX950-VGPRCD-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX950-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[20:21], s[18:19]
-; GFX950-VGPRCD-GISEL-NEXT:    s_load_dwordx16 s[0:15], s[16:17], 0x0
-; GFX950-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[16:17], s[20:21]
-; GFX950-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[18:19], s[22:23]
-; GFX950-VGPRCD-GISEL-NEXT:    v_mov_b32_e32 v22, s24
-; GFX950-VGPRCD-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX950-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX950-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
-; GFX950-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[4:5], s[4:5]
-; GFX950-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[6:7], s[6:7]
-; GFX950-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[8:9], s[8:9]
-; GFX950-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[10:11], s[10:11]
-; GFX950-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[12:13], s[12:13]
-; GFX950-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[14:15], s[14:15]
-; GFX950-VGPRCD-GISEL-NEXT:    s_nop 1
-; GFX950-VGPRCD-GISEL-NEXT:    v_smfmac_f32_32x32x16_f16 v[0:15], v[20:21], v[16:19], v22 cbsz:1 abid:2
-; GFX950-VGPRCD-GISEL-NEXT:    v_mov_b32_e32 v16, 0
-; GFX950-VGPRCD-GISEL-NEXT:    s_nop 7
-; GFX950-VGPRCD-GISEL-NEXT:    s_nop 2
-; GFX950-VGPRCD-GISEL-NEXT:    global_store_dwordx4 v16, v[0:3], s[16:17]
-; GFX950-VGPRCD-GISEL-NEXT:    global_store_dwordx4 v16, v[4:7], s[16:17] offset:16
-; GFX950-VGPRCD-GISEL-NEXT:    global_store_dwordx4 v16, v[8:11], s[16:17] offset:32
-; GFX950-VGPRCD-GISEL-NEXT:    global_store_dwordx4 v16, v[12:15], s[16:17] offset:48
-; GFX950-VGPRCD-GISEL-NEXT:    s_endpgm
-;
 ; GFX950-AGPRCD-SDAG-LABEL: test_smfmac_f32_32x32x16_f16:
 ; GFX950-AGPRCD-SDAG:       ; %bb.0: ; %bb
 ; GFX950-AGPRCD-SDAG-NEXT:    s_load_dwordx8 s[24:31], s[4:5], 0x24
@@ -1837,7 +1822,6 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x16_f16(ptr addrspace(1) %arg, <
 ; GFX950-AGPRCD-SDAG-NEXT:    global_store_dwordx4 v0, a[4:7], s[24:25] offset:16
 ; GFX950-AGPRCD-SDAG-NEXT:    global_store_dwordx4 v0, a[0:3], s[24:25]
 ; GFX950-AGPRCD-SDAG-NEXT:    s_endpgm
-;
 ; GFX950-AGPRCD-GISEL-LABEL: test_smfmac_f32_32x32x16_f16:
 ; GFX950-AGPRCD-GISEL:       ; %bb.0: ; %bb
 ; GFX950-AGPRCD-GISEL-NEXT:    s_load_dwordx8 s[24:31], s[4:5], 0x24
@@ -1883,46 +1867,85 @@ bb:
 }
 
 define amdgpu_kernel void @test_smfmac_f32_16x16x32_bf16(ptr addrspace(1) %arg, <4 x i16> %a, <8 x i16> %b, i32 %idx) #0 {
-; GFX942-VGPRCD-SDAG-LABEL: test_smfmac_f32_16x16x32_bf16:
-; GFX942-VGPRCD-SDAG:       ; %bb.0: ; %bb
-; GFX942-VGPRCD-SDAG-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x24
-; GFX942-VGPRCD-SDAG-NEXT:    s_load_dword s6, s[4:5], 0x44
-; GFX942-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v6, 0
-; GFX942-VGPRCD-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX942-VGPRCD-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
-; GFX942-VGPRCD-SDAG-NEXT:    v_mov_b64_e32 v[4:5], s[10:11]
-; GFX942-VGPRCD-SDAG-NEXT:    v_mov_b64_e32 v[0:1], s[12:13]
-; GFX942-VGPRCD-SDAG-NEXT:    v_mov_b64_e32 v[2:3], s[14:15]
-; GFX942-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v7, s6
-; GFX942-VGPRCD-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX942-VGPRCD-SDAG-NEXT:    v_mov_b64_e32 v[10:11], s[2:3]
-; GFX942-VGPRCD-SDAG-NEXT:    v_mov_b64_e32 v[8:9], s[0:1]
-; GFX942-VGPRCD-SDAG-NEXT:    s_nop 1
-; GFX942-VGPRCD-SDAG-NEXT:    v_smfmac_f32_16x16x32_bf16 v[8:11], v[4:5], v[0:3], v7 cbsz:1 abid:2
-; GFX942-VGPRCD-SDAG-NEXT:    s_nop 6
-; GFX942-VGPRCD-SDAG-NEXT:    global_store_dwordx4 v6, v[8:11], s[8:9]
-; GFX942-VGPRCD-SDAG-NEXT:    s_endpgm
+; GFX942-SDAG-LABEL: test_smfmac_f32_16x16x32_bf16:
+; GFX942-SDAG:       ; %bb.0: ; %bb
+; GFX942-SDAG-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x24
+; GFX942-SDAG-NEXT:    s_load_dword s6, s[4:5], 0x44
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v6, 0
+; GFX942-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GFX942-SDAG-NEXT:    v_mov_b64_e32 v[4:5], s[10:11]
+; GFX942-SDAG-NEXT:    v_mov_b64_e32 v[0:1], s[12:13]
+; GFX942-SDAG-NEXT:    v_mov_b64_e32 v[2:3], s[14:15]
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v7, s6
+; GFX942-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-SDAG-NEXT:    v_mov_b64_e32 v[10:11], s[2:3]
+; GFX942-SDAG-NEXT:    v_mov_b64_e32 v[8:9], s[0:1]
+; GFX942-SDAG-NEXT:    s_nop 1
+; GFX942-SDAG-NEXT:    v_smfmac_f32_16x16x32_bf16 v[8:11], v[4:5], v[0:3], v7 cbsz:1 abid:2
+; GFX942-SDAG-NEXT:    s_nop 6
+; GFX942-SDAG-NEXT:    global_store_dwordx4 v6, v[8:11], s[8:9]
+; GFX942-SDAG-NEXT:    s_endpgm
+;
+; GFX942-GISEL-LABEL: test_smfmac_f32_16x16x32_bf16:
+; GFX942-GISEL:       ; %bb.0: ; %bb
+; GFX942-GISEL-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x24
+; GFX942-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GFX942-GISEL-NEXT:    s_load_dword s6, s[4:5], 0x44
+; GFX942-GISEL-NEXT:    v_mov_b64_e32 v[4:5], s[10:11]
+; GFX942-GISEL-NEXT:    v_mov_b64_e32 v[0:1], s[12:13]
+; GFX942-GISEL-NEXT:    v_mov_b64_e32 v[2:3], s[14:15]
+; GFX942-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-GISEL-NEXT:    v_mov_b64_e32 v[10:11], s[2:3]
+; GFX942-GISEL-NEXT:    v_mov_b64_e32 v[8:9], s[0:1]
+; GFX942-GISEL-NEXT:    v_mov_b32_e32 v6, s6
+; GFX942-GISEL-NEXT:    s_nop 1
+; GFX942-GISEL-NEXT:    v_smfmac_f32_16x16x32_bf16 v[8:11], v[4:5], v[0:3], v6 cbsz:1 abid:2
+; GFX942-GISEL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-GISEL-NEXT:    s_nop 5
+; GFX942-GISEL-NEXT:    global_store_dwordx4 v0, v[8:11], s[8:9]
+; GFX942-GISEL-NEXT:    s_endpgm
 ;
-; GFX942-VGPRCD-GISEL-LABEL: test_smfmac_f32_16x16x32_bf16:
-; GFX942-VGPRCD-GISEL:       ; %bb.0: ; %bb
-; GFX942-VGPRCD-GISEL-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x24
-; GFX942-VGPRCD-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX942-VGPRCD-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
-; GFX942-VGPRCD-GISEL-NEXT:    s_load_dword s6, s[4:5], 0x44
-; GFX942-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[4:5], s[10:11]
-; GFX942-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[0:1], s[12:13]
-; GFX942-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[2:3], s[14:15]
-; GFX942-VGPRCD-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX942-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[10:11], s[2:3]
-; GFX942-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[8:9], s[0:1]
-; GFX942-VGPRCD-GISEL-NEXT:    v_mov_b32_e32 v6, s6
-; GFX942-VGPRCD-GISEL-NEXT:    s_nop 1
-; GFX942-VGPRCD-GISEL-NEXT:    v_smfmac_f32_16x16x32_bf16 v[8:11], v[4:5], v[0:3], v6 cbsz:1 abid:2
-; GFX942-VGPRCD-GISEL-NEXT:    v_mov_b32_e32 v0, 0
-; GFX942-VGPRCD-GISEL-NEXT:    s_nop 5
-; GFX942-VGPRCD-GISEL-NEXT:    global_store_dwordx4 v0, v[8:11], s[8:9]
-; GFX942-VGPRCD-GISEL-NEXT:    s_endpgm
+; GFX950-SDAG-LABEL: test_smfmac_f32_16x16x32_bf16:
+; GFX950-SDAG:       ; %bb.0: ; %bb
+; GFX950-SDAG-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x24
+; GFX950-SDAG-NEXT:    s_load_dword s6, s[4:5], 0x44
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v6, 0
+; GFX950-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX950-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GFX950-SDAG-NEXT:    v_mov_b64_e32 v[4:5], s[10:11]
+; GFX950-SDAG-NEXT:    v_mov_b64_e32 v[0:1], s[12:13]
+; GFX950-SDAG-NEXT:    v_mov_b64_e32 v[2:3], s[14:15]
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v7, s6
+; GFX950-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX950-SDAG-NEXT:    v_mov_b64_e32 v[10:11], s[2:3]
+; GFX950-SDAG-NEXT:    v_mov_b64_e32 v[8:9], s[0:1]
+; GFX950-SDAG-NEXT:    s_nop 1
+; GFX950-SDAG-NEXT:    v_smfmac_f32_16x16x32_bf16 v[8:11], v[4:5], v[0:3], v7 cbsz:1 abid:2
+; GFX950-SDAG-NEXT:    s_nop 7
+; GFX950-SDAG-NEXT:    global_store_dwordx4 v6, v[8:11], s[8:9]
+; GFX950-SDAG-NEXT:    s_endpgm
 ;
+; GFX950-GISEL-LABEL: test_smfmac_f32_16x16x32_bf16:
+; GFX950-GISEL:       ; %bb.0: ; %bb
+; GFX950-GISEL-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x24
+; GFX950-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX950-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GFX950-GISEL-NEXT:    s_load_dword s6, s[4:5], 0x44
+; GFX950-GISEL-NEXT:    v_mov_b64_e32 v[4:5], s[10:11]
+; GFX950-GISEL-NEXT:    v_mov_b64_e32 v[0:1], s[12:13]
+; GFX950-GISEL-NEXT:    v_mov_b64_e32 v[2:3], s[14:15]
+; GFX950-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX950-GISEL-NEXT:    v_mov_b64_e32 v[10:11], s[2:3]
+; GFX950-GISEL-NEXT:    v_mov_b64_e32 v[8:9], s[0:1]
+; GFX950-GISEL-NEXT:    v_mov_b32_e32 v6, s6
+; GFX950-GISEL-NEXT:    s_nop 1
+; GFX950-GISEL-NEXT:    v_smfmac_f32_16x16x32_bf16 v[8:11], v[4:5], v[0:3], v6 cbsz:1 abid:2
+; GFX950-GISEL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX950-GISEL-NEXT:    s_nop 6
+; GFX950-GISEL-NEXT:    global_store_dwordx4 v0, v[8:11], s[8:9]
+; GFX950-GISEL-NEXT:    s_endpgm
 ; GFX942-AGPRCD-LABEL: test_smfmac_f32_16x16x32_bf16:
 ; GFX942-AGPRCD:       ; %bb.0: ; %bb
 ; GFX942-AGPRCD-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x24
@@ -1945,47 +1968,6 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x32_bf16(ptr addrspace(1) %arg,
 ; GFX942-AGPRCD-NEXT:    s_nop 5
 ; GFX942-AGPRCD-NEXT:    global_store_dwordx4 v0, a[0:3], s[8:9]
 ; GFX942-AGPRCD-NEXT:    s_endpgm
-;
-; GFX950-VGPRCD-SDAG-LABEL: test_smfmac_f32_16x16x32_bf16:
-; GFX950-VGPRCD-SDAG:       ; %bb.0: ; %bb
-; GFX950-VGPRCD-SDAG-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x24
-; GFX950-VGPRCD-SDAG-NEXT:    s_load_dword s6, s[4:5], 0x44
-; GFX950-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v6, 0
-; GFX950-VGPRCD-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX950-VGPRCD-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
-; GFX950-VGPRCD-SDAG-NEXT:    v_mov_b64_e32 v[4:5], s[10:11]
-; GFX950-VGPRCD-SDAG-NEXT:    v_mov_b64_e32 v[0:1], s[12:13]
-; GFX950-VGPRCD-SDAG-NEXT:    v_mov_b64_e32 v[2:3], s[14:15]
-; GFX950-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v7, s6
-; GFX950-VGPRCD-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX950-VGPRCD-SDAG-NEXT:    v_mov_b64_e32 v[10:11], s[2:3]
-; GFX950-VGPRCD-SDAG-NEXT:    v_mov_b64_e32 v[8:9], s[0:1]
-; GFX950-VGPRCD-SDAG-NEXT:    s_nop 1
-; GFX950-VGPRCD-SDAG-NEXT:    v_smfmac_f32_16x16x32_bf16 v[8:11], v[4:5], v[0:3], v7 cbsz:1 abid:2
-; GFX950-VGPRCD-SDAG-NEXT:    s_nop 7
-; GFX950-VGPRCD-SDAG-NEXT:    global_store_dwordx4 v6, v[8:11], s[8:9]
-; GFX950-VGPRCD-SDAG-NEXT:    s_endpgm
-;
-; GFX950-VGPRCD-GISEL-LABEL: test_smfmac_f32_16x16x32_bf16:
-; GFX950-VGPRCD-GISEL:       ; %bb.0: ; %bb
-; GFX950-VGPRCD-GISEL-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x24
-; GFX950-VGPRCD-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX950-VGPRCD-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
-; GFX950-VGPRCD-GISEL-NEXT:    s_load_dword s6, s[4:5], 0x44
-; GFX950-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[4:5], s[10:11]
-; GFX950-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[0:1], s[12:13]
-; GFX950-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[2:3], s[14:15]
-; GFX950-VGPRCD-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX950-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[10:11], s[2:3]
-; GFX950-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[8:9], s[0:1]
-; GFX950-VGPRCD-GISEL-NEXT:    v_mov_b32_e32 v6, s6
-; GFX950-VGPRCD-GISEL-NEXT:    s_nop 1
-; GFX950-VGPRCD-GISEL-NEXT:    v_smfmac_f32_16x16x32_bf16 v[8:11], v[4:5], v[0:3], v6 cbsz:1 abid:2
-; GFX950-VGPRCD-GISEL-NEXT:    v_mov_b32_e32 v0, 0
-; GFX950-VGPRCD-GISEL-NEXT:    s_nop 6
-; GFX950-VGPRCD-GISEL-NEXT:    global_store_dwordx4 v0, v[8:11], s[8:9]
-; GFX950-VGPRCD-GISEL-NEXT:    s_endpgm
-;
 ; GFX950-AGPRCD-LABEL: test_smfmac_f32_16x16x32_bf16:
 ; GFX950-AGPRCD:       ; %bb.0: ; %bb
 ; GFX950-AGPRCD-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x24
@@ -2016,66 +1998,125 @@ bb:
 }
 
 define amdgpu_kernel void @test_smfmac_f32_32x32x16_bf16(ptr addrspace(1) %arg, <4 x i16> %a, <8 x i16> %b, i32 %idx) #0 {
-; GFX942-VGPRCD-SDAG-LABEL: test_smfmac_f32_32x32x16_bf16:
-; GFX942-VGPRCD-SDAG:       ; %bb.0: ; %bb
-; GFX942-VGPRCD-SDAG-NEXT:    s_load_dwordx8 s[16:23], s[4:5], 0x24
-; GFX942-VGPRCD-SDAG-NEXT:    s_load_dword s24, s[4:5], 0x44
-; GFX942-VGPRCD-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX942-VGPRCD-SDAG-NEXT:    v_mov_b64_e32 v[20:21], s[18:19]
-; GFX942-VGPRCD-SDAG-NEXT:    s_load_dwordx16 s[0:15], s[16:17], 0x0
-; GFX942-VGPRCD-SDAG-NEXT:    v_mov_b64_e32 v[16:17], s[20:21]
-; GFX942-VGPRCD-SDAG-NEXT:    v_mov_b64_e32 v[18:19], s[22:23]
-; GFX942-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v22, s24
-; GFX942-VGPRCD-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX942-VGPRCD-SDAG-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX942-VGPRCD-SDAG-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
-; GFX942-VGPRCD-SDAG-NEXT:    v_mov_b64_e32 v[4:5], s[4:5]
-; GFX942-VGPRCD-SDAG-NEXT:    v_mov_b64_e32 v[6:7], s[6:7]
-; GFX942-VGPRCD-SDAG-NEXT:    v_mov_b64_e32 v[8:9], s[8:9]
-; GFX942-VGPRCD-SDAG-NEXT:    v_mov_b64_e32 v[10:11], s[10:11]
-; GFX942-VGPRCD-SDAG-NEXT:    v_mov_b64_e32 v[12:13], s[12:13]
-; GFX942-VGPRCD-SDAG-NEXT:    v_mov_b64_e32 v[14:15], s[14:15]
-; GFX942-VGPRCD-SDAG-NEXT:    s_nop 1
-; GFX942-VGPRCD-SDAG-NEXT:    v_smfmac_f32_32x32x16_bf16 v[0:15], v[20:21], v[16:19], v22 cbsz:1 abid:2
-; GFX942-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v16, 0
-; GFX942-VGPRCD-SDAG-NEXT:    s_nop 7
-; GFX942-VGPRCD-SDAG-NEXT:    s_nop 1
-; GFX942-VGPRCD-SDAG-NEXT:    global_store_dwordx4 v16, v[12:15], s[16:17] offset:48
-; GFX942-VGPRCD-SDAG-NEXT:    global_store_dwordx4 v16, v[8:11], s[16:17] offset:32
-; GFX942-VGPRCD-SDAG-NEXT:    global_store_dwordx4 v16, v[4:7], s[16:17] offset:16
-; GFX942-VGPRCD-SDAG-NEXT:    global_store_dwordx4 v16, v[0:3], s[16:17]
-; GFX942-VGPRCD-SDAG-NEXT:    s_endpgm
+; GFX942-SDAG-LABEL: test_smfmac_f32_32x32x16_bf16:
+; GFX942-SDAG:       ; %bb.0: ; %bb
+; GFX942-SDAG-NEXT:    s_load_dwordx8 s[16:23], s[4:5], 0x24
+; GFX942-SDAG-NEXT:    s_load_dword s24, s[4:5], 0x44
+; GFX942-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-SDAG-NEXT:    v_mov_b64_e32 v[20:21], s[18:19]
+; GFX942-SDAG-NEXT:    s_load_dwordx16 s[0:15], s[16:17], 0x0
+; GFX942-SDAG-NEXT:    v_mov_b64_e32 v[16:17], s[20:21]
+; GFX942-SDAG-NEXT:    v_mov_b64_e32 v[18:19], s[22:23]
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v22, s24
+; GFX942-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-SDAG-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
+; GFX942-SDAG-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
+; GFX942-SDAG-NEXT:    v_mov_b64_e32 v[4:5], s[4:5]
+; GFX942-SDAG-NEXT:    v_mov_b64_e32 v[6:7], s[6:7]
+; GFX942-SDAG-NEXT:    v_mov_b64_e32 v[8:9], s[8:9]
+; GFX942-SDAG-NEXT:    v_mov_b64_e32 v[10:11], s[10:11]
+; GFX942-SDAG-NEXT:    v_mov_b64_e32 v[12:13], s[12:13]
+; GFX942-SDAG-NEXT:    v_mov_b64_e32 v[14:15], s[14:15]
+; GFX942-SDAG-NEXT:    s_nop 1
+; GFX942-SDAG-NEXT:    v_smfmac_f32_32x32x16_bf16 v[0:15], v[20:21], v[16:19], v22 cbsz:1 abid:2
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v16, 0
+; GFX942-SDAG-NEXT:    s_nop 7
+; GFX942-SDAG-NEXT:    s_nop 1
+; GFX942-SDAG-NEXT:    global_store_dwordx4 v16, v[12:15], s[16:17] offset:48
+; GFX942-SDAG-NEXT:    global_store_dwordx4 v16, v[8:11], s[16:17] offset:32
+; GFX942-SDAG-NEXT:    global_store_dwordx4 v16, v[4:7], s[16:17] offset:16
+; GFX942-SDAG-NEXT:    global_store_dwordx4 v16, v[0:3], s[16:17]
+; GFX942-SDAG-NEXT:    s_endpgm
+;
+; GFX942-GISEL-LABEL: test_smfmac_f32_32x32x16_bf16:
+; GFX942-GISEL:       ; %bb.0: ; %bb
+; GFX942-GISEL-NEXT:    s_load_dwordx8 s[16:23], s[4:5], 0x24
+; GFX942-GISEL-NEXT:    s_load_dword s24, s[4:5], 0x44
+; GFX942-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-GISEL-NEXT:    v_mov_b64_e32 v[20:21], s[18:19]
+; GFX942-GISEL-NEXT:    s_load_dwordx16 s[0:15], s[16:17], 0x0
+; GFX942-GISEL-NEXT:    v_mov_b64_e32 v[16:17], s[20:21]
+; GFX942-GISEL-NEXT:    v_mov_b64_e32 v[18:19], s[22:23]
+; GFX942-GISEL-NEXT:    v_mov_b32_e32 v22, s24
+; GFX942-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-GISEL-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
+; GFX942-GISEL-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
+; GFX942-GISEL-NEXT:    v_mov_b64_e32 v[4:5], s[4:5]
+; GFX942-GISEL-NEXT:    v_mov_b64_e32 v[6:7], s[6:7]
+; GFX942-GISEL-NEXT:    v_mov_b64_e32 v[8:9], s[8:9]
+; GFX942-GISEL-NEXT:    v_mov_b64_e32 v[10:11], s[10:11]
+; GFX942-GISEL-NEXT:    v_mov_b64_e32 v[12:13], s[12:13]
+; GFX942-GISEL-NEXT:    v_mov_b64_e32 v[14:15], s[14:15]
+; GFX942-GISEL-NEXT:    s_nop 1
+; GFX942-GISEL-NEXT:    v_smfmac_f32_32x32x16_bf16 v[0:15], v[20:21], v[16:19], v22 cbsz:1 abid:2
+; GFX942-GISEL-NEXT:    v_mov_b32_e32 v16, 0
+; GFX942-GISEL-NEXT:    s_nop 7
+; GFX942-GISEL-NEXT:    s_nop 1
+; GFX942-GISEL-NEXT:    global_store_dwordx4 v16, v[0:3], s[16:17]
+; GFX942-GISEL-NEXT:    global_store_dwordx4 v16, v[4:7], s[16:17] offset:16
+; GFX942-GISEL-NEXT:    global_store_dwordx4 v16, v[8:11], s[16:17] offset:32
+; GFX942-GISEL-NEXT:    global_store_dwordx4 v16, v[12:15], s[16:17] offset:48
+; GFX942-GISEL-NEXT:    s_endpgm
 ;
-; GFX942-VGPRCD-GISEL-LABEL: test_smfmac_f32_32x32x16_bf16:
-; GFX942-VGPRCD-GISEL:       ; %bb.0: ; %bb
-; GFX942-VGPRCD-GISEL-NEXT:    s_load_dwordx8 s[16:23], s[4:5], 0x24
-; GFX942-VGPRCD-GISEL-NEXT:    s_load_dword s24, s[4:5], 0x44
-; GFX942-VGPRCD-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX942-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[20:21], s[18:19]
-; GFX942-VGPRCD-GISEL-NEXT:    s_load_dwordx16 s[0:15], s[16:17], 0x0
-; GFX942-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[16:17], s[20:21]
-; GFX942-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[18:19], s[22:23]
-; GFX942-VGPRCD-GISEL-NEXT:    v_mov_b32_e32 v22, s24
-; GFX942-VGPRCD-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX942-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX942-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
-; GFX942-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[4:5], s[4:5]
-; GFX942-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[6:7], s[6:7]
-; GFX942-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[8:9], s[8:9]
-; GFX942-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[10:11], s[10:11]
-; GFX942-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[12:13], s[12:13]
-; GFX942-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[14:15], s[14:15]
-; GFX942-VGPRCD-GISEL-NEXT:    s_nop 1
-; GFX942-VGPRCD-GISEL-NEXT:    v_smfmac_f32_32x32x16_bf16 v[0:15], v[20:21], v[16:19], v22 cbsz:1 abid:2
-; GFX942-VGPRCD-GISEL-NEXT:    v_mov_b32_e32 v16, 0
-; GFX942-VGPRCD-GISEL-NEXT:    s_nop 7
-; GFX942-VGPRCD-GISEL-NEXT:    s_nop 1
-; GFX942-VGPRCD-GISEL-NEXT:    global_store_dwordx4 v16, v[0:3], s[16:17]
-; GFX942-VGPRCD-GISEL-NEXT:    global_store_dwordx4 v16, v[4:7], s[16:17] offset:16
-; GFX942-VGPRCD-GISEL-NEXT:    global_store_dwordx4 v16, v[8:11], s[16:17] offset:32
-; GFX942-VGPRCD-GISEL-NEXT:    global_store_dwordx4 v16, v[12:15], s[16:17] offset:48
-; GFX942-VGPRCD-GISEL-NEXT:    s_endpgm
+; GFX950-SDAG-LABEL: test_smfmac_f32_32x32x16_bf16:
+; GFX950-SDAG:       ; %bb.0: ; %bb
+; GFX950-SDAG-NEXT:    s_load_dwordx8 s[16:23], s[4:5], 0x24
+; GFX950-SDAG-NEXT:    s_load_dword s24, s[4:5], 0x44
+; GFX950-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX950-SDAG-NEXT:    v_mov_b64_e32 v[20:21], s[18:19]
+; GFX950-SDAG-NEXT:    s_load_dwordx16 s[0:15], s[16:17], 0x0
+; GFX950-SDAG-NEXT:    v_mov_b64_e32 v[16:17], s[20:21]
+; GFX950-SDAG-NEXT:    v_mov_b64_e32 v[18:19], s[22:23]
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v22, s24
+; GFX950-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX950-SDAG-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
+; GFX950-SDAG-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
+; GFX950-SDAG-NEXT:    v_mov_b64_e32 v[4:5], s[4:5]
+; GFX950-SDAG-NEXT:    v_mov_b64_e32 v[6:7], s[6:7]
+; GFX950-SDAG-NEXT:    v_mov_b64_e32 v[8:9], s[8:9]
+; GFX950-SDAG-NEXT:    v_mov_b64_e32 v[10:11], s[10:11]
+; GFX950-SDAG-NEXT:    v_mov_b64_e32 v[12:13], s[12:13]
+; GFX950-SDAG-NEXT:    v_mov_b64_e32 v[14:15], s[14:15]
+; GFX950-SDAG-NEXT:    s_nop 1
+; GFX950-SDAG-NEXT:    v_smfmac_f32_32x32x16_bf16 v[0:15], v[20:21], v[16:19], v22 cbsz:1 abid:2
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v16, 0
+; GFX950-SDAG-NEXT:    s_nop 7
+; GFX950-SDAG-NEXT:    s_nop 2
+; GFX950-SDAG-NEXT:    global_store_dwordx4 v16, v[12:15], s[16:17] offset:48
+; GFX950-SDAG-NEXT:    global_store_dwordx4 v16, v[8:11], s[16:17] offset:32
+; GFX950-SDAG-NEXT:    global_store_dwordx4 v16, v[4:7], s[16:17] offset:16
+; GFX950-SDAG-NEXT:    global_store_dwordx4 v16, v[0:3], s[16:17]
+; GFX950-SDAG-NEXT:    s_endpgm
 ;
+; GFX950-GISEL-LABEL: test_smfmac_f32_32x32x16_bf16:
+; GFX950-GISEL:       ; %bb.0: ; %bb
+; GFX950-GISEL-NEXT:    s_load_dwordx8 s[16:23], s[4:5], 0x24
+; GFX950-GISEL-NEXT:    s_load_dword s24, s[4:5], 0x44
+; GFX950-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX950-GISEL-NEXT:    v_mov_b64_e32 v[20:21], s[18:19]
+; GFX950-GISEL-NEXT:    s_load_dwordx16 s[0:15], s[16:17], 0x0
+; GFX950-GISEL-NEXT:    v_mov_b64_e32 v[16:17], s[20:21]
+; GFX950-GISEL-NEXT:    v_mov_b64_e32 v[18:19], s[22:23]
+; GFX950-GISEL-NEXT:    v_mov_b32_e32 v22, s24
+; GFX950-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX950-GISEL-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
+; GFX950-GISEL-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
+; GFX950-GISEL-NEXT:    v_mov_b64_e32 v[4:5], s[4:5]
+; GFX950-GISEL-NEXT:    v_mov_b64_e32 v[6:7], s[6:7]
+; GFX950-GISEL-NEXT:    v_mov_b64_e32 v[8:9], s[8:9]
+; GFX950-GISEL-NEXT:    v_mov_b64_e32 v[10:11], s[10:11]
+; GFX950-GISEL-NEXT:    v_mov_b64_e32 v[12:13], s[12:13]
+; GFX950-GISEL-NEXT:    v_mov_b64_e32 v[14:15], s[14:15]
+; GFX950-GISEL-NEXT:    s_nop 1
+; GFX950-GISEL-NEXT:    v_smfmac_f32_32x32x16_bf16 v[0:15], v[20:21], v[16:19], v22 cbsz:1 abid:2
+; GFX950-GISEL-NEXT:    v_mov_b32_e32 v16, 0
+; GFX950-GISEL-NEXT:    s_nop 7
+; GFX950-GISEL-NEXT:    s_nop 2
+; GFX950-GISEL-NEXT:    global_store_dwordx4 v16, v[0:3], s[16:17]
+; GFX950-GISEL-NEXT:    global_store_dwordx4 v16, v[4:7], s[16:17] offset:16
+; GFX950-GISEL-NEXT:    global_store_dwordx4 v16, v[8:11], s[16:17] offset:32
+; GFX950-GISEL-NEXT:    global_store_dwordx4 v16, v[12:15], s[16:17] offset:48
+; GFX950-GISEL-NEXT:    s_endpgm
 ; GFX942-AGPRCD-SDAG-LABEL: test_smfmac_f32_32x32x16_bf16:
 ; GFX942-AGPRCD-SDAG:       ; %bb.0: ; %bb
 ; GFX942-AGPRCD-SDAG-NEXT:    s_load_dwordx8 s[24:31], s[4:5], 0x24
@@ -2113,7 +2154,6 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x16_bf16(ptr addrspace(1) %arg,
 ; GFX942-AGPRCD-SDAG-NEXT:    global_store_dwordx4 v0, a[4:7], s[24:25] offset:16
 ; GFX942-AGPRCD-SDAG-NEXT:    global_store_dwordx4 v0, a[0:3], s[24:25]
 ; GFX942-AGPRCD-SDAG-NEXT:    s_endpgm
-;
 ; GFX942-AGPRCD-GISEL-LABEL: test_smfmac_f32_32x32x16_bf16:
 ; GFX942-AGPRCD-GISEL:       ; %bb.0: ; %bb
 ; GFX942-AGPRCD-GISEL-NEXT:    s_load_dwordx8 s[24:31], s[4:5], 0x24
@@ -2151,67 +2191,6 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x16_bf16(ptr addrspace(1) %arg,
 ; GFX942-AGPRCD-GISEL-NEXT:    global_store_dwordx4 v0, a[8:11], s[24:25] offset:32
 ; GFX942-AGPRCD-GISEL-NEXT:    global_store_dwordx4 v0, a[12:15], s[24:25] offset:48
 ; GFX942-AGPRCD-GISEL-NEXT:    s_endpgm
-;
-; GFX950-VGPRCD-SDAG-LABEL: test_smfmac_f32_32x32x16_bf16:
-; GFX950-VGPRCD-SDAG:       ; %bb.0: ; %bb
-; GFX950-VGPRCD-SDAG-NEXT:    s_load_dwordx8 s[16:23], s[4:5], 0x24
-; GFX950-VGPRCD-SDAG-NEXT:    s_load_dword s24, s[4:5], 0x44
-; GFX950-VGPRCD-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX950-VGPRCD-SDAG-NEXT:    v_mov_b64_e32 v[20:21], s[18:19]
-; GFX950-VGPRCD-SDAG-NEXT:    s_load_dwordx16 s[0:15], s[16:17], 0x0
-; GFX950-VGPRCD-SDAG-NEXT:    v_mov_b64_e32 v[16:17], s[20:21]
-; GFX950-VGPRCD-SDAG-NEXT:    v_mov_b64_e32 v[18:19], s[22:23]
-; GFX950-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v22, s24
-; GFX950-VGPRCD-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX950-VGPRCD-SDAG-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX950-VGPRCD-SDAG-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
-; GFX950-VGPRCD-SDAG-NEXT:    v_mov_b64_e32 v[4:5], s[4:5]
-; GFX950-VGPRCD-SDAG-NEXT:    v_mov_b64_e32 v[6:7], s[6:7]
-; GFX950-VGPRCD-SDAG-NEXT:    v_mov_b64_e32 v[8:9], s[8:9]
-; GFX950-VGPRCD-SDAG-NEXT:    v_mov_b64_e32 v[10:11], s[10:11]
-; GFX950-VGPRCD-SDAG-NEXT:    v_mov_b64_e32 v[12:13], s[12:13]
-; GFX950-VGPRCD-SDAG-NEXT:    v_mov_b64_e32 v[14:15], s[14:15]
-; GFX950-VGPRCD-SDAG-NEXT:    s_nop 1
-; GFX950-VGPRCD-SDAG-NEXT:    v_smfmac_f32_32x32x16_bf16 v[0:15], v[20:21], v[16:19], v22 cbsz:1 abid:2
-; GFX950-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v16, 0
-; GFX950-VGPRCD-SDAG-NEXT:    s_nop 7
-; GFX950-VGPRCD-SDAG-NEXT:    s_nop 2
-; GFX950-VGPRCD-SDAG-NEXT:    global_store_dwordx4 v16, v[12:15], s[16:17] offset:48
-; GFX950-VGPRCD-SDAG-NEXT:    global_store_dwordx4 v16, v[8:11], s[16:17] offset:32
-; GFX950-VGPRCD-SDAG-NEXT:    global_store_dwordx4 v16, v[4:7], s[16:17] offset:16
-; GFX950-VGPRCD-SDAG-NEXT:    global_store_dwordx4 v16, v[0:3], s[16:17]
-; GFX950-VGPRCD-SDAG-NEXT:    s_endpgm
-;
-; GFX950-VGPRCD-GISEL-LABEL: test_smfmac_f32_32x32x16_bf16:
-; GFX950-VGPRCD-GISEL:       ; %bb.0: ; %bb
-; GFX950-VGPRCD-GISEL-NEXT:    s_load_dwordx8 s[16:23], s[4:5], 0x24
-; GFX950-VGPRCD-GISEL-NEXT:    s_load_dword s24, s[4:5], 0x44
-; GFX950-VGPRCD-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX950-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[20:21], s[18:19]
-; GFX950-VGPRCD-GISEL-NEXT:    s_load_dwordx16 s[0:15], s[16:17], 0x0
-; GFX950-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[16:17], s[20:21]
-; GFX950-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[18:19], s[22:23]
-; GFX950-VGPRCD-GISEL-NEXT:    v_mov_b32_e32 v22, s24
-; GFX950-VGPRCD-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX950-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX950-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
-; GFX950-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[4:5], s[4:5]
-; GFX950-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[6:7], s[6:7]
-; GFX950-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[8:9], s[8:9]
-; GFX950-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[10:11], s[10:11]
-; GFX950-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[12:13], s[12:13]
-; GFX950-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[14:15], s[14:15]
-; GFX950-VGPRCD-GISEL-NEXT:    s_nop 1
-; GFX950-VGPRCD-GISEL-NEXT:    v_smfmac_f32_32x32x16_bf16 v[0:15], v[20:21], v[16:19], v22 cbsz:1 abid:2
-; GFX950-VGPRCD-GISEL-NEXT:    v_mov_b32_e32 v16, 0
-; GFX950-VGPRCD-GISEL-NEXT:    s_nop 7
-; GFX950-VGPRCD-GISEL-NEXT:    s_nop 2
-; GFX950-VGPRCD-GISEL-NEXT:    global_store_dwordx4 v16, v[0:3], s[16:17]
-; GFX950-VGPRCD-GISEL-NEXT:    global_store_dwordx4 v16, v[4:7], s[16:17] offset:16
-; GFX950-VGPRCD-GISEL-NEXT:    global_store_dwordx4 v16, v[8:11], s[16:17] offset:32
-; GFX950-VGPRCD-GISEL-NEXT:    global_store_dwordx4 v16, v[12:15], s[16:17] offset:48
-; GFX950-VGPRCD-GISEL-NEXT:    s_endpgm
-;
 ; GFX950-AGPRCD-SDAG-LABEL: test_smfmac_f32_32x32x16_bf16:
 ; GFX950-AGPRCD-SDAG:       ; %bb.0: ; %bb
 ; GFX950-AGPRCD-SDAG-NEXT:    s_load_dwordx8 s[24:31], s[4:5], 0x24
@@ -2249,7 +2228,6 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x16_bf16(ptr addrspace(1) %arg,
 ; GFX950-AGPRCD-SDAG-NEXT:    global_store_dwordx4 v0, a[4:7], s[24:25] offset:16
 ; GFX950-AGPRCD-SDAG-NEXT:    global_store_dwordx4 v0, a[0:3], s[24:25]
 ; GFX950-AGPRCD-SDAG-NEXT:    s_endpgm
-;
 ; GFX950-AGPRCD-GISEL-LABEL: test_smfmac_f32_32x32x16_bf16:
 ; GFX950-AGPRCD-GISEL:       ; %bb.0: ; %bb
 ; GFX950-AGPRCD-GISEL-NEXT:    s_load_dwordx8 s[24:31], s[4:5], 0x24
@@ -2295,53 +2273,99 @@ bb:
 }
 
 define amdgpu_kernel void @test_smfmac_i32_16x16x64_i8(ptr addrspace(1) %arg, <2 x i32> %a, <4 x i32> %b, i32 %idx) #0 {
-; GFX942-VGPRCD-SDAG-LABEL: test_smfmac_i32_16x16x64_i8:
-; GFX942-VGPRCD-SDAG:       ; %bb.0: ; %bb
-; GFX942-VGPRCD-SDAG-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x2c
-; GFX942-VGPRCD-SDAG-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x24
-; GFX942-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v0, 0
-; GFX942-VGPRCD-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX942-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v10, s8
-; GFX942-VGPRCD-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GFX942-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v11, s9
-; GFX942-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v2, s10
-; GFX942-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v3, s11
-; GFX942-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v4, s12
-; GFX942-VGPRCD-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX942-VGPRCD-SDAG-NEXT:    v_mov_b64_e32 v[8:9], s[2:3]
-; GFX942-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v5, s13
-; GFX942-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v1, s14
-; GFX942-VGPRCD-SDAG-NEXT:    v_mov_b64_e32 v[6:7], s[0:1]
-; GFX942-VGPRCD-SDAG-NEXT:    s_nop 1
-; GFX942-VGPRCD-SDAG-NEXT:    v_smfmac_i32_16x16x64_i8 v[6:9], v[10:11], v[2:5], v1 cbsz:1 abid:2
-; GFX942-VGPRCD-SDAG-NEXT:    s_nop 6
-; GFX942-VGPRCD-SDAG-NEXT:    global_store_dwordx4 v0, v[6:9], s[6:7]
-; GFX942-VGPRCD-SDAG-NEXT:    s_endpgm
+; GFX942-SDAG-LABEL: test_smfmac_i32_16x16x64_i8:
+; GFX942-SDAG:       ; %bb.0: ; %bb
+; GFX942-SDAG-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x2c
+; GFX942-SDAG-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x24
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v10, s8
+; GFX942-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v11, s9
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v2, s10
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v3, s11
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v4, s12
+; GFX942-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-SDAG-NEXT:    v_mov_b64_e32 v[8:9], s[2:3]
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v5, s13
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v1, s14
+; GFX942-SDAG-NEXT:    v_mov_b64_e32 v[6:7], s[0:1]
+; GFX942-SDAG-NEXT:    s_nop 1
+; GFX942-SDAG-NEXT:    v_smfmac_i32_16x16x64_i8 v[6:9], v[10:11], v[2:5], v1 cbsz:1 abid:2
+; GFX942-SDAG-NEXT:    s_nop 6
+; GFX942-SDAG-NEXT:    global_store_dwordx4 v0, v[6:9], s[6:7]
+; GFX942-SDAG-NEXT:    s_endpgm
+;
+; GFX942-GISEL-LABEL: test_smfmac_i32_16x16x64_i8:
+; GFX942-GISEL:       ; %bb.0: ; %bb
+; GFX942-GISEL-NEXT:    s_load_dwordx2 s[12:13], s[4:5], 0x24
+; GFX942-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x2c
+; GFX942-GISEL-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x3c
+; GFX942-GISEL-NEXT:    s_load_dword s14, s[4:5], 0x44
+; GFX942-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-GISEL-NEXT:    v_mov_b64_e32 v[4:5], s[0:1]
+; GFX942-GISEL-NEXT:    s_load_dwordx4 s[8:11], s[12:13], 0x0
+; GFX942-GISEL-NEXT:    s_mov_b32 s4, s2
+; GFX942-GISEL-NEXT:    s_mov_b32 s5, s3
+; GFX942-GISEL-NEXT:    v_mov_b64_e32 v[0:1], s[4:5]
+; GFX942-GISEL-NEXT:    v_mov_b64_e32 v[2:3], s[6:7]
+; GFX942-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-GISEL-NEXT:    v_mov_b64_e32 v[8:9], s[8:9]
+; GFX942-GISEL-NEXT:    v_mov_b64_e32 v[10:11], s[10:11]
+; GFX942-GISEL-NEXT:    v_mov_b32_e32 v6, s14
+; GFX942-GISEL-NEXT:    s_nop 1
+; GFX942-GISEL-NEXT:    v_smfmac_i32_16x16x64_i8 v[8:11], v[4:5], v[0:3], v6 cbsz:1 abid:2
+; GFX942-GISEL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-GISEL-NEXT:    s_nop 5
+; GFX942-GISEL-NEXT:    global_store_dwordx4 v0, v[8:11], s[12:13]
+; GFX942-GISEL-NEXT:    s_endpgm
 ;
-; GFX942-VGPRCD-GISEL-LABEL: test_smfmac_i32_16x16x64_i8:
-; GFX942-VGPRCD-GISEL:       ; %bb.0: ; %bb
-; GFX942-VGPRCD-GISEL-NEXT:    s_load_dwordx2 s[12:13], s[4:5], 0x24
-; GFX942-VGPRCD-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x2c
-; GFX942-VGPRCD-GISEL-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x3c
-; GFX942-VGPRCD-GISEL-NEXT:    s_load_dword s14, s[4:5], 0x44
-; GFX942-VGPRCD-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX942-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[4:5], s[0:1]
-; GFX942-VGPRCD-GISEL-NEXT:    s_load_dwordx4 s[8:11], s[12:13], 0x0
-; GFX942-VGPRCD-GISEL-NEXT:    s_mov_b32 s4, s2
-; GFX942-VGPRCD-GISEL-NEXT:    s_mov_b32 s5, s3
-; GFX942-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[0:1], s[4:5]
-; GFX942-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[2:3], s[6:7]
-; GFX942-VGPRCD-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX942-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[8:9], s[8:9]
-; GFX942-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[10:11], s[10:11]
-; GFX942-VGPRCD-GISEL-NEXT:    v_mov_b32_e32 v6, s14
-; GFX942-VGPRCD-GISEL-NEXT:    s_nop 1
-; GFX942-VGPRCD-GISEL-NEXT:    v_smfmac_i32_16x16x64_i8 v[8:11], v[4:5], v[0:3], v6 cbsz:1 abid:2
-; GFX942-VGPRCD-GISEL-NEXT:    v_mov_b32_e32 v0, 0
-; GFX942-VGPRCD-GISEL-NEXT:    s_nop 5
-; GFX942-VGPRCD-GISEL-NEXT:    global_store_dwordx4 v0, v[8:11], s[12:13]
-; GFX942-VGPRCD-GISEL-NEXT:    s_endpgm
+; GFX950-SDAG-LABEL: test_smfmac_i32_16x16x64_i8:
+; GFX950-SDAG:       ; %bb.0: ; %bb
+; GFX950-SDAG-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x2c
+; GFX950-SDAG-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x24
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v0, 0
+; GFX950-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v10, s8
+; GFX950-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v11, s9
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v2, s10
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v3, s11
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v4, s12
+; GFX950-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX950-SDAG-NEXT:    v_mov_b64_e32 v[8:9], s[2:3]
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v5, s13
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v1, s14
+; GFX950-SDAG-NEXT:    v_mov_b64_e32 v[6:7], s[0:1]
+; GFX950-SDAG-NEXT:    s_nop 1
+; GFX950-SDAG-NEXT:    v_smfmac_i32_16x16x64_i8 v[6:9], v[10:11], v[2:5], v1 cbsz:1 abid:2
+; GFX950-SDAG-NEXT:    s_nop 7
+; GFX950-SDAG-NEXT:    global_store_dwordx4 v0, v[6:9], s[6:7]
+; GFX950-SDAG-NEXT:    s_endpgm
 ;
+; GFX950-GISEL-LABEL: test_smfmac_i32_16x16x64_i8:
+; GFX950-GISEL:       ; %bb.0: ; %bb
+; GFX950-GISEL-NEXT:    s_load_dwordx2 s[12:13], s[4:5], 0x24
+; GFX950-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x2c
+; GFX950-GISEL-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x3c
+; GFX950-GISEL-NEXT:    s_load_dword s14, s[4:5], 0x44
+; GFX950-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX950-GISEL-NEXT:    v_mov_b64_e32 v[4:5], s[0:1]
+; GFX950-GISEL-NEXT:    s_load_dwordx4 s[8:11], s[12:13], 0x0
+; GFX950-GISEL-NEXT:    s_mov_b32 s4, s2
+; GFX950-GISEL-NEXT:    s_mov_b32 s5, s3
+; GFX950-GISEL-NEXT:    v_mov_b64_e32 v[0:1], s[4:5]
+; GFX950-GISEL-NEXT:    v_mov_b64_e32 v[2:3], s[6:7]
+; GFX950-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX950-GISEL-NEXT:    v_mov_b64_e32 v[8:9], s[8:9]
+; GFX950-GISEL-NEXT:    v_mov_b64_e32 v[10:11], s[10:11]
+; GFX950-GISEL-NEXT:    v_mov_b32_e32 v6, s14
+; GFX950-GISEL-NEXT:    s_nop 1
+; GFX950-GISEL-NEXT:    v_smfmac_i32_16x16x64_i8 v[8:11], v[4:5], v[0:3], v6 cbsz:1 abid:2
+; GFX950-GISEL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX950-GISEL-NEXT:    s_nop 6
+; GFX950-GISEL-NEXT:    global_store_dwordx4 v0, v[8:11], s[12:13]
+; GFX950-GISEL-NEXT:    s_endpgm
 ; GFX942-AGPRCD-SDAG-LABEL: test_smfmac_i32_16x16x64_i8:
 ; GFX942-AGPRCD-SDAG:       ; %bb.0: ; %bb
 ; GFX942-AGPRCD-SDAG-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
@@ -2367,7 +2391,6 @@ define amdgpu_kernel void @test_smfmac_i32_16x16x64_i8(ptr addrspace(1) %arg, <2
 ; GFX942-AGPRCD-SDAG-NEXT:    s_nop 5
 ; GFX942-AGPRCD-SDAG-NEXT:    global_store_dwordx4 v0, a[0:3], s[0:1]
 ; GFX942-AGPRCD-SDAG-NEXT:    s_endpgm
-;
 ; GFX942-AGPRCD-GISEL-LABEL: test_smfmac_i32_16x16x64_i8:
 ; GFX942-AGPRCD-GISEL:       ; %bb.0: ; %bb
 ; GFX942-AGPRCD-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x2c
@@ -2398,54 +2421,6 @@ define amdgpu_kernel void @test_smfmac_i32_16x16x64_i8(ptr addrspace(1) %arg, <2
 ; GFX942-AGPRCD-GISEL-NEXT:    s_nop 5
 ; GFX942-AGPRCD-GISEL-NEXT:    global_store_dwordx4 v0, a[0:3], s[6:7]
 ; GFX942-AGPRCD-GISEL-NEXT:    s_endpgm
-;
-; GFX950-VGPRCD-SDAG-LABEL: test_smfmac_i32_16x16x64_i8:
-; GFX950-VGPRCD-SDAG:       ; %bb.0: ; %bb
-; GFX950-VGPRCD-SDAG-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x2c
-; GFX950-VGPRCD-SDAG-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x24
-; GFX950-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v0, 0
-; GFX950-VGPRCD-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX950-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v10, s8
-; GFX950-VGPRCD-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GFX950-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v11, s9
-; GFX950-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v2, s10
-; GFX950-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v3, s11
-; GFX950-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v4, s12
-; GFX950-VGPRCD-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX950-VGPRCD-SDAG-NEXT:    v_mov_b64_e32 v[8:9], s[2:3]
-; GFX950-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v5, s13
-; GFX950-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v1, s14
-; GFX950-VGPRCD-SDAG-NEXT:    v_mov_b64_e32 v[6:7], s[0:1]
-; GFX950-VGPRCD-SDAG-NEXT:    s_nop 1
-; GFX950-VGPRCD-SDAG-NEXT:    v_smfmac_i32_16x16x64_i8 v[6:9], v[10:11], v[2:5], v1 cbsz:1 abid:2
-; GFX950-VGPRCD-SDAG-NEXT:    s_nop 7
-; GFX950-VGPRCD-SDAG-NEXT:    global_store_dwordx4 v0, v[6:9], s[6:7]
-; GFX950-VGPRCD-SDAG-NEXT:    s_endpgm
-;
-; GFX950-VGPRCD-GISEL-LABEL: test_smfmac_i32_16x16x64_i8:
-; GFX950-VGPRCD-GISEL:       ; %bb.0: ; %bb
-; GFX950-VGPRCD-GISEL-NEXT:    s_load_dwordx2 s[12:13], s[4:5], 0x24
-; GFX950-VGPRCD-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x2c
-; GFX950-VGPRCD-GISEL-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x3c
-; GFX950-VGPRCD-GISEL-NEXT:    s_load_dword s14, s[4:5], 0x44
-; GFX950-VGPRCD-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX950-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[4:5], s[0:1]
-; GFX950-VGPRCD-GISEL-NEXT:    s_load_dwordx4 s[8:11], s[12:13], 0x0
-; GFX950-VGPRCD-GISEL-NEXT:    s_mov_b32 s4, s2
-; GFX950-VGPRCD-GISEL-NEXT:    s_mov_b32 s5, s3
-; GFX950-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[0:1], s[4:5]
-; GFX950-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[2:3], s[6:7]
-; GFX950-VGPRCD-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX950-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[8:9], s[8:9]
-; GFX950-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[10:11], s[10:11]
-; GFX950-VGPRCD-GISEL-NEXT:    v_mov_b32_e32 v6, s14
-; GFX950-VGPRCD-GISEL-NEXT:    s_nop 1
-; GFX950-VGPRCD-GISEL-NEXT:    v_smfmac_i32_16x16x64_i8 v[8:11], v[4:5], v[0:3], v6 cbsz:1 abid:2
-; GFX950-VGPRCD-GISEL-NEXT:    v_mov_b32_e32 v0, 0
-; GFX950-VGPRCD-GISEL-NEXT:    s_nop 6
-; GFX950-VGPRCD-GISEL-NEXT:    global_store_dwordx4 v0, v[8:11], s[12:13]
-; GFX950-VGPRCD-GISEL-NEXT:    s_endpgm
-;
 ; GFX950-AGPRCD-SDAG-LABEL: test_smfmac_i32_16x16x64_i8:
 ; GFX950-AGPRCD-SDAG:       ; %bb.0: ; %bb
 ; GFX950-AGPRCD-SDAG-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
@@ -2471,7 +2446,6 @@ define amdgpu_kernel void @test_smfmac_i32_16x16x64_i8(ptr addrspace(1) %arg, <2
 ; GFX950-AGPRCD-SDAG-NEXT:    s_nop 6
 ; GFX950-AGPRCD-SDAG-NEXT:    global_store_dwordx4 v0, a[0:3], s[0:1]
 ; GFX950-AGPRCD-SDAG-NEXT:    s_endpgm
-;
 ; GFX950-AGPRCD-GISEL-LABEL: test_smfmac_i32_16x16x64_i8:
 ; GFX950-AGPRCD-GISEL:       ; %bb.0: ; %bb
 ; GFX950-AGPRCD-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x2c
@@ -2510,73 +2484,139 @@ bb:
 }
 
 define amdgpu_kernel void @test_smfmac_i32_32x32x32_i8(ptr addrspace(1) %arg, <2 x i32> %a, <4 x i32> %b, i32 %idx) #0 {
-; GFX942-VGPRCD-SDAG-LABEL: test_smfmac_i32_32x32x32_i8:
-; GFX942-VGPRCD-SDAG:       ; %bb.0: ; %bb
-; GFX942-VGPRCD-SDAG-NEXT:    s_load_dwordx8 s[16:23], s[4:5], 0x2c
-; GFX942-VGPRCD-SDAG-NEXT:    s_load_dwordx2 s[24:25], s[4:5], 0x24
-; GFX942-VGPRCD-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX942-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v22, s16
-; GFX942-VGPRCD-SDAG-NEXT:    s_load_dwordx16 s[0:15], s[24:25], 0x0
-; GFX942-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v23, s17
-; GFX942-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v18, s18
-; GFX942-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v19, s19
-; GFX942-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v20, s20
-; GFX942-VGPRCD-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX942-VGPRCD-SDAG-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX942-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v21, s21
-; GFX942-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v16, s22
-; GFX942-VGPRCD-SDAG-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
-; GFX942-VGPRCD-SDAG-NEXT:    v_mov_b64_e32 v[4:5], s[4:5]
-; GFX942-VGPRCD-SDAG-NEXT:    v_mov_b64_e32 v[6:7], s[6:7]
-; GFX942-VGPRCD-SDAG-NEXT:    v_mov_b64_e32 v[8:9], s[8:9]
-; GFX942-VGPRCD-SDAG-NEXT:    v_mov_b64_e32 v[10:11], s[10:11]
-; GFX942-VGPRCD-SDAG-NEXT:    v_mov_b64_e32 v[12:13], s[12:13]
-; GFX942-VGPRCD-SDAG-NEXT:    v_mov_b64_e32 v[14:15], s[14:15]
-; GFX942-VGPRCD-SDAG-NEXT:    s_nop 1
-; GFX942-VGPRCD-SDAG-NEXT:    v_smfmac_i32_32x32x32_i8 v[0:15], v[22:23], v[18:21], v16 cbsz:1 abid:2
-; GFX942-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v16, 0
-; GFX942-VGPRCD-SDAG-NEXT:    s_nop 7
-; GFX942-VGPRCD-SDAG-NEXT:    s_nop 1
-; GFX942-VGPRCD-SDAG-NEXT:    global_store_dwordx4 v16, v[12:15], s[24:25] offset:48
-; GFX942-VGPRCD-SDAG-NEXT:    global_store_dwordx4 v16, v[8:11], s[24:25] offset:32
-; GFX942-VGPRCD-SDAG-NEXT:    global_store_dwordx4 v16, v[4:7], s[24:25] offset:16
-; GFX942-VGPRCD-SDAG-NEXT:    global_store_dwordx4 v16, v[0:3], s[24:25]
-; GFX942-VGPRCD-SDAG-NEXT:    s_endpgm
+; GFX942-SDAG-LABEL: test_smfmac_i32_32x32x32_i8:
+; GFX942-SDAG:       ; %bb.0: ; %bb
+; GFX942-SDAG-NEXT:    s_load_dwordx8 s[16:23], s[4:5], 0x2c
+; GFX942-SDAG-NEXT:    s_load_dwordx2 s[24:25], s[4:5], 0x24
+; GFX942-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v22, s16
+; GFX942-SDAG-NEXT:    s_load_dwordx16 s[0:15], s[24:25], 0x0
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v23, s17
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v18, s18
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v19, s19
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v20, s20
+; GFX942-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-SDAG-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v21, s21
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v16, s22
+; GFX942-SDAG-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
+; GFX942-SDAG-NEXT:    v_mov_b64_e32 v[4:5], s[4:5]
+; GFX942-SDAG-NEXT:    v_mov_b64_e32 v[6:7], s[6:7]
+; GFX942-SDAG-NEXT:    v_mov_b64_e32 v[8:9], s[8:9]
+; GFX942-SDAG-NEXT:    v_mov_b64_e32 v[10:11], s[10:11]
+; GFX942-SDAG-NEXT:    v_mov_b64_e32 v[12:13], s[12:13]
+; GFX942-SDAG-NEXT:    v_mov_b64_e32 v[14:15], s[14:15]
+; GFX942-SDAG-NEXT:    s_nop 1
+; GFX942-SDAG-NEXT:    v_smfmac_i32_32x32x32_i8 v[0:15], v[22:23], v[18:21], v16 cbsz:1 abid:2
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v16, 0
+; GFX942-SDAG-NEXT:    s_nop 7
+; GFX942-SDAG-NEXT:    s_nop 1
+; GFX942-SDAG-NEXT:    global_store_dwordx4 v16, v[12:15], s[24:25] offset:48
+; GFX942-SDAG-NEXT:    global_store_dwordx4 v16, v[8:11], s[24:25] offset:32
+; GFX942-SDAG-NEXT:    global_store_dwordx4 v16, v[4:7], s[24:25] offset:16
+; GFX942-SDAG-NEXT:    global_store_dwordx4 v16, v[0:3], s[24:25]
+; GFX942-SDAG-NEXT:    s_endpgm
+;
+; GFX942-GISEL-LABEL: test_smfmac_i32_32x32x32_i8:
+; GFX942-GISEL:       ; %bb.0: ; %bb
+; GFX942-GISEL-NEXT:    s_load_dwordx4 s[16:19], s[4:5], 0x2c
+; GFX942-GISEL-NEXT:    s_load_dwordx2 s[24:25], s[4:5], 0x24
+; GFX942-GISEL-NEXT:    s_load_dwordx2 s[22:23], s[4:5], 0x3c
+; GFX942-GISEL-NEXT:    s_load_dword s26, s[4:5], 0x44
+; GFX942-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-GISEL-NEXT:    v_mov_b64_e32 v[20:21], s[16:17]
+; GFX942-GISEL-NEXT:    s_load_dwordx16 s[0:15], s[24:25], 0x0
+; GFX942-GISEL-NEXT:    s_mov_b32 s20, s18
+; GFX942-GISEL-NEXT:    s_mov_b32 s21, s19
+; GFX942-GISEL-NEXT:    v_mov_b64_e32 v[16:17], s[20:21]
+; GFX942-GISEL-NEXT:    v_mov_b32_e32 v22, s26
+; GFX942-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-GISEL-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
+; GFX942-GISEL-NEXT:    v_mov_b64_e32 v[18:19], s[22:23]
+; GFX942-GISEL-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
+; GFX942-GISEL-NEXT:    v_mov_b64_e32 v[4:5], s[4:5]
+; GFX942-GISEL-NEXT:    v_mov_b64_e32 v[6:7], s[6:7]
+; GFX942-GISEL-NEXT:    v_mov_b64_e32 v[8:9], s[8:9]
+; GFX942-GISEL-NEXT:    v_mov_b64_e32 v[10:11], s[10:11]
+; GFX942-GISEL-NEXT:    v_mov_b64_e32 v[12:13], s[12:13]
+; GFX942-GISEL-NEXT:    v_mov_b64_e32 v[14:15], s[14:15]
+; GFX942-GISEL-NEXT:    s_nop 1
+; GFX942-GISEL-NEXT:    v_smfmac_i32_32x32x32_i8 v[0:15], v[20:21], v[16:19], v22 cbsz:1 abid:2
+; GFX942-GISEL-NEXT:    v_mov_b32_e32 v16, 0
+; GFX942-GISEL-NEXT:    s_nop 7
+; GFX942-GISEL-NEXT:    s_nop 1
+; GFX942-GISEL-NEXT:    global_store_dwordx4 v16, v[0:3], s[24:25]
+; GFX942-GISEL-NEXT:    global_store_dwordx4 v16, v[4:7], s[24:25] offset:16
+; GFX942-GISEL-NEXT:    global_store_dwordx4 v16, v[8:11], s[24:25] offset:32
+; GFX942-GISEL-NEXT:    global_store_dwordx4 v16, v[12:15], s[24:25] offset:48
+; GFX942-GISEL-NEXT:    s_endpgm
 ;
-; GFX942-VGPRCD-GISEL-LABEL: test_smfmac_i32_32x32x32_i8:
-; GFX942-VGPRCD-GISEL:       ; %bb.0: ; %bb
-; GFX942-VGPRCD-GISEL-NEXT:    s_load_dwordx4 s[16:19], s[4:5], 0x2c
-; GFX942-VGPRCD-GISEL-NEXT:    s_load_dwordx2 s[24:25], s[4:5], 0x24
-; GFX942-VGPRCD-GISEL-NEXT:    s_load_dwordx2 s[22:23], s[4:5], 0x3c
-; GFX942-VGPRCD-GISEL-NEXT:    s_load_dword s26, s[4:5], 0x44
-; GFX942-VGPRCD-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX942-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[20:21], s[16:17]
-; GFX942-VGPRCD-GISEL-NEXT:    s_load_dwordx16 s[0:15], s[24:25], 0x0
-; GFX942-VGPRCD-GISEL-NEXT:    s_mov_b32 s20, s18
-; GFX942-VGPRCD-GISEL-NEXT:    s_mov_b32 s21, s19
-; GFX942-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[16:17], s[20:21]
-; GFX942-VGPRCD-GISEL-NEXT:    v_mov_b32_e32 v22, s26
-; GFX942-VGPRCD-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX942-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX942-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[18:19], s[22:23]
-; GFX942-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
-; GFX942-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[4:5], s[4:5]
-; GFX942-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[6:7], s[6:7]
-; GFX942-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[8:9], s[8:9]
-; GFX942-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[10:11], s[10:11]
-; GFX942-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[12:13], s[12:13]
-; GFX942-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[14:15], s[14:15]
-; GFX942-VGPRCD-GISEL-NEXT:    s_nop 1
-; GFX942-VGPRCD-GISEL-NEXT:    v_smfmac_i32_32x32x32_i8 v[0:15], v[20:21], v[16:19], v22 cbsz:1 abid:2
-; GFX942-VGPRCD-GISEL-NEXT:    v_mov_b32_e32 v16, 0
-; GFX942-VGPRCD-GISEL-NEXT:    s_nop 7
-; GFX942-VGPRCD-GISEL-NEXT:    s_nop 1
-; GFX942-VGPRCD-GISEL-NEXT:    global_store_dwordx4 v16, v[0:3], s[24:25]
-; GFX942-VGPRCD-GISEL-NEXT:    global_store_dwordx4 v16, v[4:7], s[24:25] offset:16
-; GFX942-VGPRCD-GISEL-NEXT:    global_store_dwordx4 v16, v[8:11], s[24:25] offset:32
-; GFX942-VGPRCD-GISEL-NEXT:    global_store_dwordx4 v16, v[12:15], s[24:25] offset:48
-; GFX942-VGPRCD-GISEL-NEXT:    s_endpgm
+; GFX950-SDAG-LABEL: test_smfmac_i32_32x32x32_i8:
+; GFX950-SDAG:       ; %bb.0: ; %bb
+; GFX950-SDAG-NEXT:    s_load_dwordx8 s[16:23], s[4:5], 0x2c
+; GFX950-SDAG-NEXT:    s_load_dwordx2 s[24:25], s[4:5], 0x24
+; GFX950-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v22, s16
+; GFX950-SDAG-NEXT:    s_load_dwordx16 s[0:15], s[24:25], 0x0
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v23, s17
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v18, s18
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v19, s19
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v20, s20
+; GFX950-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX950-SDAG-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v21, s21
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v16, s22
+; GFX950-SDAG-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
+; GFX950-SDAG-NEXT:    v_mov_b64_e32 v[4:5], s[4:5]
+; GFX950-SDAG-NEXT:    v_mov_b64_e32 v[6:7], s[6:7]
+; GFX950-SDAG-NEXT:    v_mov_b64_e32 v[8:9], s[8:9]
+; GFX950-SDAG-NEXT:    v_mov_b64_e32 v[10:11], s[10:11]
+; GFX950-SDAG-NEXT:    v_mov_b64_e32 v[12:13], s[12:13]
+; GFX950-SDAG-NEXT:    v_mov_b64_e32 v[14:15], s[14:15]
+; GFX950-SDAG-NEXT:    s_nop 1
+; GFX950-SDAG-NEXT:    v_smfmac_i32_32x32x32_i8 v[0:15], v[22:23], v[18:21], v16 cbsz:1 abid:2
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v16, 0
+; GFX950-SDAG-NEXT:    s_nop 7
+; GFX950-SDAG-NEXT:    s_nop 2
+; GFX950-SDAG-NEXT:    global_store_dwordx4 v16, v[12:15], s[24:25] offset:48
+; GFX950-SDAG-NEXT:    global_store_dwordx4 v16, v[8:11], s[24:25] offset:32
+; GFX950-SDAG-NEXT:    global_store_dwordx4 v16, v[4:7], s[24:25] offset:16
+; GFX950-SDAG-NEXT:    global_store_dwordx4 v16, v[0:3], s[24:25]
+; GFX950-SDAG-NEXT:    s_endpgm
 ;
+; GFX950-GISEL-LABEL: test_smfmac_i32_32x32x32_i8:
+; GFX950-GISEL:       ; %bb.0: ; %bb
+; GFX950-GISEL-NEXT:    s_load_dwordx4 s[16:19], s[4:5], 0x2c
+; GFX950-GISEL-NEXT:    s_load_dwordx2 s[24:25], s[4:5], 0x24
+; GFX950-GISEL-NEXT:    s_load_dwordx2 s[22:23], s[4:5], 0x3c
+; GFX950-GISEL-NEXT:    s_load_dword s26, s[4:5], 0x44
+; GFX950-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX950-GISEL-NEXT:    v_mov_b64_e32 v[20:21], s[16:17]
+; GFX950-GISEL-NEXT:    s_load_dwordx16 s[0:15], s[24:25], 0x0
+; GFX950-GISEL-NEXT:    s_mov_b32 s20, s18
+; GFX950-GISEL-NEXT:    s_mov_b32 s21, s19
+; GFX950-GISEL-NEXT:    v_mov_b64_e32 v[16:17], s[20:21]
+; GFX950-GISEL-NEXT:    v_mov_b32_e32 v22, s26
+; GFX950-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX950-GISEL-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
+; GFX950-GISEL-NEXT:    v_mov_b64_e32 v[18:19], s[22:23]
+; GFX950-GISEL-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
+; GFX950-GISEL-NEXT:    v_mov_b64_e32 v[4:5], s[4:5]
+; GFX950-GISEL-NEXT:    v_mov_b64_e32 v[6:7], s[6:7]
+; GFX950-GISEL-NEXT:    v_mov_b64_e32 v[8:9], s[8:9]
+; GFX950-GISEL-NEXT:    v_mov_b64_e32 v[10:11], s[10:11]
+; GFX950-GISEL-NEXT:    v_mov_b64_e32 v[12:13], s[12:13]
+; GFX950-GISEL-NEXT:    v_mov_b64_e32 v[14:15], s[14:15]
+; GFX950-GISEL-NEXT:    s_nop 1
+; GFX950-GISEL-NEXT:    v_smfmac_i32_32x32x32_i8 v[0:15], v[20:21], v[16:19], v22 cbsz:1 abid:2
+; GFX950-GISEL-NEXT:    v_mov_b32_e32 v16, 0
+; GFX950-GISEL-NEXT:    s_nop 7
+; GFX950-GISEL-NEXT:    s_nop 2
+; GFX950-GISEL-NEXT:    global_store_dwordx4 v16, v[0:3], s[24:25]
+; GFX950-GISEL-NEXT:    global_store_dwordx4 v16, v[4:7], s[24:25] offset:16
+; GFX950-GISEL-NEXT:    global_store_dwordx4 v16, v[8:11], s[24:25] offset:32
+; GFX950-GISEL-NEXT:    global_store_dwordx4 v16, v[12:15], s[24:25] offset:48
+; GFX950-GISEL-NEXT:    s_endpgm
 ; GFX942-AGPRCD-SDAG-LABEL: test_smfmac_i32_32x32x32_i8:
 ; GFX942-AGPRCD-SDAG:       ; %bb.0: ; %bb
 ; GFX942-AGPRCD-SDAG-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
@@ -2618,7 +2658,6 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_i8(ptr addrspace(1) %arg, <2
 ; GFX942-AGPRCD-SDAG-NEXT:    global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
 ; GFX942-AGPRCD-SDAG-NEXT:    global_store_dwordx4 v0, a[0:3], s[0:1]
 ; GFX942-AGPRCD-SDAG-NEXT:    s_endpgm
-;
 ; GFX942-AGPRCD-GISEL-LABEL: test_smfmac_i32_32x32x32_i8:
 ; GFX942-AGPRCD-GISEL:       ; %bb.0: ; %bb
 ; GFX942-AGPRCD-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
@@ -2661,74 +2700,6 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_i8(ptr addrspace(1) %arg, <2
 ; GFX942-AGPRCD-GISEL-NEXT:    global_store_dwordx4 v0, a[8:11], s[0:1] offset:32
 ; GFX942-AGPRCD-GISEL-NEXT:    global_store_dwordx4 v0, a[12:15], s[0:1] offset:48
 ; GFX942-AGPRCD-GISEL-NEXT:    s_endpgm
-;
-; GFX950-VGPRCD-SDAG-LABEL: test_smfmac_i32_32x32x32_i8:
-; GFX950-VGPRCD-SDAG:       ; %bb.0: ; %bb
-; GFX950-VGPRCD-SDAG-NEXT:    s_load_dwordx8 s[16:23], s[4:5], 0x2c
-; GFX950-VGPRCD-SDAG-NEXT:    s_load_dwordx2 s[24:25], s[4:5], 0x24
-; GFX950-VGPRCD-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX950-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v22, s16
-; GFX950-VGPRCD-SDAG-NEXT:    s_load_dwordx16 s[0:15], s[24:25], 0x0
-; GFX950-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v23, s17
-; GFX950-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v18, s18
-; GFX950-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v19, s19
-; GFX950-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v20, s20
-; GFX950-VGPRCD-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX950-VGPRCD-SDAG-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX950-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v21, s21
-; GFX950-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v16, s22
-; GFX950-VGPRCD-SDAG-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
-; GFX950-VGPRCD-SDAG-NEXT:    v_mov_b64_e32 v[4:5], s[4:5]
-; GFX950-VGPRCD-SDAG-NEXT:    v_mov_b64_e32 v[6:7], s[6:7]
-; GFX950-VGPRCD-SDAG-NEXT:    v_mov_b64_e32 v[8:9], s[8:9]
-; GFX950-VGPRCD-SDAG-NEXT:    v_mov_b64_e32 v[10:11], s[10:11]
-; GFX950-VGPRCD-SDAG-NEXT:    v_mov_b64_e32 v[12:13], s[12:13]
-; GFX950-VGPRCD-SDAG-NEXT:    v_mov_b64_e32 v[14:15], s[14:15]
-; GFX950-VGPRCD-SDAG-NEXT:    s_nop 1
-; GFX950-VGPRCD-SDAG-NEXT:    v_smfmac_i32_32x32x32_i8 v[0:15], v[22:23], v[18:21], v16 cbsz:1 abid:2
-; GFX950-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v16, 0
-; GFX950-VGPRCD-SDAG-NEXT:    s_nop 7
-; GFX950-VGPRCD-SDAG-NEXT:    s_nop 2
-; GFX950-VGPRCD-SDAG-NEXT:    global_store_dwordx4 v16, v[12:15], s[24:25] offset:48
-; GFX950-VGPRCD-SDAG-NEXT:    global_store_dwordx4 v16, v[8:11], s[24:25] offset:32
-; GFX950-VGPRCD-SDAG-NEXT:    global_store_dwordx4 v16, v[4:7], s[24:25] offset:16
-; GFX950-VGPRCD-SDAG-NEXT:    global_store_dwordx4 v16, v[0:3], s[24:25]
-; GFX950-VGPRCD-SDAG-NEXT:    s_endpgm
-;
-; GFX950-VGPRCD-GISEL-LABEL: test_smfmac_i32_32x32x32_i8:
-; GFX950-VGPRCD-GISEL:       ; %bb.0: ; %bb
-; GFX950-VGPRCD-GISEL-NEXT:    s_load_dwordx4 s[16:19], s[4:5], 0x2c
-; GFX950-VGPRCD-GISEL-NEXT:    s_load_dwordx2 s[24:25], s[4:5], 0x24
-; GFX950-VGPRCD-GISEL-NEXT:    s_load_dwordx2 s[22:23], s[4:5], 0x3c
-; GFX950-VGPRCD-GISEL-NEXT:    s_load_dword s26, s[4:5], 0x44
-; GFX950-VGPRCD-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX950-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[20:21], s[16:17]
-; GFX950-VGPRCD-GISEL-NEXT:    s_load_dwordx16 s[0:15], s[24:25], 0x0
-; GFX950-VGPRCD-GISEL-NEXT:    s_mov_b32 s20, s18
-; GFX950-VGPRCD-GISEL-NEXT:    s_mov_b32 s21, s19
-; GFX950-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[16:17], s[20:21]
-; GFX950-VGPRCD-GISEL-NEXT:    v_mov_b32_e32 v22, s26
-; GFX950-VGPRCD-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX950-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX950-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[18:19], s[22:23]
-; GFX950-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
-; GFX950-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[4:5], s[4:5]
-; GFX950-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[6:7], s[6:7]
-; GFX950-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[8:9], s[8:9]
-; GFX950-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[10:11], s[10:11]
-; GFX950-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[12:13], s[12:13]
-; GFX950-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[14:15], s[14:15]
-; GFX950-VGPRCD-GISEL-NEXT:    s_nop 1
-; GFX950-VGPRCD-GISEL-NEXT:    v_smfmac_i32_32x32x32_i8 v[0:15], v[20:21], v[16:19], v22 cbsz:1 abid:2
-; GFX950-VGPRCD-GISEL-NEXT:    v_mov_b32_e32 v16, 0
-; GFX950-VGPRCD-GISEL-NEXT:    s_nop 7
-; GFX950-VGPRCD-GISEL-NEXT:    s_nop 2
-; GFX950-VGPRCD-GISEL-NEXT:    global_store_dwordx4 v16, v[0:3], s[24:25]
-; GFX950-VGPRCD-GISEL-NEXT:    global_store_dwordx4 v16, v[4:7], s[24:25] offset:16
-; GFX950-VGPRCD-GISEL-NEXT:    global_store_dwordx4 v16, v[8:11], s[24:25] offset:32
-; GFX950-VGPRCD-GISEL-NEXT:    global_store_dwordx4 v16, v[12:15], s[24:25] offset:48
-; GFX950-VGPRCD-GISEL-NEXT:    s_endpgm
-;
 ; GFX950-AGPRCD-SDAG-LABEL: test_smfmac_i32_32x32x32_i8:
 ; GFX950-AGPRCD-SDAG:       ; %bb.0: ; %bb
 ; GFX950-AGPRCD-SDAG-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
@@ -2770,7 +2741,6 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_i8(ptr addrspace(1) %arg, <2
 ; GFX950-AGPRCD-SDAG-NEXT:    global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
 ; GFX950-AGPRCD-SDAG-NEXT:    global_store_dwordx4 v0, a[0:3], s[0:1]
 ; GFX950-AGPRCD-SDAG-NEXT:    s_endpgm
-;
 ; GFX950-AGPRCD-GISEL-LABEL: test_smfmac_i32_32x32x32_i8:
 ; GFX950-AGPRCD-GISEL:       ; %bb.0: ; %bb
 ; GFX950-AGPRCD-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
@@ -2821,53 +2791,99 @@ bb:
 }
 
 define amdgpu_kernel void @test_smfmac_i32_16x16x64_bf8_bf8(ptr addrspace(1) %arg, <2 x i32> %a, <4 x i32> %b, i32 %idx) #0 {
-; GFX942-VGPRCD-SDAG-LABEL: test_smfmac_i32_16x16x64_bf8_bf8:
-; GFX942-VGPRCD-SDAG:       ; %bb.0: ; %bb
-; GFX942-VGPRCD-SDAG-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x2c
-; GFX942-VGPRCD-SDAG-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x24
-; GFX942-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v0, 0
-; GFX942-VGPRCD-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX942-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v10, s8
-; GFX942-VGPRCD-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GFX942-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v11, s9
-; GFX942-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v2, s10
-; GFX942-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v3, s11
-; GFX942-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v4, s12
-; GFX942-VGPRCD-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX942-VGPRCD-SDAG-NEXT:    v_mov_b64_e32 v[8:9], s[2:3]
-; GFX942-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v5, s13
-; GFX942-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v1, s14
-; GFX942-VGPRCD-SDAG-NEXT:    v_mov_b64_e32 v[6:7], s[0:1]
-; GFX942-VGPRCD-SDAG-NEXT:    s_nop 1
-; GFX942-VGPRCD-SDAG-NEXT:    v_smfmac_f32_16x16x64_bf8_bf8 v[6:9], v[10:11], v[2:5], v1 cbsz:1 abid:2
-; GFX942-VGPRCD-SDAG-NEXT:    s_nop 6
-; GFX942-VGPRCD-SDAG-NEXT:    global_store_dwordx4 v0, v[6:9], s[6:7]
-; GFX942-VGPRCD-SDAG-NEXT:    s_endpgm
+; GFX942-SDAG-LABEL: test_smfmac_i32_16x16x64_bf8_bf8:
+; GFX942-SDAG:       ; %bb.0: ; %bb
+; GFX942-SDAG-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x2c
+; GFX942-SDAG-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x24
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v10, s8
+; GFX942-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v11, s9
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v2, s10
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v3, s11
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v4, s12
+; GFX942-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-SDAG-NEXT:    v_mov_b64_e32 v[8:9], s[2:3]
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v5, s13
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v1, s14
+; GFX942-SDAG-NEXT:    v_mov_b64_e32 v[6:7], s[0:1]
+; GFX942-SDAG-NEXT:    s_nop 1
+; GFX942-SDAG-NEXT:    v_smfmac_f32_16x16x64_bf8_bf8 v[6:9], v[10:11], v[2:5], v1 cbsz:1 abid:2
+; GFX942-SDAG-NEXT:    s_nop 6
+; GFX942-SDAG-NEXT:    global_store_dwordx4 v0, v[6:9], s[6:7]
+; GFX942-SDAG-NEXT:    s_endpgm
+;
+; GFX942-GISEL-LABEL: test_smfmac_i32_16x16x64_bf8_bf8:
+; GFX942-GISEL:       ; %bb.0: ; %bb
+; GFX942-GISEL-NEXT:    s_load_dwordx2 s[12:13], s[4:5], 0x24
+; GFX942-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x2c
+; GFX942-GISEL-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x3c
+; GFX942-GISEL-NEXT:    s_load_dword s14, s[4:5], 0x44
+; GFX942-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-GISEL-NEXT:    v_mov_b64_e32 v[4:5], s[0:1]
+; GFX942-GISEL-NEXT:    s_load_dwordx4 s[8:11], s[12:13], 0x0
+; GFX942-GISEL-NEXT:    s_mov_b32 s4, s2
+; GFX942-GISEL-NEXT:    s_mov_b32 s5, s3
+; GFX942-GISEL-NEXT:    v_mov_b64_e32 v[0:1], s[4:5]
+; GFX942-GISEL-NEXT:    v_mov_b64_e32 v[2:3], s[6:7]
+; GFX942-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-GISEL-NEXT:    v_mov_b64_e32 v[8:9], s[8:9]
+; GFX942-GISEL-NEXT:    v_mov_b64_e32 v[10:11], s[10:11]
+; GFX942-GISEL-NEXT:    v_mov_b32_e32 v6, s14
+; GFX942-GISEL-NEXT:    s_nop 1
+; GFX942-GISEL-NEXT:    v_smfmac_f32_16x16x64_bf8_bf8 v[8:11], v[4:5], v[0:3], v6 cbsz:1 abid:2
+; GFX942-GISEL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-GISEL-NEXT:    s_nop 5
+; GFX942-GISEL-NEXT:    global_store_dwordx4 v0, v[8:11], s[12:13]
+; GFX942-GISEL-NEXT:    s_endpgm
 ;
-; GFX942-VGPRCD-GISEL-LABEL: test_smfmac_i32_16x16x64_bf8_bf8:
-; GFX942-VGPRCD-GISEL:       ; %bb.0: ; %bb
-; GFX942-VGPRCD-GISEL-NEXT:    s_load_dwordx2 s[12:13], s[4:5], 0x24
-; GFX942-VGPRCD-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x2c
-; GFX942-VGPRCD-GISEL-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x3c
-; GFX942-VGPRCD-GISEL-NEXT:    s_load_dword s14, s[4:5], 0x44
-; GFX942-VGPRCD-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX942-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[4:5], s[0:1]
-; GFX942-VGPRCD-GISEL-NEXT:    s_load_dwordx4 s[8:11], s[12:13], 0x0
-; GFX942-VGPRCD-GISEL-NEXT:    s_mov_b32 s4, s2
-; GFX942-VGPRCD-GISEL-NEXT:    s_mov_b32 s5, s3
-; GFX942-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[0:1], s[4:5]
-; GFX942-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[2:3], s[6:7]
-; GFX942-VGPRCD-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX942-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[8:9], s[8:9]
-; GFX942-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[10:11], s[10:11]
-; GFX942-VGPRCD-GISEL-NEXT:    v_mov_b32_e32 v6, s14
-; GFX942-VGPRCD-GISEL-NEXT:    s_nop 1
-; GFX942-VGPRCD-GISEL-NEXT:    v_smfmac_f32_16x16x64_bf8_bf8 v[8:11], v[4:5], v[0:3], v6 cbsz:1 abid:2
-; GFX942-VGPRCD-GISEL-NEXT:    v_mov_b32_e32 v0, 0
-; GFX942-VGPRCD-GISEL-NEXT:    s_nop 5
-; GFX942-VGPRCD-GISEL-NEXT:    global_store_dwordx4 v0, v[8:11], s[12:13]
-; GFX942-VGPRCD-GISEL-NEXT:    s_endpgm
+; GFX950-SDAG-LABEL: test_smfmac_i32_16x16x64_bf8_bf8:
+; GFX950-SDAG:       ; %bb.0: ; %bb
+; GFX950-SDAG-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x2c
+; GFX950-SDAG-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x24
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v0, 0
+; GFX950-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v10, s8
+; GFX950-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v11, s9
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v2, s10
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v3, s11
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v4, s12
+; GFX950-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX950-SDAG-NEXT:    v_mov_b64_e32 v[8:9], s[2:3]
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v5, s13
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v1, s14
+; GFX950-SDAG-NEXT:    v_mov_b64_e32 v[6:7], s[0:1]
+; GFX950-SDAG-NEXT:    s_nop 1
+; GFX950-SDAG-NEXT:    v_smfmac_f32_16x16x64_bf8_bf8 v[6:9], v[10:11], v[2:5], v1 cbsz:1 abid:2
+; GFX950-SDAG-NEXT:    s_nop 7
+; GFX950-SDAG-NEXT:    global_store_dwordx4 v0, v[6:9], s[6:7]
+; GFX950-SDAG-NEXT:    s_endpgm
 ;
+; GFX950-GISEL-LABEL: test_smfmac_i32_16x16x64_bf8_bf8:
+; GFX950-GISEL:       ; %bb.0: ; %bb
+; GFX950-GISEL-NEXT:    s_load_dwordx2 s[12:13], s[4:5], 0x24
+; GFX950-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x2c
+; GFX950-GISEL-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x3c
+; GFX950-GISEL-NEXT:    s_load_dword s14, s[4:5], 0x44
+; GFX950-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX950-GISEL-NEXT:    v_mov_b64_e32 v[4:5], s[0:1]
+; GFX950-GISEL-NEXT:    s_load_dwordx4 s[8:11], s[12:13], 0x0
+; GFX950-GISEL-NEXT:    s_mov_b32 s4, s2
+; GFX950-GISEL-NEXT:    s_mov_b32 s5, s3
+; GFX950-GISEL-NEXT:    v_mov_b64_e32 v[0:1], s[4:5]
+; GFX950-GISEL-NEXT:    v_mov_b64_e32 v[2:3], s[6:7]
+; GFX950-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX950-GISEL-NEXT:    v_mov_b64_e32 v[8:9], s[8:9]
+; GFX950-GISEL-NEXT:    v_mov_b64_e32 v[10:11], s[10:11]
+; GFX950-GISEL-NEXT:    v_mov_b32_e32 v6, s14
+; GFX950-GISEL-NEXT:    s_nop 1
+; GFX950-GISEL-NEXT:    v_smfmac_f32_16x16x64_bf8_bf8 v[8:11], v[4:5], v[0:3], v6 cbsz:1 abid:2
+; GFX950-GISEL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX950-GISEL-NEXT:    s_nop 6
+; GFX950-GISEL-NEXT:    global_store_dwordx4 v0, v[8:11], s[12:13]
+; GFX950-GISEL-NEXT:    s_endpgm
 ; GFX942-AGPRCD-SDAG-LABEL: test_smfmac_i32_16x16x64_bf8_bf8:
 ; GFX942-AGPRCD-SDAG:       ; %bb.0: ; %bb
 ; GFX942-AGPRCD-SDAG-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
@@ -2893,7 +2909,6 @@ define amdgpu_kernel void @test_smfmac_i32_16x16x64_bf8_bf8(ptr addrspace(1) %ar
 ; GFX942-AGPRCD-SDAG-NEXT:    s_nop 5
 ; GFX942-AGPRCD-SDAG-NEXT:    global_store_dwordx4 v0, a[0:3], s[0:1]
 ; GFX942-AGPRCD-SDAG-NEXT:    s_endpgm
-;
 ; GFX942-AGPRCD-GISEL-LABEL: test_smfmac_i32_16x16x64_bf8_bf8:
 ; GFX942-AGPRCD-GISEL:       ; %bb.0: ; %bb
 ; GFX942-AGPRCD-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x2c
@@ -2924,54 +2939,6 @@ define amdgpu_kernel void @test_smfmac_i32_16x16x64_bf8_bf8(ptr addrspace(1) %ar
 ; GFX942-AGPRCD-GISEL-NEXT:    s_nop 5
 ; GFX942-AGPRCD-GISEL-NEXT:    global_store_dwordx4 v0, a[0:3], s[6:7]
 ; GFX942-AGPRCD-GISEL-NEXT:    s_endpgm
-;
-; GFX950-VGPRCD-SDAG-LABEL: test_smfmac_i32_16x16x64_bf8_bf8:
-; GFX950-VGPRCD-SDAG:       ; %bb.0: ; %bb
-; GFX950-VGPRCD-SDAG-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x2c
-; GFX950-VGPRCD-SDAG-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x24
-; GFX950-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v0, 0
-; GFX950-VGPRCD-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX950-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v10, s8
-; GFX950-VGPRCD-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GFX950-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v11, s9
-; GFX950-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v2, s10
-; GFX950-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v3, s11
-; GFX950-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v4, s12
-; GFX950-VGPRCD-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX950-VGPRCD-SDAG-NEXT:    v_mov_b64_e32 v[8:9], s[2:3]
-; GFX950-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v5, s13
-; GFX950-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v1, s14
-; GFX950-VGPRCD-SDAG-NEXT:    v_mov_b64_e32 v[6:7], s[0:1]
-; GFX950-VGPRCD-SDAG-NEXT:    s_nop 1
-; GFX950-VGPRCD-SDAG-NEXT:    v_smfmac_f32_16x16x64_bf8_bf8 v[6:9], v[10:11], v[2:5], v1 cbsz:1 abid:2
-; GFX950-VGPRCD-SDAG-NEXT:    s_nop 7
-; GFX950-VGPRCD-SDAG-NEXT:    global_store_dwordx4 v0, v[6:9], s[6:7]
-; GFX950-VGPRCD-SDAG-NEXT:    s_endpgm
-;
-; GFX950-VGPRCD-GISEL-LABEL: test_smfmac_i32_16x16x64_bf8_bf8:
-; GFX950-VGPRCD-GISEL:       ; %bb.0: ; %bb
-; GFX950-VGPRCD-GISEL-NEXT:    s_load_dwordx2 s[12:13], s[4:5], 0x24
-; GFX950-VGPRCD-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x2c
-; GFX950-VGPRCD-GISEL-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x3c
-; GFX950-VGPRCD-GISEL-NEXT:    s_load_dword s14, s[4:5], 0x44
-; GFX950-VGPRCD-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX950-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[4:5], s[0:1]
-; GFX950-VGPRCD-GISEL-NEXT:    s_load_dwordx4 s[8:11], s[12:13], 0x0
-; GFX950-VGPRCD-GISEL-NEXT:    s_mov_b32 s4, s2
-; GFX950-VGPRCD-GISEL-NEXT:    s_mov_b32 s5, s3
-; GFX950-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[0:1], s[4:5]
-; GFX950-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[2:3], s[6:7]
-; GFX950-VGPRCD-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX950-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[8:9], s[8:9]
-; GFX950-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[10:11], s[10:11]
-; GFX950-VGPRCD-GISEL-NEXT:    v_mov_b32_e32 v6, s14
-; GFX950-VGPRCD-GISEL-NEXT:    s_nop 1
-; GFX950-VGPRCD-GISEL-NEXT:    v_smfmac_f32_16x16x64_bf8_bf8 v[8:11], v[4:5], v[0:3], v6 cbsz:1 abid:2
-; GFX950-VGPRCD-GISEL-NEXT:    v_mov_b32_e32 v0, 0
-; GFX950-VGPRCD-GISEL-NEXT:    s_nop 6
-; GFX950-VGPRCD-GISEL-NEXT:    global_store_dwordx4 v0, v[8:11], s[12:13]
-; GFX950-VGPRCD-GISEL-NEXT:    s_endpgm
-;
 ; GFX950-AGPRCD-SDAG-LABEL: test_smfmac_i32_16x16x64_bf8_bf8:
 ; GFX950-AGPRCD-SDAG:       ; %bb.0: ; %bb
 ; GFX950-AGPRCD-SDAG-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
@@ -2997,7 +2964,6 @@ define amdgpu_kernel void @test_smfmac_i32_16x16x64_bf8_bf8(ptr addrspace(1) %ar
 ; GFX950-AGPRCD-SDAG-NEXT:    s_nop 6
 ; GFX950-AGPRCD-SDAG-NEXT:    global_store_dwordx4 v0, a[0:3], s[0:1]
 ; GFX950-AGPRCD-SDAG-NEXT:    s_endpgm
-;
 ; GFX950-AGPRCD-GISEL-LABEL: test_smfmac_i32_16x16x64_bf8_bf8:
 ; GFX950-AGPRCD-GISEL:       ; %bb.0: ; %bb
 ; GFX950-AGPRCD-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x2c
@@ -3036,53 +3002,99 @@ bb:
 }
 
 define amdgpu_kernel void @test_smfmac_i32_16x16x64_bf8_fp8(ptr addrspace(1) %arg, <2 x i32> %a, <4 x i32> %b, i32 %idx) #0 {
-; GFX942-VGPRCD-SDAG-LABEL: test_smfmac_i32_16x16x64_bf8_fp8:
-; GFX942-VGPRCD-SDAG:       ; %bb.0: ; %bb
-; GFX942-VGPRCD-SDAG-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x2c
-; GFX942-VGPRCD-SDAG-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x24
-; GFX942-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v0, 0
-; GFX942-VGPRCD-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX942-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v10, s8
-; GFX942-VGPRCD-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GFX942-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v11, s9
-; GFX942-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v2, s10
-; GFX942-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v3, s11
-; GFX942-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v4, s12
-; GFX942-VGPRCD-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX942-VGPRCD-SDAG-NEXT:    v_mov_b64_e32 v[8:9], s[2:3]
-; GFX942-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v5, s13
-; GFX942-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v1, s14
-; GFX942-VGPRCD-SDAG-NEXT:    v_mov_b64_e32 v[6:7], s[0:1]
-; GFX942-VGPRCD-SDAG-NEXT:    s_nop 1
-; GFX942-VGPRCD-SDAG-NEXT:    v_smfmac_f32_16x16x64_bf8_fp8 v[6:9], v[10:11], v[2:5], v1 cbsz:1 abid:2
-; GFX942-VGPRCD-SDAG-NEXT:    s_nop 6
-; GFX942-VGPRCD-SDAG-NEXT:    global_store_dwordx4 v0, v[6:9], s[6:7]
-; GFX942-VGPRCD-SDAG-NEXT:    s_endpgm
+; GFX942-SDAG-LABEL: test_smfmac_i32_16x16x64_bf8_fp8:
+; GFX942-SDAG:       ; %bb.0: ; %bb
+; GFX942-SDAG-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x2c
+; GFX942-SDAG-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x24
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v10, s8
+; GFX942-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v11, s9
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v2, s10
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v3, s11
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v4, s12
+; GFX942-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-SDAG-NEXT:    v_mov_b64_e32 v[8:9], s[2:3]
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v5, s13
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v1, s14
+; GFX942-SDAG-NEXT:    v_mov_b64_e32 v[6:7], s[0:1]
+; GFX942-SDAG-NEXT:    s_nop 1
+; GFX942-SDAG-NEXT:    v_smfmac_f32_16x16x64_bf8_fp8 v[6:9], v[10:11], v[2:5], v1 cbsz:1 abid:2
+; GFX942-SDAG-NEXT:    s_nop 6
+; GFX942-SDAG-NEXT:    global_store_dwordx4 v0, v[6:9], s[6:7]
+; GFX942-SDAG-NEXT:    s_endpgm
+;
+; GFX942-GISEL-LABEL: test_smfmac_i32_16x16x64_bf8_fp8:
+; GFX942-GISEL:       ; %bb.0: ; %bb
+; GFX942-GISEL-NEXT:    s_load_dwordx2 s[12:13], s[4:5], 0x24
+; GFX942-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x2c
+; GFX942-GISEL-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x3c
+; GFX942-GISEL-NEXT:    s_load_dword s14, s[4:5], 0x44
+; GFX942-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-GISEL-NEXT:    v_mov_b64_e32 v[4:5], s[0:1]
+; GFX942-GISEL-NEXT:    s_load_dwordx4 s[8:11], s[12:13], 0x0
+; GFX942-GISEL-NEXT:    s_mov_b32 s4, s2
+; GFX942-GISEL-NEXT:    s_mov_b32 s5, s3
+; GFX942-GISEL-NEXT:    v_mov_b64_e32 v[0:1], s[4:5]
+; GFX942-GISEL-NEXT:    v_mov_b64_e32 v[2:3], s[6:7]
+; GFX942-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-GISEL-NEXT:    v_mov_b64_e32 v[8:9], s[8:9]
+; GFX942-GISEL-NEXT:    v_mov_b64_e32 v[10:11], s[10:11]
+; GFX942-GISEL-NEXT:    v_mov_b32_e32 v6, s14
+; GFX942-GISEL-NEXT:    s_nop 1
+; GFX942-GISEL-NEXT:    v_smfmac_f32_16x16x64_bf8_fp8 v[8:11], v[4:5], v[0:3], v6 cbsz:1 abid:2
+; GFX942-GISEL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-GISEL-NEXT:    s_nop 5
+; GFX942-GISEL-NEXT:    global_store_dwordx4 v0, v[8:11], s[12:13]
+; GFX942-GISEL-NEXT:    s_endpgm
 ;
-; GFX942-VGPRCD-GISEL-LABEL: test_smfmac_i32_16x16x64_bf8_fp8:
-; GFX942-VGPRCD-GISEL:       ; %bb.0: ; %bb
-; GFX942-VGPRCD-GISEL-NEXT:    s_load_dwordx2 s[12:13], s[4:5], 0x24
-; GFX942-VGPRCD-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x2c
-; GFX942-VGPRCD-GISEL-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x3c
-; GFX942-VGPRCD-GISEL-NEXT:    s_load_dword s14, s[4:5], 0x44
-; GFX942-VGPRCD-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX942-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[4:5], s[0:1]
-; GFX942-VGPRCD-GISEL-NEXT:    s_load_dwordx4 s[8:11], s[12:13], 0x0
-; GFX942-VGPRCD-GISEL-NEXT:    s_mov_b32 s4, s2
-; GFX942-VGPRCD-GISEL-NEXT:    s_mov_b32 s5, s3
-; GFX942-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[0:1], s[4:5]
-; GFX942-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[2:3], s[6:7]
-; GFX942-VGPRCD-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX942-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[8:9], s[8:9]
-; GFX942-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[10:11], s[10:11]
-; GFX942-VGPRCD-GISEL-NEXT:    v_mov_b32_e32 v6, s14
-; GFX942-VGPRCD-GISEL-NEXT:    s_nop 1
-; GFX942-VGPRCD-GISEL-NEXT:    v_smfmac_f32_16x16x64_bf8_fp8 v[8:11], v[4:5], v[0:3], v6 cbsz:1 abid:2
-; GFX942-VGPRCD-GISEL-NEXT:    v_mov_b32_e32 v0, 0
-; GFX942-VGPRCD-GISEL-NEXT:    s_nop 5
-; GFX942-VGPRCD-GISEL-NEXT:    global_store_dwordx4 v0, v[8:11], s[12:13]
-; GFX942-VGPRCD-GISEL-NEXT:    s_endpgm
+; GFX950-SDAG-LABEL: test_smfmac_i32_16x16x64_bf8_fp8:
+; GFX950-SDAG:       ; %bb.0: ; %bb
+; GFX950-SDAG-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x2c
+; GFX950-SDAG-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x24
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v0, 0
+; GFX950-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v10, s8
+; GFX950-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v11, s9
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v2, s10
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v3, s11
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v4, s12
+; GFX950-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX950-SDAG-NEXT:    v_mov_b64_e32 v[8:9], s[2:3]
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v5, s13
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v1, s14
+; GFX950-SDAG-NEXT:    v_mov_b64_e32 v[6:7], s[0:1]
+; GFX950-SDAG-NEXT:    s_nop 1
+; GFX950-SDAG-NEXT:    v_smfmac_f32_16x16x64_bf8_fp8 v[6:9], v[10:11], v[2:5], v1 cbsz:1 abid:2
+; GFX950-SDAG-NEXT:    s_nop 7
+; GFX950-SDAG-NEXT:    global_store_dwordx4 v0, v[6:9], s[6:7]
+; GFX950-SDAG-NEXT:    s_endpgm
 ;
+; GFX950-GISEL-LABEL: test_smfmac_i32_16x16x64_bf8_fp8:
+; GFX950-GISEL:       ; %bb.0: ; %bb
+; GFX950-GISEL-NEXT:    s_load_dwordx2 s[12:13], s[4:5], 0x24
+; GFX950-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x2c
+; GFX950-GISEL-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x3c
+; GFX950-GISEL-NEXT:    s_load_dword s14, s[4:5], 0x44
+; GFX950-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX950-GISEL-NEXT:    v_mov_b64_e32 v[4:5], s[0:1]
+; GFX950-GISEL-NEXT:    s_load_dwordx4 s[8:11], s[12:13], 0x0
+; GFX950-GISEL-NEXT:    s_mov_b32 s4, s2
+; GFX950-GISEL-NEXT:    s_mov_b32 s5, s3
+; GFX950-GISEL-NEXT:    v_mov_b64_e32 v[0:1], s[4:5]
+; GFX950-GISEL-NEXT:    v_mov_b64_e32 v[2:3], s[6:7]
+; GFX950-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX950-GISEL-NEXT:    v_mov_b64_e32 v[8:9], s[8:9]
+; GFX950-GISEL-NEXT:    v_mov_b64_e32 v[10:11], s[10:11]
+; GFX950-GISEL-NEXT:    v_mov_b32_e32 v6, s14
+; GFX950-GISEL-NEXT:    s_nop 1
+; GFX950-GISEL-NEXT:    v_smfmac_f32_16x16x64_bf8_fp8 v[8:11], v[4:5], v[0:3], v6 cbsz:1 abid:2
+; GFX950-GISEL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX950-GISEL-NEXT:    s_nop 6
+; GFX950-GISEL-NEXT:    global_store_dwordx4 v0, v[8:11], s[12:13]
+; GFX950-GISEL-NEXT:    s_endpgm
 ; GFX942-AGPRCD-SDAG-LABEL: test_smfmac_i32_16x16x64_bf8_fp8:
 ; GFX942-AGPRCD-SDAG:       ; %bb.0: ; %bb
 ; GFX942-AGPRCD-SDAG-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
@@ -3108,7 +3120,6 @@ define amdgpu_kernel void @test_smfmac_i32_16x16x64_bf8_fp8(ptr addrspace(1) %ar
 ; GFX942-AGPRCD-SDAG-NEXT:    s_nop 5
 ; GFX942-AGPRCD-SDAG-NEXT:    global_store_dwordx4 v0, a[0:3], s[0:1]
 ; GFX942-AGPRCD-SDAG-NEXT:    s_endpgm
-;
 ; GFX942-AGPRCD-GISEL-LABEL: test_smfmac_i32_16x16x64_bf8_fp8:
 ; GFX942-AGPRCD-GISEL:       ; %bb.0: ; %bb
 ; GFX942-AGPRCD-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x2c
@@ -3139,54 +3150,6 @@ define amdgpu_kernel void @test_smfmac_i32_16x16x64_bf8_fp8(ptr addrspace(1) %ar
 ; GFX942-AGPRCD-GISEL-NEXT:    s_nop 5
 ; GFX942-AGPRCD-GISEL-NEXT:    global_store_dwordx4 v0, a[0:3], s[6:7]
 ; GFX942-AGPRCD-GISEL-NEXT:    s_endpgm
-;
-; GFX950-VGPRCD-SDAG-LABEL: test_smfmac_i32_16x16x64_bf8_fp8:
-; GFX950-VGPRCD-SDAG:       ; %bb.0: ; %bb
-; GFX950-VGPRCD-SDAG-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x2c
-; GFX950-VGPRCD-SDAG-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x24
-; GFX950-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v0, 0
-; GFX950-VGPRCD-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX950-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v10, s8
-; GFX950-VGPRCD-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GFX950-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v11, s9
-; GFX950-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v2, s10
-; GFX950-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v3, s11
-; GFX950-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v4, s12
-; GFX950-VGPRCD-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX950-VGPRCD-SDAG-NEXT:    v_mov_b64_e32 v[8:9], s[2:3]
-; GFX950-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v5, s13
-; GFX950-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v1, s14
-; GFX950-VGPRCD-SDAG-NEXT:    v_mov_b64_e32 v[6:7], s[0:1]
-; GFX950-VGPRCD-SDAG-NEXT:    s_nop 1
-; GFX950-VGPRCD-SDAG-NEXT:    v_smfmac_f32_16x16x64_bf8_fp8 v[6:9], v[10:11], v[2:5], v1 cbsz:1 abid:2
-; GFX950-VGPRCD-SDAG-NEXT:    s_nop 7
-; GFX950-VGPRCD-SDAG-NEXT:    global_store_dwordx4 v0, v[6:9], s[6:7]
-; GFX950-VGPRCD-SDAG-NEXT:    s_endpgm
-;
-; GFX950-VGPRCD-GISEL-LABEL: test_smfmac_i32_16x16x64_bf8_fp8:
-; GFX950-VGPRCD-GISEL:       ; %bb.0: ; %bb
-; GFX950-VGPRCD-GISEL-NEXT:    s_load_dwordx2 s[12:13], s[4:5], 0x24
-; GFX950-VGPRCD-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x2c
-; GFX950-VGPRCD-GISEL-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x3c
-; GFX950-VGPRCD-GISEL-NEXT:    s_load_dword s14, s[4:5], 0x44
-; GFX950-VGPRCD-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX950-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[4:5], s[0:1]
-; GFX950-VGPRCD-GISEL-NEXT:    s_load_dwordx4 s[8:11], s[12:13], 0x0
-; GFX950-VGPRCD-GISEL-NEXT:    s_mov_b32 s4, s2
-; GFX950-VGPRCD-GISEL-NEXT:    s_mov_b32 s5, s3
-; GFX950-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[0:1], s[4:5]
-; GFX950-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[2:3], s[6:7]
-; GFX950-VGPRCD-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX950-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[8:9], s[8:9]
-; GFX950-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[10:11], s[10:11]
-; GFX950-VGPRCD-GISEL-NEXT:    v_mov_b32_e32 v6, s14
-; GFX950-VGPRCD-GISEL-NEXT:    s_nop 1
-; GFX950-VGPRCD-GISEL-NEXT:    v_smfmac_f32_16x16x64_bf8_fp8 v[8:11], v[4:5], v[0:3], v6 cbsz:1 abid:2
-; GFX950-VGPRCD-GISEL-NEXT:    v_mov_b32_e32 v0, 0
-; GFX950-VGPRCD-GISEL-NEXT:    s_nop 6
-; GFX950-VGPRCD-GISEL-NEXT:    global_store_dwordx4 v0, v[8:11], s[12:13]
-; GFX950-VGPRCD-GISEL-NEXT:    s_endpgm
-;
 ; GFX950-AGPRCD-SDAG-LABEL: test_smfmac_i32_16x16x64_bf8_fp8:
 ; GFX950-AGPRCD-SDAG:       ; %bb.0: ; %bb
 ; GFX950-AGPRCD-SDAG-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
@@ -3212,7 +3175,6 @@ define amdgpu_kernel void @test_smfmac_i32_16x16x64_bf8_fp8(ptr addrspace(1) %ar
 ; GFX950-AGPRCD-SDAG-NEXT:    s_nop 6
 ; GFX950-AGPRCD-SDAG-NEXT:    global_store_dwordx4 v0, a[0:3], s[0:1]
 ; GFX950-AGPRCD-SDAG-NEXT:    s_endpgm
-;
 ; GFX950-AGPRCD-GISEL-LABEL: test_smfmac_i32_16x16x64_bf8_fp8:
 ; GFX950-AGPRCD-GISEL:       ; %bb.0: ; %bb
 ; GFX950-AGPRCD-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x2c
@@ -3251,53 +3213,99 @@ bb:
 }
 
 define amdgpu_kernel void @test_smfmac_i32_16x16x64_fp8_bf8(ptr addrspace(1) %arg, <2 x i32> %a, <4 x i32> %b, i32 %idx) #0 {
-; GFX942-VGPRCD-SDAG-LABEL: test_smfmac_i32_16x16x64_fp8_bf8:
-; GFX942-VGPRCD-SDAG:       ; %bb.0: ; %bb
-; GFX942-VGPRCD-SDAG-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x2c
-; GFX942-VGPRCD-SDAG-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x24
-; GFX942-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v0, 0
-; GFX942-VGPRCD-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX942-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v10, s8
-; GFX942-VGPRCD-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GFX942-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v11, s9
-; GFX942-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v2, s10
-; GFX942-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v3, s11
-; GFX942-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v4, s12
-; GFX942-VGPRCD-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX942-VGPRCD-SDAG-NEXT:    v_mov_b64_e32 v[8:9], s[2:3]
-; GFX942-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v5, s13
-; GFX942-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v1, s14
-; GFX942-VGPRCD-SDAG-NEXT:    v_mov_b64_e32 v[6:7], s[0:1]
-; GFX942-VGPRCD-SDAG-NEXT:    s_nop 1
-; GFX942-VGPRCD-SDAG-NEXT:    v_smfmac_f32_16x16x64_fp8_bf8 v[6:9], v[10:11], v[2:5], v1 cbsz:1 abid:2
-; GFX942-VGPRCD-SDAG-NEXT:    s_nop 6
-; GFX942-VGPRCD-SDAG-NEXT:    global_store_dwordx4 v0, v[6:9], s[6:7]
-; GFX942-VGPRCD-SDAG-NEXT:    s_endpgm
+; GFX942-SDAG-LABEL: test_smfmac_i32_16x16x64_fp8_bf8:
+; GFX942-SDAG:       ; %bb.0: ; %bb
+; GFX942-SDAG-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x2c
+; GFX942-SDAG-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x24
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v10, s8
+; GFX942-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v11, s9
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v2, s10
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v3, s11
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v4, s12
+; GFX942-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-SDAG-NEXT:    v_mov_b64_e32 v[8:9], s[2:3]
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v5, s13
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v1, s14
+; GFX942-SDAG-NEXT:    v_mov_b64_e32 v[6:7], s[0:1]
+; GFX942-SDAG-NEXT:    s_nop 1
+; GFX942-SDAG-NEXT:    v_smfmac_f32_16x16x64_fp8_bf8 v[6:9], v[10:11], v[2:5], v1 cbsz:1 abid:2
+; GFX942-SDAG-NEXT:    s_nop 6
+; GFX942-SDAG-NEXT:    global_store_dwordx4 v0, v[6:9], s[6:7]
+; GFX942-SDAG-NEXT:    s_endpgm
+;
+; GFX942-GISEL-LABEL: test_smfmac_i32_16x16x64_fp8_bf8:
+; GFX942-GISEL:       ; %bb.0: ; %bb
+; GFX942-GISEL-NEXT:    s_load_dwordx2 s[12:13], s[4:5], 0x24
+; GFX942-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x2c
+; GFX942-GISEL-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x3c
+; GFX942-GISEL-NEXT:    s_load_dword s14, s[4:5], 0x44
+; GFX942-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-GISEL-NEXT:    v_mov_b64_e32 v[4:5], s[0:1]
+; GFX942-GISEL-NEXT:    s_load_dwordx4 s[8:11], s[12:13], 0x0
+; GFX942-GISEL-NEXT:    s_mov_b32 s4, s2
+; GFX942-GISEL-NEXT:    s_mov_b32 s5, s3
+; GFX942-GISEL-NEXT:    v_mov_b64_e32 v[0:1], s[4:5]
+; GFX942-GISEL-NEXT:    v_mov_b64_e32 v[2:3], s[6:7]
+; GFX942-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-GISEL-NEXT:    v_mov_b64_e32 v[8:9], s[8:9]
+; GFX942-GISEL-NEXT:    v_mov_b64_e32 v[10:11], s[10:11]
+; GFX942-GISEL-NEXT:    v_mov_b32_e32 v6, s14
+; GFX942-GISEL-NEXT:    s_nop 1
+; GFX942-GISEL-NEXT:    v_smfmac_f32_16x16x64_fp8_bf8 v[8:11], v[4:5], v[0:3], v6 cbsz:1 abid:2
+; GFX942-GISEL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-GISEL-NEXT:    s_nop 5
+; GFX942-GISEL-NEXT:    global_store_dwordx4 v0, v[8:11], s[12:13]
+; GFX942-GISEL-NEXT:    s_endpgm
 ;
-; GFX942-VGPRCD-GISEL-LABEL: test_smfmac_i32_16x16x64_fp8_bf8:
-; GFX942-VGPRCD-GISEL:       ; %bb.0: ; %bb
-; GFX942-VGPRCD-GISEL-NEXT:    s_load_dwordx2 s[12:13], s[4:5], 0x24
-; GFX942-VGPRCD-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x2c
-; GFX942-VGPRCD-GISEL-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x3c
-; GFX942-VGPRCD-GISEL-NEXT:    s_load_dword s14, s[4:5], 0x44
-; GFX942-VGPRCD-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX942-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[4:5], s[0:1]
-; GFX942-VGPRCD-GISEL-NEXT:    s_load_dwordx4 s[8:11], s[12:13], 0x0
-; GFX942-VGPRCD-GISEL-NEXT:    s_mov_b32 s4, s2
-; GFX942-VGPRCD-GISEL-NEXT:    s_mov_b32 s5, s3
-; GFX942-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[0:1], s[4:5]
-; GFX942-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[2:3], s[6:7]
-; GFX942-VGPRCD-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX942-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[8:9], s[8:9]
-; GFX942-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[10:11], s[10:11]
-; GFX942-VGPRCD-GISEL-NEXT:    v_mov_b32_e32 v6, s14
-; GFX942-VGPRCD-GISEL-NEXT:    s_nop 1
-; GFX942-VGPRCD-GISEL-NEXT:    v_smfmac_f32_16x16x64_fp8_bf8 v[8:11], v[4:5], v[0:3], v6 cbsz:1 abid:2
-; GFX942-VGPRCD-GISEL-NEXT:    v_mov_b32_e32 v0, 0
-; GFX942-VGPRCD-GISEL-NEXT:    s_nop 5
-; GFX942-VGPRCD-GISEL-NEXT:    global_store_dwordx4 v0, v[8:11], s[12:13]
-; GFX942-VGPRCD-GISEL-NEXT:    s_endpgm
+; GFX950-SDAG-LABEL: test_smfmac_i32_16x16x64_fp8_bf8:
+; GFX950-SDAG:       ; %bb.0: ; %bb
+; GFX950-SDAG-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x2c
+; GFX950-SDAG-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x24
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v0, 0
+; GFX950-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v10, s8
+; GFX950-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v11, s9
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v2, s10
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v3, s11
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v4, s12
+; GFX950-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX950-SDAG-NEXT:    v_mov_b64_e32 v[8:9], s[2:3]
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v5, s13
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v1, s14
+; GFX950-SDAG-NEXT:    v_mov_b64_e32 v[6:7], s[0:1]
+; GFX950-SDAG-NEXT:    s_nop 1
+; GFX950-SDAG-NEXT:    v_smfmac_f32_16x16x64_fp8_bf8 v[6:9], v[10:11], v[2:5], v1 cbsz:1 abid:2
+; GFX950-SDAG-NEXT:    s_nop 7
+; GFX950-SDAG-NEXT:    global_store_dwordx4 v0, v[6:9], s[6:7]
+; GFX950-SDAG-NEXT:    s_endpgm
 ;
+; GFX950-GISEL-LABEL: test_smfmac_i32_16x16x64_fp8_bf8:
+; GFX950-GISEL:       ; %bb.0: ; %bb
+; GFX950-GISEL-NEXT:    s_load_dwordx2 s[12:13], s[4:5], 0x24
+; GFX950-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x2c
+; GFX950-GISEL-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x3c
+; GFX950-GISEL-NEXT:    s_load_dword s14, s[4:5], 0x44
+; GFX950-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX950-GISEL-NEXT:    v_mov_b64_e32 v[4:5], s[0:1]
+; GFX950-GISEL-NEXT:    s_load_dwordx4 s[8:11], s[12:13], 0x0
+; GFX950-GISEL-NEXT:    s_mov_b32 s4, s2
+; GFX950-GISEL-NEXT:    s_mov_b32 s5, s3
+; GFX950-GISEL-NEXT:    v_mov_b64_e32 v[0:1], s[4:5]
+; GFX950-GISEL-NEXT:    v_mov_b64_e32 v[2:3], s[6:7]
+; GFX950-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX950-GISEL-NEXT:    v_mov_b64_e32 v[8:9], s[8:9]
+; GFX950-GISEL-NEXT:    v_mov_b64_e32 v[10:11], s[10:11]
+; GFX950-GISEL-NEXT:    v_mov_b32_e32 v6, s14
+; GFX950-GISEL-NEXT:    s_nop 1
+; GFX950-GISEL-NEXT:    v_smfmac_f32_16x16x64_fp8_bf8 v[8:11], v[4:5], v[0:3], v6 cbsz:1 abid:2
+; GFX950-GISEL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX950-GISEL-NEXT:    s_nop 6
+; GFX950-GISEL-NEXT:    global_store_dwordx4 v0, v[8:11], s[12:13]
+; GFX950-GISEL-NEXT:    s_endpgm
 ; GFX942-AGPRCD-SDAG-LABEL: test_smfmac_i32_16x16x64_fp8_bf8:
 ; GFX942-AGPRCD-SDAG:       ; %bb.0: ; %bb
 ; GFX942-AGPRCD-SDAG-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
@@ -3323,7 +3331,6 @@ define amdgpu_kernel void @test_smfmac_i32_16x16x64_fp8_bf8(ptr addrspace(1) %ar
 ; GFX942-AGPRCD-SDAG-NEXT:    s_nop 5
 ; GFX942-AGPRCD-SDAG-NEXT:    global_store_dwordx4 v0, a[0:3], s[0:1]
 ; GFX942-AGPRCD-SDAG-NEXT:    s_endpgm
-;
 ; GFX942-AGPRCD-GISEL-LABEL: test_smfmac_i32_16x16x64_fp8_bf8:
 ; GFX942-AGPRCD-GISEL:       ; %bb.0: ; %bb
 ; GFX942-AGPRCD-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x2c
@@ -3354,54 +3361,6 @@ define amdgpu_kernel void @test_smfmac_i32_16x16x64_fp8_bf8(ptr addrspace(1) %ar
 ; GFX942-AGPRCD-GISEL-NEXT:    s_nop 5
 ; GFX942-AGPRCD-GISEL-NEXT:    global_store_dwordx4 v0, a[0:3], s[6:7]
 ; GFX942-AGPRCD-GISEL-NEXT:    s_endpgm
-;
-; GFX950-VGPRCD-SDAG-LABEL: test_smfmac_i32_16x16x64_fp8_bf8:
-; GFX950-VGPRCD-SDAG:       ; %bb.0: ; %bb
-; GFX950-VGPRCD-SDAG-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x2c
-; GFX950-VGPRCD-SDAG-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x24
-; GFX950-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v0, 0
-; GFX950-VGPRCD-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX950-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v10, s8
-; GFX950-VGPRCD-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GFX950-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v11, s9
-; GFX950-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v2, s10
-; GFX950-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v3, s11
-; GFX950-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v4, s12
-; GFX950-VGPRCD-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX950-VGPRCD-SDAG-NEXT:    v_mov_b64_e32 v[8:9], s[2:3]
-; GFX950-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v5, s13
-; GFX950-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v1, s14
-; GFX950-VGPRCD-SDAG-NEXT:    v_mov_b64_e32 v[6:7], s[0:1]
-; GFX950-VGPRCD-SDAG-NEXT:    s_nop 1
-; GFX950-VGPRCD-SDAG-NEXT:    v_smfmac_f32_16x16x64_fp8_bf8 v[6:9], v[10:11], v[2:5], v1 cbsz:1 abid:2
-; GFX950-VGPRCD-SDAG-NEXT:    s_nop 7
-; GFX950-VGPRCD-SDAG-NEXT:    global_store_dwordx4 v0, v[6:9], s[6:7]
-; GFX950-VGPRCD-SDAG-NEXT:    s_endpgm
-;
-; GFX950-VGPRCD-GISEL-LABEL: test_smfmac_i32_16x16x64_fp8_bf8:
-; GFX950-VGPRCD-GISEL:       ; %bb.0: ; %bb
-; GFX950-VGPRCD-GISEL-NEXT:    s_load_dwordx2 s[12:13], s[4:5], 0x24
-; GFX950-VGPRCD-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x2c
-; GFX950-VGPRCD-GISEL-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x3c
-; GFX950-VGPRCD-GISEL-NEXT:    s_load_dword s14, s[4:5], 0x44
-; GFX950-VGPRCD-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX950-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[4:5], s[0:1]
-; GFX950-VGPRCD-GISEL-NEXT:    s_load_dwordx4 s[8:11], s[12:13], 0x0
-; GFX950-VGPRCD-GISEL-NEXT:    s_mov_b32 s4, s2
-; GFX950-VGPRCD-GISEL-NEXT:    s_mov_b32 s5, s3
-; GFX950-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[0:1], s[4:5]
-; GFX950-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[2:3], s[6:7]
-; GFX950-VGPRCD-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX950-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[8:9], s[8:9]
-; GFX950-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[10:11], s[10:11]
-; GFX950-VGPRCD-GISEL-NEXT:    v_mov_b32_e32 v6, s14
-; GFX950-VGPRCD-GISEL-NEXT:    s_nop 1
-; GFX950-VGPRCD-GISEL-NEXT:    v_smfmac_f32_16x16x64_fp8_bf8 v[8:11], v[4:5], v[0:3], v6 cbsz:1 abid:2
-; GFX950-VGPRCD-GISEL-NEXT:    v_mov_b32_e32 v0, 0
-; GFX950-VGPRCD-GISEL-NEXT:    s_nop 6
-; GFX950-VGPRCD-GISEL-NEXT:    global_store_dwordx4 v0, v[8:11], s[12:13]
-; GFX950-VGPRCD-GISEL-NEXT:    s_endpgm
-;
 ; GFX950-AGPRCD-SDAG-LABEL: test_smfmac_i32_16x16x64_fp8_bf8:
 ; GFX950-AGPRCD-SDAG:       ; %bb.0: ; %bb
 ; GFX950-AGPRCD-SDAG-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
@@ -3427,7 +3386,6 @@ define amdgpu_kernel void @test_smfmac_i32_16x16x64_fp8_bf8(ptr addrspace(1) %ar
 ; GFX950-AGPRCD-SDAG-NEXT:    s_nop 6
 ; GFX950-AGPRCD-SDAG-NEXT:    global_store_dwordx4 v0, a[0:3], s[0:1]
 ; GFX950-AGPRCD-SDAG-NEXT:    s_endpgm
-;
 ; GFX950-AGPRCD-GISEL-LABEL: test_smfmac_i32_16x16x64_fp8_bf8:
 ; GFX950-AGPRCD-GISEL:       ; %bb.0: ; %bb
 ; GFX950-AGPRCD-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x2c
@@ -3466,53 +3424,99 @@ bb:
 }
 
 define amdgpu_kernel void @test_smfmac_i32_16x16x64_fp8_fp8(ptr addrspace(1) %arg, <2 x i32> %a, <4 x i32> %b, i32 %idx) #0 {
-; GFX942-VGPRCD-SDAG-LABEL: test_smfmac_i32_16x16x64_fp8_fp8:
-; GFX942-VGPRCD-SDAG:       ; %bb.0: ; %bb
-; GFX942-VGPRCD-SDAG-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x2c
-; GFX942-VGPRCD-SDAG-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x24
-; GFX942-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v0, 0
-; GFX942-VGPRCD-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX942-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v10, s8
-; GFX942-VGPRCD-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GFX942-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v11, s9
-; GFX942-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v2, s10
-; GFX942-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v3, s11
-; GFX942-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v4, s12
-; GFX942-VGPRCD-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX942-VGPRCD-SDAG-NEXT:    v_mov_b64_e32 v[8:9], s[2:3]
-; GFX942-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v5, s13
-; GFX942-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v1, s14
-; GFX942-VGPRCD-SDAG-NEXT:    v_mov_b64_e32 v[6:7], s[0:1]
-; GFX942-VGPRCD-SDAG-NEXT:    s_nop 1
-; GFX942-VGPRCD-SDAG-NEXT:    v_smfmac_f32_16x16x64_fp8_fp8 v[6:9], v[10:11], v[2:5], v1 cbsz:1 abid:2
-; GFX942-VGPRCD-SDAG-NEXT:    s_nop 6
-; GFX942-VGPRCD-SDAG-NEXT:    global_store_dwordx4 v0, v[6:9], s[6:7]
-; GFX942-VGPRCD-SDAG-NEXT:    s_endpgm
+; GFX942-SDAG-LABEL: test_smfmac_i32_16x16x64_fp8_fp8:
+; GFX942-SDAG:       ; %bb.0: ; %bb
+; GFX942-SDAG-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x2c
+; GFX942-SDAG-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x24
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v10, s8
+; GFX942-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v11, s9
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v2, s10
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v3, s11
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v4, s12
+; GFX942-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-SDAG-NEXT:    v_mov_b64_e32 v[8:9], s[2:3]
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v5, s13
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v1, s14
+; GFX942-SDAG-NEXT:    v_mov_b64_e32 v[6:7], s[0:1]
+; GFX942-SDAG-NEXT:    s_nop 1
+; GFX942-SDAG-NEXT:    v_smfmac_f32_16x16x64_fp8_fp8 v[6:9], v[10:11], v[2:5], v1 cbsz:1 abid:2
+; GFX942-SDAG-NEXT:    s_nop 6
+; GFX942-SDAG-NEXT:    global_store_dwordx4 v0, v[6:9], s[6:7]
+; GFX942-SDAG-NEXT:    s_endpgm
+;
+; GFX942-GISEL-LABEL: test_smfmac_i32_16x16x64_fp8_fp8:
+; GFX942-GISEL:       ; %bb.0: ; %bb
+; GFX942-GISEL-NEXT:    s_load_dwordx2 s[12:13], s[4:5], 0x24
+; GFX942-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x2c
+; GFX942-GISEL-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x3c
+; GFX942-GISEL-NEXT:    s_load_dword s14, s[4:5], 0x44
+; GFX942-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-GISEL-NEXT:    v_mov_b64_e32 v[4:5], s[0:1]
+; GFX942-GISEL-NEXT:    s_load_dwordx4 s[8:11], s[12:13], 0x0
+; GFX942-GISEL-NEXT:    s_mov_b32 s4, s2
+; GFX942-GISEL-NEXT:    s_mov_b32 s5, s3
+; GFX942-GISEL-NEXT:    v_mov_b64_e32 v[0:1], s[4:5]
+; GFX942-GISEL-NEXT:    v_mov_b64_e32 v[2:3], s[6:7]
+; GFX942-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-GISEL-NEXT:    v_mov_b64_e32 v[8:9], s[8:9]
+; GFX942-GISEL-NEXT:    v_mov_b64_e32 v[10:11], s[10:11]
+; GFX942-GISEL-NEXT:    v_mov_b32_e32 v6, s14
+; GFX942-GISEL-NEXT:    s_nop 1
+; GFX942-GISEL-NEXT:    v_smfmac_f32_16x16x64_fp8_fp8 v[8:11], v[4:5], v[0:3], v6 cbsz:1 abid:2
+; GFX942-GISEL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-GISEL-NEXT:    s_nop 5
+; GFX942-GISEL-NEXT:    global_store_dwordx4 v0, v[8:11], s[12:13]
+; GFX942-GISEL-NEXT:    s_endpgm
 ;
-; GFX942-VGPRCD-GISEL-LABEL: test_smfmac_i32_16x16x64_fp8_fp8:
-; GFX942-VGPRCD-GISEL:       ; %bb.0: ; %bb
-; GFX942-VGPRCD-GISEL-NEXT:    s_load_dwordx2 s[12:13], s[4:5], 0x24
-; GFX942-VGPRCD-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x2c
-; GFX942-VGPRCD-GISEL-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x3c
-; GFX942-VGPRCD-GISEL-NEXT:    s_load_dword s14, s[4:5], 0x44
-; GFX942-VGPRCD-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX942-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[4:5], s[0:1]
-; GFX942-VGPRCD-GISEL-NEXT:    s_load_dwordx4 s[8:11], s[12:13], 0x0
-; GFX942-VGPRCD-GISEL-NEXT:    s_mov_b32 s4, s2
-; GFX942-VGPRCD-GISEL-NEXT:    s_mov_b32 s5, s3
-; GFX942-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[0:1], s[4:5]
-; GFX942-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[2:3], s[6:7]
-; GFX942-VGPRCD-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX942-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[8:9], s[8:9]
-; GFX942-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[10:11], s[10:11]
-; GFX942-VGPRCD-GISEL-NEXT:    v_mov_b32_e32 v6, s14
-; GFX942-VGPRCD-GISEL-NEXT:    s_nop 1
-; GFX942-VGPRCD-GISEL-NEXT:    v_smfmac_f32_16x16x64_fp8_fp8 v[8:11], v[4:5], v[0:3], v6 cbsz:1 abid:2
-; GFX942-VGPRCD-GISEL-NEXT:    v_mov_b32_e32 v0, 0
-; GFX942-VGPRCD-GISEL-NEXT:    s_nop 5
-; GFX942-VGPRCD-GISEL-NEXT:    global_store_dwordx4 v0, v[8:11], s[12:13]
-; GFX942-VGPRCD-GISEL-NEXT:    s_endpgm
+; GFX950-SDAG-LABEL: test_smfmac_i32_16x16x64_fp8_fp8:
+; GFX950-SDAG:       ; %bb.0: ; %bb
+; GFX950-SDAG-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x2c
+; GFX950-SDAG-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x24
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v0, 0
+; GFX950-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v10, s8
+; GFX950-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v11, s9
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v2, s10
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v3, s11
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v4, s12
+; GFX950-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX950-SDAG-NEXT:    v_mov_b64_e32 v[8:9], s[2:3]
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v5, s13
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v1, s14
+; GFX950-SDAG-NEXT:    v_mov_b64_e32 v[6:7], s[0:1]
+; GFX950-SDAG-NEXT:    s_nop 1
+; GFX950-SDAG-NEXT:    v_smfmac_f32_16x16x64_fp8_fp8 v[6:9], v[10:11], v[2:5], v1 cbsz:1 abid:2
+; GFX950-SDAG-NEXT:    s_nop 7
+; GFX950-SDAG-NEXT:    global_store_dwordx4 v0, v[6:9], s[6:7]
+; GFX950-SDAG-NEXT:    s_endpgm
 ;
+; GFX950-GISEL-LABEL: test_smfmac_i32_16x16x64_fp8_fp8:
+; GFX950-GISEL:       ; %bb.0: ; %bb
+; GFX950-GISEL-NEXT:    s_load_dwordx2 s[12:13], s[4:5], 0x24
+; GFX950-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x2c
+; GFX950-GISEL-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x3c
+; GFX950-GISEL-NEXT:    s_load_dword s14, s[4:5], 0x44
+; GFX950-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX950-GISEL-NEXT:    v_mov_b64_e32 v[4:5], s[0:1]
+; GFX950-GISEL-NEXT:    s_load_dwordx4 s[8:11], s[12:13], 0x0
+; GFX950-GISEL-NEXT:    s_mov_b32 s4, s2
+; GFX950-GISEL-NEXT:    s_mov_b32 s5, s3
+; GFX950-GISEL-NEXT:    v_mov_b64_e32 v[0:1], s[4:5]
+; GFX950-GISEL-NEXT:    v_mov_b64_e32 v[2:3], s[6:7]
+; GFX950-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX950-GISEL-NEXT:    v_mov_b64_e32 v[8:9], s[8:9]
+; GFX950-GISEL-NEXT:    v_mov_b64_e32 v[10:11], s[10:11]
+; GFX950-GISEL-NEXT:    v_mov_b32_e32 v6, s14
+; GFX950-GISEL-NEXT:    s_nop 1
+; GFX950-GISEL-NEXT:    v_smfmac_f32_16x16x64_fp8_fp8 v[8:11], v[4:5], v[0:3], v6 cbsz:1 abid:2
+; GFX950-GISEL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX950-GISEL-NEXT:    s_nop 6
+; GFX950-GISEL-NEXT:    global_store_dwordx4 v0, v[8:11], s[12:13]
+; GFX950-GISEL-NEXT:    s_endpgm
 ; GFX942-AGPRCD-SDAG-LABEL: test_smfmac_i32_16x16x64_fp8_fp8:
 ; GFX942-AGPRCD-SDAG:       ; %bb.0: ; %bb
 ; GFX942-AGPRCD-SDAG-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
@@ -3538,7 +3542,6 @@ define amdgpu_kernel void @test_smfmac_i32_16x16x64_fp8_fp8(ptr addrspace(1) %ar
 ; GFX942-AGPRCD-SDAG-NEXT:    s_nop 5
 ; GFX942-AGPRCD-SDAG-NEXT:    global_store_dwordx4 v0, a[0:3], s[0:1]
 ; GFX942-AGPRCD-SDAG-NEXT:    s_endpgm
-;
 ; GFX942-AGPRCD-GISEL-LABEL: test_smfmac_i32_16x16x64_fp8_fp8:
 ; GFX942-AGPRCD-GISEL:       ; %bb.0: ; %bb
 ; GFX942-AGPRCD-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x2c
@@ -3569,54 +3572,6 @@ define amdgpu_kernel void @test_smfmac_i32_16x16x64_fp8_fp8(ptr addrspace(1) %ar
 ; GFX942-AGPRCD-GISEL-NEXT:    s_nop 5
 ; GFX942-AGPRCD-GISEL-NEXT:    global_store_dwordx4 v0, a[0:3], s[6:7]
 ; GFX942-AGPRCD-GISEL-NEXT:    s_endpgm
-;
-; GFX950-VGPRCD-SDAG-LABEL: test_smfmac_i32_16x16x64_fp8_fp8:
-; GFX950-VGPRCD-SDAG:       ; %bb.0: ; %bb
-; GFX950-VGPRCD-SDAG-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x2c
-; GFX950-VGPRCD-SDAG-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x24
-; GFX950-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v0, 0
-; GFX950-VGPRCD-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX950-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v10, s8
-; GFX950-VGPRCD-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GFX950-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v11, s9
-; GFX950-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v2, s10
-; GFX950-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v3, s11
-; GFX950-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v4, s12
-; GFX950-VGPRCD-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX950-VGPRCD-SDAG-NEXT:    v_mov_b64_e32 v[8:9], s[2:3]
-; GFX950-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v5, s13
-; GFX950-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v1, s14
-; GFX950-VGPRCD-SDAG-NEXT:    v_mov_b64_e32 v[6:7], s[0:1]
-; GFX950-VGPRCD-SDAG-NEXT:    s_nop 1
-; GFX950-VGPRCD-SDAG-NEXT:    v_smfmac_f32_16x16x64_fp8_fp8 v[6:9], v[10:11], v[2:5], v1 cbsz:1 abid:2
-; GFX950-VGPRCD-SDAG-NEXT:    s_nop 7
-; GFX950-VGPRCD-SDAG-NEXT:    global_store_dwordx4 v0, v[6:9], s[6:7]
-; GFX950-VGPRCD-SDAG-NEXT:    s_endpgm
-;
-; GFX950-VGPRCD-GISEL-LABEL: test_smfmac_i32_16x16x64_fp8_fp8:
-; GFX950-VGPRCD-GISEL:       ; %bb.0: ; %bb
-; GFX950-VGPRCD-GISEL-NEXT:    s_load_dwordx2 s[12:13], s[4:5], 0x24
-; GFX950-VGPRCD-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x2c
-; GFX950-VGPRCD-GISEL-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x3c
-; GFX950-VGPRCD-GISEL-NEXT:    s_load_dword s14, s[4:5], 0x44
-; GFX950-VGPRCD-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX950-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[4:5], s[0:1]
-; GFX950-VGPRCD-GISEL-NEXT:    s_load_dwordx4 s[8:11], s[12:13], 0x0
-; GFX950-VGPRCD-GISEL-NEXT:    s_mov_b32 s4, s2
-; GFX950-VGPRCD-GISEL-NEXT:    s_mov_b32 s5, s3
-; GFX950-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[0:1], s[4:5]
-; GFX950-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[2:3], s[6:7]
-; GFX950-VGPRCD-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX950-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[8:9], s[8:9]
-; GFX950-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[10:11], s[10:11]
-; GFX950-VGPRCD-GISEL-NEXT:    v_mov_b32_e32 v6, s14
-; GFX950-VGPRCD-GISEL-NEXT:    s_nop 1
-; GFX950-VGPRCD-GISEL-NEXT:    v_smfmac_f32_16x16x64_fp8_fp8 v[8:11], v[4:5], v[0:3], v6 cbsz:1 abid:2
-; GFX950-VGPRCD-GISEL-NEXT:    v_mov_b32_e32 v0, 0
-; GFX950-VGPRCD-GISEL-NEXT:    s_nop 6
-; GFX950-VGPRCD-GISEL-NEXT:    global_store_dwordx4 v0, v[8:11], s[12:13]
-; GFX950-VGPRCD-GISEL-NEXT:    s_endpgm
-;
 ; GFX950-AGPRCD-SDAG-LABEL: test_smfmac_i32_16x16x64_fp8_fp8:
 ; GFX950-AGPRCD-SDAG:       ; %bb.0: ; %bb
 ; GFX950-AGPRCD-SDAG-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
@@ -3642,7 +3597,6 @@ define amdgpu_kernel void @test_smfmac_i32_16x16x64_fp8_fp8(ptr addrspace(1) %ar
 ; GFX950-AGPRCD-SDAG-NEXT:    s_nop 6
 ; GFX950-AGPRCD-SDAG-NEXT:    global_store_dwordx4 v0, a[0:3], s[0:1]
 ; GFX950-AGPRCD-SDAG-NEXT:    s_endpgm
-;
 ; GFX950-AGPRCD-GISEL-LABEL: test_smfmac_i32_16x16x64_fp8_fp8:
 ; GFX950-AGPRCD-GISEL:       ; %bb.0: ; %bb
 ; GFX950-AGPRCD-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x2c
@@ -3681,73 +3635,139 @@ bb:
 }
 
 define amdgpu_kernel void @test_smfmac_i32_32x32x32_bf8_bf8(ptr addrspace(1) %arg, <2 x i32> %a, <4 x i32> %b, i32 %idx) #0 {
-; GFX942-VGPRCD-SDAG-LABEL: test_smfmac_i32_32x32x32_bf8_bf8:
-; GFX942-VGPRCD-SDAG:       ; %bb.0: ; %bb
-; GFX942-VGPRCD-SDAG-NEXT:    s_load_dwordx8 s[16:23], s[4:5], 0x2c
-; GFX942-VGPRCD-SDAG-NEXT:    s_load_dwordx2 s[24:25], s[4:5], 0x24
-; GFX942-VGPRCD-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX942-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v22, s16
-; GFX942-VGPRCD-SDAG-NEXT:    s_load_dwordx16 s[0:15], s[24:25], 0x0
-; GFX942-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v23, s17
-; GFX942-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v18, s18
-; GFX942-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v19, s19
-; GFX942-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v20, s20
-; GFX942-VGPRCD-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX942-VGPRCD-SDAG-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX942-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v21, s21
-; GFX942-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v16, s22
-; GFX942-VGPRCD-SDAG-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
-; GFX942-VGPRCD-SDAG-NEXT:    v_mov_b64_e32 v[4:5], s[4:5]
-; GFX942-VGPRCD-SDAG-NEXT:    v_mov_b64_e32 v[6:7], s[6:7]
-; GFX942-VGPRCD-SDAG-NEXT:    v_mov_b64_e32 v[8:9], s[8:9]
-; GFX942-VGPRCD-SDAG-NEXT:    v_mov_b64_e32 v[10:11], s[10:11]
-; GFX942-VGPRCD-SDAG-NEXT:    v_mov_b64_e32 v[12:13], s[12:13]
-; GFX942-VGPRCD-SDAG-NEXT:    v_mov_b64_e32 v[14:15], s[14:15]
-; GFX942-VGPRCD-SDAG-NEXT:    s_nop 1
-; GFX942-VGPRCD-SDAG-NEXT:    v_smfmac_f32_32x32x32_bf8_bf8 v[0:15], v[22:23], v[18:21], v16 cbsz:1 abid:2
-; GFX942-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v16, 0
-; GFX942-VGPRCD-SDAG-NEXT:    s_nop 7
-; GFX942-VGPRCD-SDAG-NEXT:    s_nop 1
-; GFX942-VGPRCD-SDAG-NEXT:    global_store_dwordx4 v16, v[12:15], s[24:25] offset:48
-; GFX942-VGPRCD-SDAG-NEXT:    global_store_dwordx4 v16, v[8:11], s[24:25] offset:32
-; GFX942-VGPRCD-SDAG-NEXT:    global_store_dwordx4 v16, v[4:7], s[24:25] offset:16
-; GFX942-VGPRCD-SDAG-NEXT:    global_store_dwordx4 v16, v[0:3], s[24:25]
-; GFX942-VGPRCD-SDAG-NEXT:    s_endpgm
+; GFX942-SDAG-LABEL: test_smfmac_i32_32x32x32_bf8_bf8:
+; GFX942-SDAG:       ; %bb.0: ; %bb
+; GFX942-SDAG-NEXT:    s_load_dwordx8 s[16:23], s[4:5], 0x2c
+; GFX942-SDAG-NEXT:    s_load_dwordx2 s[24:25], s[4:5], 0x24
+; GFX942-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v22, s16
+; GFX942-SDAG-NEXT:    s_load_dwordx16 s[0:15], s[24:25], 0x0
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v23, s17
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v18, s18
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v19, s19
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v20, s20
+; GFX942-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-SDAG-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v21, s21
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v16, s22
+; GFX942-SDAG-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
+; GFX942-SDAG-NEXT:    v_mov_b64_e32 v[4:5], s[4:5]
+; GFX942-SDAG-NEXT:    v_mov_b64_e32 v[6:7], s[6:7]
+; GFX942-SDAG-NEXT:    v_mov_b64_e32 v[8:9], s[8:9]
+; GFX942-SDAG-NEXT:    v_mov_b64_e32 v[10:11], s[10:11]
+; GFX942-SDAG-NEXT:    v_mov_b64_e32 v[12:13], s[12:13]
+; GFX942-SDAG-NEXT:    v_mov_b64_e32 v[14:15], s[14:15]
+; GFX942-SDAG-NEXT:    s_nop 1
+; GFX942-SDAG-NEXT:    v_smfmac_f32_32x32x32_bf8_bf8 v[0:15], v[22:23], v[18:21], v16 cbsz:1 abid:2
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v16, 0
+; GFX942-SDAG-NEXT:    s_nop 7
+; GFX942-SDAG-NEXT:    s_nop 1
+; GFX942-SDAG-NEXT:    global_store_dwordx4 v16, v[12:15], s[24:25] offset:48
+; GFX942-SDAG-NEXT:    global_store_dwordx4 v16, v[8:11], s[24:25] offset:32
+; GFX942-SDAG-NEXT:    global_store_dwordx4 v16, v[4:7], s[24:25] offset:16
+; GFX942-SDAG-NEXT:    global_store_dwordx4 v16, v[0:3], s[24:25]
+; GFX942-SDAG-NEXT:    s_endpgm
+;
+; GFX942-GISEL-LABEL: test_smfmac_i32_32x32x32_bf8_bf8:
+; GFX942-GISEL:       ; %bb.0: ; %bb
+; GFX942-GISEL-NEXT:    s_load_dwordx4 s[16:19], s[4:5], 0x2c
+; GFX942-GISEL-NEXT:    s_load_dwordx2 s[24:25], s[4:5], 0x24
+; GFX942-GISEL-NEXT:    s_load_dwordx2 s[22:23], s[4:5], 0x3c
+; GFX942-GISEL-NEXT:    s_load_dword s26, s[4:5], 0x44
+; GFX942-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-GISEL-NEXT:    v_mov_b64_e32 v[20:21], s[16:17]
+; GFX942-GISEL-NEXT:    s_load_dwordx16 s[0:15], s[24:25], 0x0
+; GFX942-GISEL-NEXT:    s_mov_b32 s20, s18
+; GFX942-GISEL-NEXT:    s_mov_b32 s21, s19
+; GFX942-GISEL-NEXT:    v_mov_b64_e32 v[16:17], s[20:21]
+; GFX942-GISEL-NEXT:    v_mov_b32_e32 v22, s26
+; GFX942-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-GISEL-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
+; GFX942-GISEL-NEXT:    v_mov_b64_e32 v[18:19], s[22:23]
+; GFX942-GISEL-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
+; GFX942-GISEL-NEXT:    v_mov_b64_e32 v[4:5], s[4:5]
+; GFX942-GISEL-NEXT:    v_mov_b64_e32 v[6:7], s[6:7]
+; GFX942-GISEL-NEXT:    v_mov_b64_e32 v[8:9], s[8:9]
+; GFX942-GISEL-NEXT:    v_mov_b64_e32 v[10:11], s[10:11]
+; GFX942-GISEL-NEXT:    v_mov_b64_e32 v[12:13], s[12:13]
+; GFX942-GISEL-NEXT:    v_mov_b64_e32 v[14:15], s[14:15]
+; GFX942-GISEL-NEXT:    s_nop 1
+; GFX942-GISEL-NEXT:    v_smfmac_f32_32x32x32_bf8_bf8 v[0:15], v[20:21], v[16:19], v22 cbsz:1 abid:2
+; GFX942-GISEL-NEXT:    v_mov_b32_e32 v16, 0
+; GFX942-GISEL-NEXT:    s_nop 7
+; GFX942-GISEL-NEXT:    s_nop 1
+; GFX942-GISEL-NEXT:    global_store_dwordx4 v16, v[0:3], s[24:25]
+; GFX942-GISEL-NEXT:    global_store_dwordx4 v16, v[4:7], s[24:25] offset:16
+; GFX942-GISEL-NEXT:    global_store_dwordx4 v16, v[8:11], s[24:25] offset:32
+; GFX942-GISEL-NEXT:    global_store_dwordx4 v16, v[12:15], s[24:25] offset:48
+; GFX942-GISEL-NEXT:    s_endpgm
 ;
-; GFX942-VGPRCD-GISEL-LABEL: test_smfmac_i32_32x32x32_bf8_bf8:
-; GFX942-VGPRCD-GISEL:       ; %bb.0: ; %bb
-; GFX942-VGPRCD-GISEL-NEXT:    s_load_dwordx4 s[16:19], s[4:5], 0x2c
-; GFX942-VGPRCD-GISEL-NEXT:    s_load_dwordx2 s[24:25], s[4:5], 0x24
-; GFX942-VGPRCD-GISEL-NEXT:    s_load_dwordx2 s[22:23], s[4:5], 0x3c
-; GFX942-VGPRCD-GISEL-NEXT:    s_load_dword s26, s[4:5], 0x44
-; GFX942-VGPRCD-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX942-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[20:21], s[16:17]
-; GFX942-VGPRCD-GISEL-NEXT:    s_load_dwordx16 s[0:15], s[24:25], 0x0
-; GFX942-VGPRCD-GISEL-NEXT:    s_mov_b32 s20, s18
-; GFX942-VGPRCD-GISEL-NEXT:    s_mov_b32 s21, s19
-; GFX942-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[16:17], s[20:21]
-; GFX942-VGPRCD-GISEL-NEXT:    v_mov_b32_e32 v22, s26
-; GFX942-VGPRCD-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX942-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX942-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[18:19], s[22:23]
-; GFX942-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
-; GFX942-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[4:5], s[4:5]
-; GFX942-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[6:7], s[6:7]
-; GFX942-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[8:9], s[8:9]
-; GFX942-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[10:11], s[10:11]
-; GFX942-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[12:13], s[12:13]
-; GFX942-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[14:15], s[14:15]
-; GFX942-VGPRCD-GISEL-NEXT:    s_nop 1
-; GFX942-VGPRCD-GISEL-NEXT:    v_smfmac_f32_32x32x32_bf8_bf8 v[0:15], v[20:21], v[16:19], v22 cbsz:1 abid:2
-; GFX942-VGPRCD-GISEL-NEXT:    v_mov_b32_e32 v16, 0
-; GFX942-VGPRCD-GISEL-NEXT:    s_nop 7
-; GFX942-VGPRCD-GISEL-NEXT:    s_nop 1
-; GFX942-VGPRCD-GISEL-NEXT:    global_store_dwordx4 v16, v[0:3], s[24:25]
-; GFX942-VGPRCD-GISEL-NEXT:    global_store_dwordx4 v16, v[4:7], s[24:25] offset:16
-; GFX942-VGPRCD-GISEL-NEXT:    global_store_dwordx4 v16, v[8:11], s[24:25] offset:32
-; GFX942-VGPRCD-GISEL-NEXT:    global_store_dwordx4 v16, v[12:15], s[24:25] offset:48
-; GFX942-VGPRCD-GISEL-NEXT:    s_endpgm
+; GFX950-SDAG-LABEL: test_smfmac_i32_32x32x32_bf8_bf8:
+; GFX950-SDAG:       ; %bb.0: ; %bb
+; GFX950-SDAG-NEXT:    s_load_dwordx8 s[16:23], s[4:5], 0x2c
+; GFX950-SDAG-NEXT:    s_load_dwordx2 s[24:25], s[4:5], 0x24
+; GFX950-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v22, s16
+; GFX950-SDAG-NEXT:    s_load_dwordx16 s[0:15], s[24:25], 0x0
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v23, s17
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v18, s18
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v19, s19
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v20, s20
+; GFX950-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX950-SDAG-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v21, s21
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v16, s22
+; GFX950-SDAG-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
+; GFX950-SDAG-NEXT:    v_mov_b64_e32 v[4:5], s[4:5]
+; GFX950-SDAG-NEXT:    v_mov_b64_e32 v[6:7], s[6:7]
+; GFX950-SDAG-NEXT:    v_mov_b64_e32 v[8:9], s[8:9]
+; GFX950-SDAG-NEXT:    v_mov_b64_e32 v[10:11], s[10:11]
+; GFX950-SDAG-NEXT:    v_mov_b64_e32 v[12:13], s[12:13]
+; GFX950-SDAG-NEXT:    v_mov_b64_e32 v[14:15], s[14:15]
+; GFX950-SDAG-NEXT:    s_nop 1
+; GFX950-SDAG-NEXT:    v_smfmac_f32_32x32x32_bf8_bf8 v[0:15], v[22:23], v[18:21], v16 cbsz:1 abid:2
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v16, 0
+; GFX950-SDAG-NEXT:    s_nop 7
+; GFX950-SDAG-NEXT:    s_nop 2
+; GFX950-SDAG-NEXT:    global_store_dwordx4 v16, v[12:15], s[24:25] offset:48
+; GFX950-SDAG-NEXT:    global_store_dwordx4 v16, v[8:11], s[24:25] offset:32
+; GFX950-SDAG-NEXT:    global_store_dwordx4 v16, v[4:7], s[24:25] offset:16
+; GFX950-SDAG-NEXT:    global_store_dwordx4 v16, v[0:3], s[24:25]
+; GFX950-SDAG-NEXT:    s_endpgm
 ;
+; GFX950-GISEL-LABEL: test_smfmac_i32_32x32x32_bf8_bf8:
+; GFX950-GISEL:       ; %bb.0: ; %bb
+; GFX950-GISEL-NEXT:    s_load_dwordx4 s[16:19], s[4:5], 0x2c
+; GFX950-GISEL-NEXT:    s_load_dwordx2 s[24:25], s[4:5], 0x24
+; GFX950-GISEL-NEXT:    s_load_dwordx2 s[22:23], s[4:5], 0x3c
+; GFX950-GISEL-NEXT:    s_load_dword s26, s[4:5], 0x44
+; GFX950-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX950-GISEL-NEXT:    v_mov_b64_e32 v[20:21], s[16:17]
+; GFX950-GISEL-NEXT:    s_load_dwordx16 s[0:15], s[24:25], 0x0
+; GFX950-GISEL-NEXT:    s_mov_b32 s20, s18
+; GFX950-GISEL-NEXT:    s_mov_b32 s21, s19
+; GFX950-GISEL-NEXT:    v_mov_b64_e32 v[16:17], s[20:21]
+; GFX950-GISEL-NEXT:    v_mov_b32_e32 v22, s26
+; GFX950-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX950-GISEL-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
+; GFX950-GISEL-NEXT:    v_mov_b64_e32 v[18:19], s[22:23]
+; GFX950-GISEL-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
+; GFX950-GISEL-NEXT:    v_mov_b64_e32 v[4:5], s[4:5]
+; GFX950-GISEL-NEXT:    v_mov_b64_e32 v[6:7], s[6:7]
+; GFX950-GISEL-NEXT:    v_mov_b64_e32 v[8:9], s[8:9]
+; GFX950-GISEL-NEXT:    v_mov_b64_e32 v[10:11], s[10:11]
+; GFX950-GISEL-NEXT:    v_mov_b64_e32 v[12:13], s[12:13]
+; GFX950-GISEL-NEXT:    v_mov_b64_e32 v[14:15], s[14:15]
+; GFX950-GISEL-NEXT:    s_nop 1
+; GFX950-GISEL-NEXT:    v_smfmac_f32_32x32x32_bf8_bf8 v[0:15], v[20:21], v[16:19], v22 cbsz:1 abid:2
+; GFX950-GISEL-NEXT:    v_mov_b32_e32 v16, 0
+; GFX950-GISEL-NEXT:    s_nop 7
+; GFX950-GISEL-NEXT:    s_nop 2
+; GFX950-GISEL-NEXT:    global_store_dwordx4 v16, v[0:3], s[24:25]
+; GFX950-GISEL-NEXT:    global_store_dwordx4 v16, v[4:7], s[24:25] offset:16
+; GFX950-GISEL-NEXT:    global_store_dwordx4 v16, v[8:11], s[24:25] offset:32
+; GFX950-GISEL-NEXT:    global_store_dwordx4 v16, v[12:15], s[24:25] offset:48
+; GFX950-GISEL-NEXT:    s_endpgm
 ; GFX942-AGPRCD-SDAG-LABEL: test_smfmac_i32_32x32x32_bf8_bf8:
 ; GFX942-AGPRCD-SDAG:       ; %bb.0: ; %bb
 ; GFX942-AGPRCD-SDAG-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
@@ -3789,7 +3809,6 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_bf8_bf8(ptr addrspace(1) %ar
 ; GFX942-AGPRCD-SDAG-NEXT:    global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
 ; GFX942-AGPRCD-SDAG-NEXT:    global_store_dwordx4 v0, a[0:3], s[0:1]
 ; GFX942-AGPRCD-SDAG-NEXT:    s_endpgm
-;
 ; GFX942-AGPRCD-GISEL-LABEL: test_smfmac_i32_32x32x32_bf8_bf8:
 ; GFX942-AGPRCD-GISEL:       ; %bb.0: ; %bb
 ; GFX942-AGPRCD-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
@@ -3832,74 +3851,6 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_bf8_bf8(ptr addrspace(1) %ar
 ; GFX942-AGPRCD-GISEL-NEXT:    global_store_dwordx4 v0, a[8:11], s[0:1] offset:32
 ; GFX942-AGPRCD-GISEL-NEXT:    global_store_dwordx4 v0, a[12:15], s[0:1] offset:48
 ; GFX942-AGPRCD-GISEL-NEXT:    s_endpgm
-;
-; GFX950-VGPRCD-SDAG-LABEL: test_smfmac_i32_32x32x32_bf8_bf8:
-; GFX950-VGPRCD-SDAG:       ; %bb.0: ; %bb
-; GFX950-VGPRCD-SDAG-NEXT:    s_load_dwordx8 s[16:23], s[4:5], 0x2c
-; GFX950-VGPRCD-SDAG-NEXT:    s_load_dwordx2 s[24:25], s[4:5], 0x24
-; GFX950-VGPRCD-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX950-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v22, s16
-; GFX950-VGPRCD-SDAG-NEXT:    s_load_dwordx16 s[0:15], s[24:25], 0x0
-; GFX950-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v23, s17
-; GFX950-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v18, s18
-; GFX950-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v19, s19
-; GFX950-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v20, s20
-; GFX950-VGPRCD-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX950-VGPRCD-SDAG-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX950-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v21, s21
-; GFX950-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v16, s22
-; GFX950-VGPRCD-SDAG-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
-; GFX950-VGPRCD-SDAG-NEXT:    v_mov_b64_e32 v[4:5], s[4:5]
-; GFX950-VGPRCD-SDAG-NEXT:    v_mov_b64_e32 v[6:7], s[6:7]
-; GFX950-VGPRCD-SDAG-NEXT:    v_mov_b64_e32 v[8:9], s[8:9]
-; GFX950-VGPRCD-SDAG-NEXT:    v_mov_b64_e32 v[10:11], s[10:11]
-; GFX950-VGPRCD-SDAG-NEXT:    v_mov_b64_e32 v[12:13], s[12:13]
-; GFX950-VGPRCD-SDAG-NEXT:    v_mov_b64_e32 v[14:15], s[14:15]
-; GFX950-VGPRCD-SDAG-NEXT:    s_nop 1
-; GFX950-VGPRCD-SDAG-NEXT:    v_smfmac_f32_32x32x32_bf8_bf8 v[0:15], v[22:23], v[18:21], v16 cbsz:1 abid:2
-; GFX950-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v16, 0
-; GFX950-VGPRCD-SDAG-NEXT:    s_nop 7
-; GFX950-VGPRCD-SDAG-NEXT:    s_nop 2
-; GFX950-VGPRCD-SDAG-NEXT:    global_store_dwordx4 v16, v[12:15], s[24:25] offset:48
-; GFX950-VGPRCD-SDAG-NEXT:    global_store_dwordx4 v16, v[8:11], s[24:25] offset:32
-; GFX950-VGPRCD-SDAG-NEXT:    global_store_dwordx4 v16, v[4:7], s[24:25] offset:16
-; GFX950-VGPRCD-SDAG-NEXT:    global_store_dwordx4 v16, v[0:3], s[24:25]
-; GFX950-VGPRCD-SDAG-NEXT:    s_endpgm
-;
-; GFX950-VGPRCD-GISEL-LABEL: test_smfmac_i32_32x32x32_bf8_bf8:
-; GFX950-VGPRCD-GISEL:       ; %bb.0: ; %bb
-; GFX950-VGPRCD-GISEL-NEXT:    s_load_dwordx4 s[16:19], s[4:5], 0x2c
-; GFX950-VGPRCD-GISEL-NEXT:    s_load_dwordx2 s[24:25], s[4:5], 0x24
-; GFX950-VGPRCD-GISEL-NEXT:    s_load_dwordx2 s[22:23], s[4:5], 0x3c
-; GFX950-VGPRCD-GISEL-NEXT:    s_load_dword s26, s[4:5], 0x44
-; GFX950-VGPRCD-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX950-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[20:21], s[16:17]
-; GFX950-VGPRCD-GISEL-NEXT:    s_load_dwordx16 s[0:15], s[24:25], 0x0
-; GFX950-VGPRCD-GISEL-NEXT:    s_mov_b32 s20, s18
-; GFX950-VGPRCD-GISEL-NEXT:    s_mov_b32 s21, s19
-; GFX950-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[16:17], s[20:21]
-; GFX950-VGPRCD-GISEL-NEXT:    v_mov_b32_e32 v22, s26
-; GFX950-VGPRCD-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX950-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX950-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[18:19], s[22:23]
-; GFX950-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
-; GFX950-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[4:5], s[4:5]
-; GFX950-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[6:7], s[6:7]
-; GFX950-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[8:9], s[8:9]
-; GFX950-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[10:11], s[10:11]
-; GFX950-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[12:13], s[12:13]
-; GFX950-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[14:15], s[14:15]
-; GFX950-VGPRCD-GISEL-NEXT:    s_nop 1
-; GFX950-VGPRCD-GISEL-NEXT:    v_smfmac_f32_32x32x32_bf8_bf8 v[0:15], v[20:21], v[16:19], v22 cbsz:1 abid:2
-; GFX950-VGPRCD-GISEL-NEXT:    v_mov_b32_e32 v16, 0
-; GFX950-VGPRCD-GISEL-NEXT:    s_nop 7
-; GFX950-VGPRCD-GISEL-NEXT:    s_nop 2
-; GFX950-VGPRCD-GISEL-NEXT:    global_store_dwordx4 v16, v[0:3], s[24:25]
-; GFX950-VGPRCD-GISEL-NEXT:    global_store_dwordx4 v16, v[4:7], s[24:25] offset:16
-; GFX950-VGPRCD-GISEL-NEXT:    global_store_dwordx4 v16, v[8:11], s[24:25] offset:32
-; GFX950-VGPRCD-GISEL-NEXT:    global_store_dwordx4 v16, v[12:15], s[24:25] offset:48
-; GFX950-VGPRCD-GISEL-NEXT:    s_endpgm
-;
 ; GFX950-AGPRCD-SDAG-LABEL: test_smfmac_i32_32x32x32_bf8_bf8:
 ; GFX950-AGPRCD-SDAG:       ; %bb.0: ; %bb
 ; GFX950-AGPRCD-SDAG-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
@@ -3941,7 +3892,6 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_bf8_bf8(ptr addrspace(1) %ar
 ; GFX950-AGPRCD-SDAG-NEXT:    global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
 ; GFX950-AGPRCD-SDAG-NEXT:    global_store_dwordx4 v0, a[0:3], s[0:1]
 ; GFX950-AGPRCD-SDAG-NEXT:    s_endpgm
-;
 ; GFX950-AGPRCD-GISEL-LABEL: test_smfmac_i32_32x32x32_bf8_bf8:
 ; GFX950-AGPRCD-GISEL:       ; %bb.0: ; %bb
 ; GFX950-AGPRCD-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
@@ -3992,73 +3942,139 @@ bb:
 }
 
 define amdgpu_kernel void @test_smfmac_i32_32x32x32_bf8_fp8(ptr addrspace(1) %arg, <2 x i32> %a, <4 x i32> %b, i32 %idx) #0 {
-; GFX942-VGPRCD-SDAG-LABEL: test_smfmac_i32_32x32x32_bf8_fp8:
-; GFX942-VGPRCD-SDAG:       ; %bb.0: ; %bb
-; GFX942-VGPRCD-SDAG-NEXT:    s_load_dwordx8 s[16:23], s[4:5], 0x2c
-; GFX942-VGPRCD-SDAG-NEXT:    s_load_dwordx2 s[24:25], s[4:5], 0x24
-; GFX942-VGPRCD-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX942-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v22, s16
-; GFX942-VGPRCD-SDAG-NEXT:    s_load_dwordx16 s[0:15], s[24:25], 0x0
-; GFX942-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v23, s17
-; GFX942-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v18, s18
-; GFX942-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v19, s19
-; GFX942-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v20, s20
-; GFX942-VGPRCD-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX942-VGPRCD-SDAG-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX942-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v21, s21
-; GFX942-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v16, s22
-; GFX942-VGPRCD-SDAG-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
-; GFX942-VGPRCD-SDAG-NEXT:    v_mov_b64_e32 v[4:5], s[4:5]
-; GFX942-VGPRCD-SDAG-NEXT:    v_mov_b64_e32 v[6:7], s[6:7]
-; GFX942-VGPRCD-SDAG-NEXT:    v_mov_b64_e32 v[8:9], s[8:9]
-; GFX942-VGPRCD-SDAG-NEXT:    v_mov_b64_e32 v[10:11], s[10:11]
-; GFX942-VGPRCD-SDAG-NEXT:    v_mov_b64_e32 v[12:13], s[12:13]
-; GFX942-VGPRCD-SDAG-NEXT:    v_mov_b64_e32 v[14:15], s[14:15]
-; GFX942-VGPRCD-SDAG-NEXT:    s_nop 1
-; GFX942-VGPRCD-SDAG-NEXT:    v_smfmac_f32_32x32x32_bf8_fp8 v[0:15], v[22:23], v[18:21], v16 cbsz:1 abid:2
-; GFX942-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v16, 0
-; GFX942-VGPRCD-SDAG-NEXT:    s_nop 7
-; GFX942-VGPRCD-SDAG-NEXT:    s_nop 1
-; GFX942-VGPRCD-SDAG-NEXT:    global_store_dwordx4 v16, v[12:15], s[24:25] offset:48
-; GFX942-VGPRCD-SDAG-NEXT:    global_store_dwordx4 v16, v[8:11], s[24:25] offset:32
-; GFX942-VGPRCD-SDAG-NEXT:    global_store_dwordx4 v16, v[4:7], s[24:25] offset:16
-; GFX942-VGPRCD-SDAG-NEXT:    global_store_dwordx4 v16, v[0:3], s[24:25]
-; GFX942-VGPRCD-SDAG-NEXT:    s_endpgm
+; GFX942-SDAG-LABEL: test_smfmac_i32_32x32x32_bf8_fp8:
+; GFX942-SDAG:       ; %bb.0: ; %bb
+; GFX942-SDAG-NEXT:    s_load_dwordx8 s[16:23], s[4:5], 0x2c
+; GFX942-SDAG-NEXT:    s_load_dwordx2 s[24:25], s[4:5], 0x24
+; GFX942-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v22, s16
+; GFX942-SDAG-NEXT:    s_load_dwordx16 s[0:15], s[24:25], 0x0
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v23, s17
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v18, s18
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v19, s19
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v20, s20
+; GFX942-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-SDAG-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v21, s21
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v16, s22
+; GFX942-SDAG-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
+; GFX942-SDAG-NEXT:    v_mov_b64_e32 v[4:5], s[4:5]
+; GFX942-SDAG-NEXT:    v_mov_b64_e32 v[6:7], s[6:7]
+; GFX942-SDAG-NEXT:    v_mov_b64_e32 v[8:9], s[8:9]
+; GFX942-SDAG-NEXT:    v_mov_b64_e32 v[10:11], s[10:11]
+; GFX942-SDAG-NEXT:    v_mov_b64_e32 v[12:13], s[12:13]
+; GFX942-SDAG-NEXT:    v_mov_b64_e32 v[14:15], s[14:15]
+; GFX942-SDAG-NEXT:    s_nop 1
+; GFX942-SDAG-NEXT:    v_smfmac_f32_32x32x32_bf8_fp8 v[0:15], v[22:23], v[18:21], v16 cbsz:1 abid:2
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v16, 0
+; GFX942-SDAG-NEXT:    s_nop 7
+; GFX942-SDAG-NEXT:    s_nop 1
+; GFX942-SDAG-NEXT:    global_store_dwordx4 v16, v[12:15], s[24:25] offset:48
+; GFX942-SDAG-NEXT:    global_store_dwordx4 v16, v[8:11], s[24:25] offset:32
+; GFX942-SDAG-NEXT:    global_store_dwordx4 v16, v[4:7], s[24:25] offset:16
+; GFX942-SDAG-NEXT:    global_store_dwordx4 v16, v[0:3], s[24:25]
+; GFX942-SDAG-NEXT:    s_endpgm
+;
+; GFX942-GISEL-LABEL: test_smfmac_i32_32x32x32_bf8_fp8:
+; GFX942-GISEL:       ; %bb.0: ; %bb
+; GFX942-GISEL-NEXT:    s_load_dwordx4 s[16:19], s[4:5], 0x2c
+; GFX942-GISEL-NEXT:    s_load_dwordx2 s[24:25], s[4:5], 0x24
+; GFX942-GISEL-NEXT:    s_load_dwordx2 s[22:23], s[4:5], 0x3c
+; GFX942-GISEL-NEXT:    s_load_dword s26, s[4:5], 0x44
+; GFX942-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-GISEL-NEXT:    v_mov_b64_e32 v[20:21], s[16:17]
+; GFX942-GISEL-NEXT:    s_load_dwordx16 s[0:15], s[24:25], 0x0
+; GFX942-GISEL-NEXT:    s_mov_b32 s20, s18
+; GFX942-GISEL-NEXT:    s_mov_b32 s21, s19
+; GFX942-GISEL-NEXT:    v_mov_b64_e32 v[16:17], s[20:21]
+; GFX942-GISEL-NEXT:    v_mov_b32_e32 v22, s26
+; GFX942-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-GISEL-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
+; GFX942-GISEL-NEXT:    v_mov_b64_e32 v[18:19], s[22:23]
+; GFX942-GISEL-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
+; GFX942-GISEL-NEXT:    v_mov_b64_e32 v[4:5], s[4:5]
+; GFX942-GISEL-NEXT:    v_mov_b64_e32 v[6:7], s[6:7]
+; GFX942-GISEL-NEXT:    v_mov_b64_e32 v[8:9], s[8:9]
+; GFX942-GISEL-NEXT:    v_mov_b64_e32 v[10:11], s[10:11]
+; GFX942-GISEL-NEXT:    v_mov_b64_e32 v[12:13], s[12:13]
+; GFX942-GISEL-NEXT:    v_mov_b64_e32 v[14:15], s[14:15]
+; GFX942-GISEL-NEXT:    s_nop 1
+; GFX942-GISEL-NEXT:    v_smfmac_f32_32x32x32_bf8_fp8 v[0:15], v[20:21], v[16:19], v22 cbsz:1 abid:2
+; GFX942-GISEL-NEXT:    v_mov_b32_e32 v16, 0
+; GFX942-GISEL-NEXT:    s_nop 7
+; GFX942-GISEL-NEXT:    s_nop 1
+; GFX942-GISEL-NEXT:    global_store_dwordx4 v16, v[0:3], s[24:25]
+; GFX942-GISEL-NEXT:    global_store_dwordx4 v16, v[4:7], s[24:25] offset:16
+; GFX942-GISEL-NEXT:    global_store_dwordx4 v16, v[8:11], s[24:25] offset:32
+; GFX942-GISEL-NEXT:    global_store_dwordx4 v16, v[12:15], s[24:25] offset:48
+; GFX942-GISEL-NEXT:    s_endpgm
 ;
-; GFX942-VGPRCD-GISEL-LABEL: test_smfmac_i32_32x32x32_bf8_fp8:
-; GFX942-VGPRCD-GISEL:       ; %bb.0: ; %bb
-; GFX942-VGPRCD-GISEL-NEXT:    s_load_dwordx4 s[16:19], s[4:5], 0x2c
-; GFX942-VGPRCD-GISEL-NEXT:    s_load_dwordx2 s[24:25], s[4:5], 0x24
-; GFX942-VGPRCD-GISEL-NEXT:    s_load_dwordx2 s[22:23], s[4:5], 0x3c
-; GFX942-VGPRCD-GISEL-NEXT:    s_load_dword s26, s[4:5], 0x44
-; GFX942-VGPRCD-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX942-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[20:21], s[16:17]
-; GFX942-VGPRCD-GISEL-NEXT:    s_load_dwordx16 s[0:15], s[24:25], 0x0
-; GFX942-VGPRCD-GISEL-NEXT:    s_mov_b32 s20, s18
-; GFX942-VGPRCD-GISEL-NEXT:    s_mov_b32 s21, s19
-; GFX942-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[16:17], s[20:21]
-; GFX942-VGPRCD-GISEL-NEXT:    v_mov_b32_e32 v22, s26
-; GFX942-VGPRCD-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX942-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX942-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[18:19], s[22:23]
-; GFX942-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
-; GFX942-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[4:5], s[4:5]
-; GFX942-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[6:7], s[6:7]
-; GFX942-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[8:9], s[8:9]
-; GFX942-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[10:11], s[10:11]
-; GFX942-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[12:13], s[12:13]
-; GFX942-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[14:15], s[14:15]
-; GFX942-VGPRCD-GISEL-NEXT:    s_nop 1
-; GFX942-VGPRCD-GISEL-NEXT:    v_smfmac_f32_32x32x32_bf8_fp8 v[0:15], v[20:21], v[16:19], v22 cbsz:1 abid:2
-; GFX942-VGPRCD-GISEL-NEXT:    v_mov_b32_e32 v16, 0
-; GFX942-VGPRCD-GISEL-NEXT:    s_nop 7
-; GFX942-VGPRCD-GISEL-NEXT:    s_nop 1
-; GFX942-VGPRCD-GISEL-NEXT:    global_store_dwordx4 v16, v[0:3], s[24:25]
-; GFX942-VGPRCD-GISEL-NEXT:    global_store_dwordx4 v16, v[4:7], s[24:25] offset:16
-; GFX942-VGPRCD-GISEL-NEXT:    global_store_dwordx4 v16, v[8:11], s[24:25] offset:32
-; GFX942-VGPRCD-GISEL-NEXT:    global_store_dwordx4 v16, v[12:15], s[24:25] offset:48
-; GFX942-VGPRCD-GISEL-NEXT:    s_endpgm
+; GFX950-SDAG-LABEL: test_smfmac_i32_32x32x32_bf8_fp8:
+; GFX950-SDAG:       ; %bb.0: ; %bb
+; GFX950-SDAG-NEXT:    s_load_dwordx8 s[16:23], s[4:5], 0x2c
+; GFX950-SDAG-NEXT:    s_load_dwordx2 s[24:25], s[4:5], 0x24
+; GFX950-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v22, s16
+; GFX950-SDAG-NEXT:    s_load_dwordx16 s[0:15], s[24:25], 0x0
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v23, s17
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v18, s18
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v19, s19
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v20, s20
+; GFX950-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX950-SDAG-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v21, s21
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v16, s22
+; GFX950-SDAG-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
+; GFX950-SDAG-NEXT:    v_mov_b64_e32 v[4:5], s[4:5]
+; GFX950-SDAG-NEXT:    v_mov_b64_e32 v[6:7], s[6:7]
+; GFX950-SDAG-NEXT:    v_mov_b64_e32 v[8:9], s[8:9]
+; GFX950-SDAG-NEXT:    v_mov_b64_e32 v[10:11], s[10:11]
+; GFX950-SDAG-NEXT:    v_mov_b64_e32 v[12:13], s[12:13]
+; GFX950-SDAG-NEXT:    v_mov_b64_e32 v[14:15], s[14:15]
+; GFX950-SDAG-NEXT:    s_nop 1
+; GFX950-SDAG-NEXT:    v_smfmac_f32_32x32x32_bf8_fp8 v[0:15], v[22:23], v[18:21], v16 cbsz:1 abid:2
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v16, 0
+; GFX950-SDAG-NEXT:    s_nop 7
+; GFX950-SDAG-NEXT:    s_nop 2
+; GFX950-SDAG-NEXT:    global_store_dwordx4 v16, v[12:15], s[24:25] offset:48
+; GFX950-SDAG-NEXT:    global_store_dwordx4 v16, v[8:11], s[24:25] offset:32
+; GFX950-SDAG-NEXT:    global_store_dwordx4 v16, v[4:7], s[24:25] offset:16
+; GFX950-SDAG-NEXT:    global_store_dwordx4 v16, v[0:3], s[24:25]
+; GFX950-SDAG-NEXT:    s_endpgm
 ;
+; GFX950-GISEL-LABEL: test_smfmac_i32_32x32x32_bf8_fp8:
+; GFX950-GISEL:       ; %bb.0: ; %bb
+; GFX950-GISEL-NEXT:    s_load_dwordx4 s[16:19], s[4:5], 0x2c
+; GFX950-GISEL-NEXT:    s_load_dwordx2 s[24:25], s[4:5], 0x24
+; GFX950-GISEL-NEXT:    s_load_dwordx2 s[22:23], s[4:5], 0x3c
+; GFX950-GISEL-NEXT:    s_load_dword s26, s[4:5], 0x44
+; GFX950-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX950-GISEL-NEXT:    v_mov_b64_e32 v[20:21], s[16:17]
+; GFX950-GISEL-NEXT:    s_load_dwordx16 s[0:15], s[24:25], 0x0
+; GFX950-GISEL-NEXT:    s_mov_b32 s20, s18
+; GFX950-GISEL-NEXT:    s_mov_b32 s21, s19
+; GFX950-GISEL-NEXT:    v_mov_b64_e32 v[16:17], s[20:21]
+; GFX950-GISEL-NEXT:    v_mov_b32_e32 v22, s26
+; GFX950-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX950-GISEL-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
+; GFX950-GISEL-NEXT:    v_mov_b64_e32 v[18:19], s[22:23]
+; GFX950-GISEL-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
+; GFX950-GISEL-NEXT:    v_mov_b64_e32 v[4:5], s[4:5]
+; GFX950-GISEL-NEXT:    v_mov_b64_e32 v[6:7], s[6:7]
+; GFX950-GISEL-NEXT:    v_mov_b64_e32 v[8:9], s[8:9]
+; GFX950-GISEL-NEXT:    v_mov_b64_e32 v[10:11], s[10:11]
+; GFX950-GISEL-NEXT:    v_mov_b64_e32 v[12:13], s[12:13]
+; GFX950-GISEL-NEXT:    v_mov_b64_e32 v[14:15], s[14:15]
+; GFX950-GISEL-NEXT:    s_nop 1
+; GFX950-GISEL-NEXT:    v_smfmac_f32_32x32x32_bf8_fp8 v[0:15], v[20:21], v[16:19], v22 cbsz:1 abid:2
+; GFX950-GISEL-NEXT:    v_mov_b32_e32 v16, 0
+; GFX950-GISEL-NEXT:    s_nop 7
+; GFX950-GISEL-NEXT:    s_nop 2
+; GFX950-GISEL-NEXT:    global_store_dwordx4 v16, v[0:3], s[24:25]
+; GFX950-GISEL-NEXT:    global_store_dwordx4 v16, v[4:7], s[24:25] offset:16
+; GFX950-GISEL-NEXT:    global_store_dwordx4 v16, v[8:11], s[24:25] offset:32
+; GFX950-GISEL-NEXT:    global_store_dwordx4 v16, v[12:15], s[24:25] offset:48
+; GFX950-GISEL-NEXT:    s_endpgm
 ; GFX942-AGPRCD-SDAG-LABEL: test_smfmac_i32_32x32x32_bf8_fp8:
 ; GFX942-AGPRCD-SDAG:       ; %bb.0: ; %bb
 ; GFX942-AGPRCD-SDAG-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
@@ -4100,7 +4116,6 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_bf8_fp8(ptr addrspace(1) %ar
 ; GFX942-AGPRCD-SDAG-NEXT:    global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
 ; GFX942-AGPRCD-SDAG-NEXT:    global_store_dwordx4 v0, a[0:3], s[0:1]
 ; GFX942-AGPRCD-SDAG-NEXT:    s_endpgm
-;
 ; GFX942-AGPRCD-GISEL-LABEL: test_smfmac_i32_32x32x32_bf8_fp8:
 ; GFX942-AGPRCD-GISEL:       ; %bb.0: ; %bb
 ; GFX942-AGPRCD-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
@@ -4143,74 +4158,6 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_bf8_fp8(ptr addrspace(1) %ar
 ; GFX942-AGPRCD-GISEL-NEXT:    global_store_dwordx4 v0, a[8:11], s[0:1] offset:32
 ; GFX942-AGPRCD-GISEL-NEXT:    global_store_dwordx4 v0, a[12:15], s[0:1] offset:48
 ; GFX942-AGPRCD-GISEL-NEXT:    s_endpgm
-;
-; GFX950-VGPRCD-SDAG-LABEL: test_smfmac_i32_32x32x32_bf8_fp8:
-; GFX950-VGPRCD-SDAG:       ; %bb.0: ; %bb
-; GFX950-VGPRCD-SDAG-NEXT:    s_load_dwordx8 s[16:23], s[4:5], 0x2c
-; GFX950-VGPRCD-SDAG-NEXT:    s_load_dwordx2 s[24:25], s[4:5], 0x24
-; GFX950-VGPRCD-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX950-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v22, s16
-; GFX950-VGPRCD-SDAG-NEXT:    s_load_dwordx16 s[0:15], s[24:25], 0x0
-; GFX950-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v23, s17
-; GFX950-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v18, s18
-; GFX950-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v19, s19
-; GFX950-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v20, s20
-; GFX950-VGPRCD-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX950-VGPRCD-SDAG-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX950-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v21, s21
-; GFX950-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v16, s22
-; GFX950-VGPRCD-SDAG-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
-; GFX950-VGPRCD-SDAG-NEXT:    v_mov_b64_e32 v[4:5], s[4:5]
-; GFX950-VGPRCD-SDAG-NEXT:    v_mov_b64_e32 v[6:7], s[6:7]
-; GFX950-VGPRCD-SDAG-NEXT:    v_mov_b64_e32 v[8:9], s[8:9]
-; GFX950-VGPRCD-SDAG-NEXT:    v_mov_b64_e32 v[10:11], s[10:11]
-; GFX950-VGPRCD-SDAG-NEXT:    v_mov_b64_e32 v[12:13], s[12:13]
-; GFX950-VGPRCD-SDAG-NEXT:    v_mov_b64_e32 v[14:15], s[14:15]
-; GFX950-VGPRCD-SDAG-NEXT:    s_nop 1
-; GFX950-VGPRCD-SDAG-NEXT:    v_smfmac_f32_32x32x32_bf8_fp8 v[0:15], v[22:23], v[18:21], v16 cbsz:1 abid:2
-; GFX950-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v16, 0
-; GFX950-VGPRCD-SDAG-NEXT:    s_nop 7
-; GFX950-VGPRCD-SDAG-NEXT:    s_nop 2
-; GFX950-VGPRCD-SDAG-NEXT:    global_store_dwordx4 v16, v[12:15], s[24:25] offset:48
-; GFX950-VGPRCD-SDAG-NEXT:    global_store_dwordx4 v16, v[8:11], s[24:25] offset:32
-; GFX950-VGPRCD-SDAG-NEXT:    global_store_dwordx4 v16, v[4:7], s[24:25] offset:16
-; GFX950-VGPRCD-SDAG-NEXT:    global_store_dwordx4 v16, v[0:3], s[24:25]
-; GFX950-VGPRCD-SDAG-NEXT:    s_endpgm
-;
-; GFX950-VGPRCD-GISEL-LABEL: test_smfmac_i32_32x32x32_bf8_fp8:
-; GFX950-VGPRCD-GISEL:       ; %bb.0: ; %bb
-; GFX950-VGPRCD-GISEL-NEXT:    s_load_dwordx4 s[16:19], s[4:5], 0x2c
-; GFX950-VGPRCD-GISEL-NEXT:    s_load_dwordx2 s[24:25], s[4:5], 0x24
-; GFX950-VGPRCD-GISEL-NEXT:    s_load_dwordx2 s[22:23], s[4:5], 0x3c
-; GFX950-VGPRCD-GISEL-NEXT:    s_load_dword s26, s[4:5], 0x44
-; GFX950-VGPRCD-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX950-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[20:21], s[16:17]
-; GFX950-VGPRCD-GISEL-NEXT:    s_load_dwordx16 s[0:15], s[24:25], 0x0
-; GFX950-VGPRCD-GISEL-NEXT:    s_mov_b32 s20, s18
-; GFX950-VGPRCD-GISEL-NEXT:    s_mov_b32 s21, s19
-; GFX950-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[16:17], s[20:21]
-; GFX950-VGPRCD-GISEL-NEXT:    v_mov_b32_e32 v22, s26
-; GFX950-VGPRCD-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX950-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX950-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[18:19], s[22:23]
-; GFX950-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
-; GFX950-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[4:5], s[4:5]
-; GFX950-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[6:7], s[6:7]
-; GFX950-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[8:9], s[8:9]
-; GFX950-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[10:11], s[10:11]
-; GFX950-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[12:13], s[12:13]
-; GFX950-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[14:15], s[14:15]
-; GFX950-VGPRCD-GISEL-NEXT:    s_nop 1
-; GFX950-VGPRCD-GISEL-NEXT:    v_smfmac_f32_32x32x32_bf8_fp8 v[0:15], v[20:21], v[16:19], v22 cbsz:1 abid:2
-; GFX950-VGPRCD-GISEL-NEXT:    v_mov_b32_e32 v16, 0
-; GFX950-VGPRCD-GISEL-NEXT:    s_nop 7
-; GFX950-VGPRCD-GISEL-NEXT:    s_nop 2
-; GFX950-VGPRCD-GISEL-NEXT:    global_store_dwordx4 v16, v[0:3], s[24:25]
-; GFX950-VGPRCD-GISEL-NEXT:    global_store_dwordx4 v16, v[4:7], s[24:25] offset:16
-; GFX950-VGPRCD-GISEL-NEXT:    global_store_dwordx4 v16, v[8:11], s[24:25] offset:32
-; GFX950-VGPRCD-GISEL-NEXT:    global_store_dwordx4 v16, v[12:15], s[24:25] offset:48
-; GFX950-VGPRCD-GISEL-NEXT:    s_endpgm
-;
 ; GFX950-AGPRCD-SDAG-LABEL: test_smfmac_i32_32x32x32_bf8_fp8:
 ; GFX950-AGPRCD-SDAG:       ; %bb.0: ; %bb
 ; GFX950-AGPRCD-SDAG-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
@@ -4252,7 +4199,6 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_bf8_fp8(ptr addrspace(1) %ar
 ; GFX950-AGPRCD-SDAG-NEXT:    global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
 ; GFX950-AGPRCD-SDAG-NEXT:    global_store_dwordx4 v0, a[0:3], s[0:1]
 ; GFX950-AGPRCD-SDAG-NEXT:    s_endpgm
-;
 ; GFX950-AGPRCD-GISEL-LABEL: test_smfmac_i32_32x32x32_bf8_fp8:
 ; GFX950-AGPRCD-GISEL:       ; %bb.0: ; %bb
 ; GFX950-AGPRCD-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
@@ -4303,73 +4249,139 @@ bb:
 }
 
 define amdgpu_kernel void @test_smfmac_i32_32x32x32_fp8_bf8(ptr addrspace(1) %arg, <2 x i32> %a, <4 x i32> %b, i32 %idx) #0 {
-; GFX942-VGPRCD-SDAG-LABEL: test_smfmac_i32_32x32x32_fp8_bf8:
-; GFX942-VGPRCD-SDAG:       ; %bb.0: ; %bb
-; GFX942-VGPRCD-SDAG-NEXT:    s_load_dwordx8 s[16:23], s[4:5], 0x2c
-; GFX942-VGPRCD-SDAG-NEXT:    s_load_dwordx2 s[24:25], s[4:5], 0x24
-; GFX942-VGPRCD-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX942-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v22, s16
-; GFX942-VGPRCD-SDAG-NEXT:    s_load_dwordx16 s[0:15], s[24:25], 0x0
-; GFX942-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v23, s17
-; GFX942-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v18, s18
-; GFX942-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v19, s19
-; GFX942-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v20, s20
-; GFX942-VGPRCD-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX942-VGPRCD-SDAG-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX942-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v21, s21
-; GFX942-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v16, s22
-; GFX942-VGPRCD-SDAG-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
-; GFX942-VGPRCD-SDAG-NEXT:    v_mov_b64_e32 v[4:5], s[4:5]
-; GFX942-VGPRCD-SDAG-NEXT:    v_mov_b64_e32 v[6:7], s[6:7]
-; GFX942-VGPRCD-SDAG-NEXT:    v_mov_b64_e32 v[8:9], s[8:9]
-; GFX942-VGPRCD-SDAG-NEXT:    v_mov_b64_e32 v[10:11], s[10:11]
-; GFX942-VGPRCD-SDAG-NEXT:    v_mov_b64_e32 v[12:13], s[12:13]
-; GFX942-VGPRCD-SDAG-NEXT:    v_mov_b64_e32 v[14:15], s[14:15]
-; GFX942-VGPRCD-SDAG-NEXT:    s_nop 1
-; GFX942-VGPRCD-SDAG-NEXT:    v_smfmac_f32_32x32x32_fp8_bf8 v[0:15], v[22:23], v[18:21], v16 cbsz:1 abid:2
-; GFX942-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v16, 0
-; GFX942-VGPRCD-SDAG-NEXT:    s_nop 7
-; GFX942-VGPRCD-SDAG-NEXT:    s_nop 1
-; GFX942-VGPRCD-SDAG-NEXT:    global_store_dwordx4 v16, v[12:15], s[24:25] offset:48
-; GFX942-VGPRCD-SDAG-NEXT:    global_store_dwordx4 v16, v[8:11], s[24:25] offset:32
-; GFX942-VGPRCD-SDAG-NEXT:    global_store_dwordx4 v16, v[4:7], s[24:25] offset:16
-; GFX942-VGPRCD-SDAG-NEXT:    global_store_dwordx4 v16, v[0:3], s[24:25]
-; GFX942-VGPRCD-SDAG-NEXT:    s_endpgm
+; GFX942-SDAG-LABEL: test_smfmac_i32_32x32x32_fp8_bf8:
+; GFX942-SDAG:       ; %bb.0: ; %bb
+; GFX942-SDAG-NEXT:    s_load_dwordx8 s[16:23], s[4:5], 0x2c
+; GFX942-SDAG-NEXT:    s_load_dwordx2 s[24:25], s[4:5], 0x24
+; GFX942-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v22, s16
+; GFX942-SDAG-NEXT:    s_load_dwordx16 s[0:15], s[24:25], 0x0
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v23, s17
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v18, s18
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v19, s19
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v20, s20
+; GFX942-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-SDAG-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v21, s21
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v16, s22
+; GFX942-SDAG-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
+; GFX942-SDAG-NEXT:    v_mov_b64_e32 v[4:5], s[4:5]
+; GFX942-SDAG-NEXT:    v_mov_b64_e32 v[6:7], s[6:7]
+; GFX942-SDAG-NEXT:    v_mov_b64_e32 v[8:9], s[8:9]
+; GFX942-SDAG-NEXT:    v_mov_b64_e32 v[10:11], s[10:11]
+; GFX942-SDAG-NEXT:    v_mov_b64_e32 v[12:13], s[12:13]
+; GFX942-SDAG-NEXT:    v_mov_b64_e32 v[14:15], s[14:15]
+; GFX942-SDAG-NEXT:    s_nop 1
+; GFX942-SDAG-NEXT:    v_smfmac_f32_32x32x32_fp8_bf8 v[0:15], v[22:23], v[18:21], v16 cbsz:1 abid:2
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v16, 0
+; GFX942-SDAG-NEXT:    s_nop 7
+; GFX942-SDAG-NEXT:    s_nop 1
+; GFX942-SDAG-NEXT:    global_store_dwordx4 v16, v[12:15], s[24:25] offset:48
+; GFX942-SDAG-NEXT:    global_store_dwordx4 v16, v[8:11], s[24:25] offset:32
+; GFX942-SDAG-NEXT:    global_store_dwordx4 v16, v[4:7], s[24:25] offset:16
+; GFX942-SDAG-NEXT:    global_store_dwordx4 v16, v[0:3], s[24:25]
+; GFX942-SDAG-NEXT:    s_endpgm
+;
+; GFX942-GISEL-LABEL: test_smfmac_i32_32x32x32_fp8_bf8:
+; GFX942-GISEL:       ; %bb.0: ; %bb
+; GFX942-GISEL-NEXT:    s_load_dwordx4 s[16:19], s[4:5], 0x2c
+; GFX942-GISEL-NEXT:    s_load_dwordx2 s[24:25], s[4:5], 0x24
+; GFX942-GISEL-NEXT:    s_load_dwordx2 s[22:23], s[4:5], 0x3c
+; GFX942-GISEL-NEXT:    s_load_dword s26, s[4:5], 0x44
+; GFX942-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-GISEL-NEXT:    v_mov_b64_e32 v[20:21], s[16:17]
+; GFX942-GISEL-NEXT:    s_load_dwordx16 s[0:15], s[24:25], 0x0
+; GFX942-GISEL-NEXT:    s_mov_b32 s20, s18
+; GFX942-GISEL-NEXT:    s_mov_b32 s21, s19
+; GFX942-GISEL-NEXT:    v_mov_b64_e32 v[16:17], s[20:21]
+; GFX942-GISEL-NEXT:    v_mov_b32_e32 v22, s26
+; GFX942-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-GISEL-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
+; GFX942-GISEL-NEXT:    v_mov_b64_e32 v[18:19], s[22:23]
+; GFX942-GISEL-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
+; GFX942-GISEL-NEXT:    v_mov_b64_e32 v[4:5], s[4:5]
+; GFX942-GISEL-NEXT:    v_mov_b64_e32 v[6:7], s[6:7]
+; GFX942-GISEL-NEXT:    v_mov_b64_e32 v[8:9], s[8:9]
+; GFX942-GISEL-NEXT:    v_mov_b64_e32 v[10:11], s[10:11]
+; GFX942-GISEL-NEXT:    v_mov_b64_e32 v[12:13], s[12:13]
+; GFX942-GISEL-NEXT:    v_mov_b64_e32 v[14:15], s[14:15]
+; GFX942-GISEL-NEXT:    s_nop 1
+; GFX942-GISEL-NEXT:    v_smfmac_f32_32x32x32_fp8_bf8 v[0:15], v[20:21], v[16:19], v22 cbsz:1 abid:2
+; GFX942-GISEL-NEXT:    v_mov_b32_e32 v16, 0
+; GFX942-GISEL-NEXT:    s_nop 7
+; GFX942-GISEL-NEXT:    s_nop 1
+; GFX942-GISEL-NEXT:    global_store_dwordx4 v16, v[0:3], s[24:25]
+; GFX942-GISEL-NEXT:    global_store_dwordx4 v16, v[4:7], s[24:25] offset:16
+; GFX942-GISEL-NEXT:    global_store_dwordx4 v16, v[8:11], s[24:25] offset:32
+; GFX942-GISEL-NEXT:    global_store_dwordx4 v16, v[12:15], s[24:25] offset:48
+; GFX942-GISEL-NEXT:    s_endpgm
 ;
-; GFX942-VGPRCD-GISEL-LABEL: test_smfmac_i32_32x32x32_fp8_bf8:
-; GFX942-VGPRCD-GISEL:       ; %bb.0: ; %bb
-; GFX942-VGPRCD-GISEL-NEXT:    s_load_dwordx4 s[16:19], s[4:5], 0x2c
-; GFX942-VGPRCD-GISEL-NEXT:    s_load_dwordx2 s[24:25], s[4:5], 0x24
-; GFX942-VGPRCD-GISEL-NEXT:    s_load_dwordx2 s[22:23], s[4:5], 0x3c
-; GFX942-VGPRCD-GISEL-NEXT:    s_load_dword s26, s[4:5], 0x44
-; GFX942-VGPRCD-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX942-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[20:21], s[16:17]
-; GFX942-VGPRCD-GISEL-NEXT:    s_load_dwordx16 s[0:15], s[24:25], 0x0
-; GFX942-VGPRCD-GISEL-NEXT:    s_mov_b32 s20, s18
-; GFX942-VGPRCD-GISEL-NEXT:    s_mov_b32 s21, s19
-; GFX942-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[16:17], s[20:21]
-; GFX942-VGPRCD-GISEL-NEXT:    v_mov_b32_e32 v22, s26
-; GFX942-VGPRCD-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX942-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX942-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[18:19], s[22:23]
-; GFX942-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
-; GFX942-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[4:5], s[4:5]
-; GFX942-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[6:7], s[6:7]
-; GFX942-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[8:9], s[8:9]
-; GFX942-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[10:11], s[10:11]
-; GFX942-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[12:13], s[12:13]
-; GFX942-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[14:15], s[14:15]
-; GFX942-VGPRCD-GISEL-NEXT:    s_nop 1
-; GFX942-VGPRCD-GISEL-NEXT:    v_smfmac_f32_32x32x32_fp8_bf8 v[0:15], v[20:21], v[16:19], v22 cbsz:1 abid:2
-; GFX942-VGPRCD-GISEL-NEXT:    v_mov_b32_e32 v16, 0
-; GFX942-VGPRCD-GISEL-NEXT:    s_nop 7
-; GFX942-VGPRCD-GISEL-NEXT:    s_nop 1
-; GFX942-VGPRCD-GISEL-NEXT:    global_store_dwordx4 v16, v[0:3], s[24:25]
-; GFX942-VGPRCD-GISEL-NEXT:    global_store_dwordx4 v16, v[4:7], s[24:25] offset:16
-; GFX942-VGPRCD-GISEL-NEXT:    global_store_dwordx4 v16, v[8:11], s[24:25] offset:32
-; GFX942-VGPRCD-GISEL-NEXT:    global_store_dwordx4 v16, v[12:15], s[24:25] offset:48
-; GFX942-VGPRCD-GISEL-NEXT:    s_endpgm
+; GFX950-SDAG-LABEL: test_smfmac_i32_32x32x32_fp8_bf8:
+; GFX950-SDAG:       ; %bb.0: ; %bb
+; GFX950-SDAG-NEXT:    s_load_dwordx8 s[16:23], s[4:5], 0x2c
+; GFX950-SDAG-NEXT:    s_load_dwordx2 s[24:25], s[4:5], 0x24
+; GFX950-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v22, s16
+; GFX950-SDAG-NEXT:    s_load_dwordx16 s[0:15], s[24:25], 0x0
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v23, s17
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v18, s18
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v19, s19
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v20, s20
+; GFX950-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX950-SDAG-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v21, s21
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v16, s22
+; GFX950-SDAG-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
+; GFX950-SDAG-NEXT:    v_mov_b64_e32 v[4:5], s[4:5]
+; GFX950-SDAG-NEXT:    v_mov_b64_e32 v[6:7], s[6:7]
+; GFX950-SDAG-NEXT:    v_mov_b64_e32 v[8:9], s[8:9]
+; GFX950-SDAG-NEXT:    v_mov_b64_e32 v[10:11], s[10:11]
+; GFX950-SDAG-NEXT:    v_mov_b64_e32 v[12:13], s[12:13]
+; GFX950-SDAG-NEXT:    v_mov_b64_e32 v[14:15], s[14:15]
+; GFX950-SDAG-NEXT:    s_nop 1
+; GFX950-SDAG-NEXT:    v_smfmac_f32_32x32x32_fp8_bf8 v[0:15], v[22:23], v[18:21], v16 cbsz:1 abid:2
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v16, 0
+; GFX950-SDAG-NEXT:    s_nop 7
+; GFX950-SDAG-NEXT:    s_nop 2
+; GFX950-SDAG-NEXT:    global_store_dwordx4 v16, v[12:15], s[24:25] offset:48
+; GFX950-SDAG-NEXT:    global_store_dwordx4 v16, v[8:11], s[24:25] offset:32
+; GFX950-SDAG-NEXT:    global_store_dwordx4 v16, v[4:7], s[24:25] offset:16
+; GFX950-SDAG-NEXT:    global_store_dwordx4 v16, v[0:3], s[24:25]
+; GFX950-SDAG-NEXT:    s_endpgm
 ;
+; GFX950-GISEL-LABEL: test_smfmac_i32_32x32x32_fp8_bf8:
+; GFX950-GISEL:       ; %bb.0: ; %bb
+; GFX950-GISEL-NEXT:    s_load_dwordx4 s[16:19], s[4:5], 0x2c
+; GFX950-GISEL-NEXT:    s_load_dwordx2 s[24:25], s[4:5], 0x24
+; GFX950-GISEL-NEXT:    s_load_dwordx2 s[22:23], s[4:5], 0x3c
+; GFX950-GISEL-NEXT:    s_load_dword s26, s[4:5], 0x44
+; GFX950-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX950-GISEL-NEXT:    v_mov_b64_e32 v[20:21], s[16:17]
+; GFX950-GISEL-NEXT:    s_load_dwordx16 s[0:15], s[24:25], 0x0
+; GFX950-GISEL-NEXT:    s_mov_b32 s20, s18
+; GFX950-GISEL-NEXT:    s_mov_b32 s21, s19
+; GFX950-GISEL-NEXT:    v_mov_b64_e32 v[16:17], s[20:21]
+; GFX950-GISEL-NEXT:    v_mov_b32_e32 v22, s26
+; GFX950-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX950-GISEL-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
+; GFX950-GISEL-NEXT:    v_mov_b64_e32 v[18:19], s[22:23]
+; GFX950-GISEL-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
+; GFX950-GISEL-NEXT:    v_mov_b64_e32 v[4:5], s[4:5]
+; GFX950-GISEL-NEXT:    v_mov_b64_e32 v[6:7], s[6:7]
+; GFX950-GISEL-NEXT:    v_mov_b64_e32 v[8:9], s[8:9]
+; GFX950-GISEL-NEXT:    v_mov_b64_e32 v[10:11], s[10:11]
+; GFX950-GISEL-NEXT:    v_mov_b64_e32 v[12:13], s[12:13]
+; GFX950-GISEL-NEXT:    v_mov_b64_e32 v[14:15], s[14:15]
+; GFX950-GISEL-NEXT:    s_nop 1
+; GFX950-GISEL-NEXT:    v_smfmac_f32_32x32x32_fp8_bf8 v[0:15], v[20:21], v[16:19], v22 cbsz:1 abid:2
+; GFX950-GISEL-NEXT:    v_mov_b32_e32 v16, 0
+; GFX950-GISEL-NEXT:    s_nop 7
+; GFX950-GISEL-NEXT:    s_nop 2
+; GFX950-GISEL-NEXT:    global_store_dwordx4 v16, v[0:3], s[24:25]
+; GFX950-GISEL-NEXT:    global_store_dwordx4 v16, v[4:7], s[24:25] offset:16
+; GFX950-GISEL-NEXT:    global_store_dwordx4 v16, v[8:11], s[24:25] offset:32
+; GFX950-GISEL-NEXT:    global_store_dwordx4 v16, v[12:15], s[24:25] offset:48
+; GFX950-GISEL-NEXT:    s_endpgm
 ; GFX942-AGPRCD-SDAG-LABEL: test_smfmac_i32_32x32x32_fp8_bf8:
 ; GFX942-AGPRCD-SDAG:       ; %bb.0: ; %bb
 ; GFX942-AGPRCD-SDAG-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
@@ -4411,7 +4423,6 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_fp8_bf8(ptr addrspace(1) %ar
 ; GFX942-AGPRCD-SDAG-NEXT:    global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
 ; GFX942-AGPRCD-SDAG-NEXT:    global_store_dwordx4 v0, a[0:3], s[0:1]
 ; GFX942-AGPRCD-SDAG-NEXT:    s_endpgm
-;
 ; GFX942-AGPRCD-GISEL-LABEL: test_smfmac_i32_32x32x32_fp8_bf8:
 ; GFX942-AGPRCD-GISEL:       ; %bb.0: ; %bb
 ; GFX942-AGPRCD-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
@@ -4454,74 +4465,6 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_fp8_bf8(ptr addrspace(1) %ar
 ; GFX942-AGPRCD-GISEL-NEXT:    global_store_dwordx4 v0, a[8:11], s[0:1] offset:32
 ; GFX942-AGPRCD-GISEL-NEXT:    global_store_dwordx4 v0, a[12:15], s[0:1] offset:48
 ; GFX942-AGPRCD-GISEL-NEXT:    s_endpgm
-;
-; GFX950-VGPRCD-SDAG-LABEL: test_smfmac_i32_32x32x32_fp8_bf8:
-; GFX950-VGPRCD-SDAG:       ; %bb.0: ; %bb
-; GFX950-VGPRCD-SDAG-NEXT:    s_load_dwordx8 s[16:23], s[4:5], 0x2c
-; GFX950-VGPRCD-SDAG-NEXT:    s_load_dwordx2 s[24:25], s[4:5], 0x24
-; GFX950-VGPRCD-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX950-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v22, s16
-; GFX950-VGPRCD-SDAG-NEXT:    s_load_dwordx16 s[0:15], s[24:25], 0x0
-; GFX950-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v23, s17
-; GFX950-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v18, s18
-; GFX950-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v19, s19
-; GFX950-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v20, s20
-; GFX950-VGPRCD-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX950-VGPRCD-SDAG-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX950-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v21, s21
-; GFX950-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v16, s22
-; GFX950-VGPRCD-SDAG-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
-; GFX950-VGPRCD-SDAG-NEXT:    v_mov_b64_e32 v[4:5], s[4:5]
-; GFX950-VGPRCD-SDAG-NEXT:    v_mov_b64_e32 v[6:7], s[6:7]
-; GFX950-VGPRCD-SDAG-NEXT:    v_mov_b64_e32 v[8:9], s[8:9]
-; GFX950-VGPRCD-SDAG-NEXT:    v_mov_b64_e32 v[10:11], s[10:11]
-; GFX950-VGPRCD-SDAG-NEXT:    v_mov_b64_e32 v[12:13], s[12:13]
-; GFX950-VGPRCD-SDAG-NEXT:    v_mov_b64_e32 v[14:15], s[14:15]
-; GFX950-VGPRCD-SDAG-NEXT:    s_nop 1
-; GFX950-VGPRCD-SDAG-NEXT:    v_smfmac_f32_32x32x32_fp8_bf8 v[0:15], v[22:23], v[18:21], v16 cbsz:1 abid:2
-; GFX950-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v16, 0
-; GFX950-VGPRCD-SDAG-NEXT:    s_nop 7
-; GFX950-VGPRCD-SDAG-NEXT:    s_nop 2
-; GFX950-VGPRCD-SDAG-NEXT:    global_store_dwordx4 v16, v[12:15], s[24:25] offset:48
-; GFX950-VGPRCD-SDAG-NEXT:    global_store_dwordx4 v16, v[8:11], s[24:25] offset:32
-; GFX950-VGPRCD-SDAG-NEXT:    global_store_dwordx4 v16, v[4:7], s[24:25] offset:16
-; GFX950-VGPRCD-SDAG-NEXT:    global_store_dwordx4 v16, v[0:3], s[24:25]
-; GFX950-VGPRCD-SDAG-NEXT:    s_endpgm
-;
-; GFX950-VGPRCD-GISEL-LABEL: test_smfmac_i32_32x32x32_fp8_bf8:
-; GFX950-VGPRCD-GISEL:       ; %bb.0: ; %bb
-; GFX950-VGPRCD-GISEL-NEXT:    s_load_dwordx4 s[16:19], s[4:5], 0x2c
-; GFX950-VGPRCD-GISEL-NEXT:    s_load_dwordx2 s[24:25], s[4:5], 0x24
-; GFX950-VGPRCD-GISEL-NEXT:    s_load_dwordx2 s[22:23], s[4:5], 0x3c
-; GFX950-VGPRCD-GISEL-NEXT:    s_load_dword s26, s[4:5], 0x44
-; GFX950-VGPRCD-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX950-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[20:21], s[16:17]
-; GFX950-VGPRCD-GISEL-NEXT:    s_load_dwordx16 s[0:15], s[24:25], 0x0
-; GFX950-VGPRCD-GISEL-NEXT:    s_mov_b32 s20, s18
-; GFX950-VGPRCD-GISEL-NEXT:    s_mov_b32 s21, s19
-; GFX950-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[16:17], s[20:21]
-; GFX950-VGPRCD-GISEL-NEXT:    v_mov_b32_e32 v22, s26
-; GFX950-VGPRCD-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX950-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX950-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[18:19], s[22:23]
-; GFX950-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
-; GFX950-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[4:5], s[4:5]
-; GFX950-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[6:7], s[6:7]
-; GFX950-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[8:9], s[8:9]
-; GFX950-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[10:11], s[10:11]
-; GFX950-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[12:13], s[12:13]
-; GFX950-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[14:15], s[14:15]
-; GFX950-VGPRCD-GISEL-NEXT:    s_nop 1
-; GFX950-VGPRCD-GISEL-NEXT:    v_smfmac_f32_32x32x32_fp8_bf8 v[0:15], v[20:21], v[16:19], v22 cbsz:1 abid:2
-; GFX950-VGPRCD-GISEL-NEXT:    v_mov_b32_e32 v16, 0
-; GFX950-VGPRCD-GISEL-NEXT:    s_nop 7
-; GFX950-VGPRCD-GISEL-NEXT:    s_nop 2
-; GFX950-VGPRCD-GISEL-NEXT:    global_store_dwordx4 v16, v[0:3], s[24:25]
-; GFX950-VGPRCD-GISEL-NEXT:    global_store_dwordx4 v16, v[4:7], s[24:25] offset:16
-; GFX950-VGPRCD-GISEL-NEXT:    global_store_dwordx4 v16, v[8:11], s[24:25] offset:32
-; GFX950-VGPRCD-GISEL-NEXT:    global_store_dwordx4 v16, v[12:15], s[24:25] offset:48
-; GFX950-VGPRCD-GISEL-NEXT:    s_endpgm
-;
 ; GFX950-AGPRCD-SDAG-LABEL: test_smfmac_i32_32x32x32_fp8_bf8:
 ; GFX950-AGPRCD-SDAG:       ; %bb.0: ; %bb
 ; GFX950-AGPRCD-SDAG-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
@@ -4563,7 +4506,6 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_fp8_bf8(ptr addrspace(1) %ar
 ; GFX950-AGPRCD-SDAG-NEXT:    global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
 ; GFX950-AGPRCD-SDAG-NEXT:    global_store_dwordx4 v0, a[0:3], s[0:1]
 ; GFX950-AGPRCD-SDAG-NEXT:    s_endpgm
-;
 ; GFX950-AGPRCD-GISEL-LABEL: test_smfmac_i32_32x32x32_fp8_bf8:
 ; GFX950-AGPRCD-GISEL:       ; %bb.0: ; %bb
 ; GFX950-AGPRCD-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
@@ -4614,73 +4556,139 @@ bb:
 }
 
 define amdgpu_kernel void @test_smfmac_i32_32x32x32_fp8_fp8(ptr addrspace(1) %arg, <2 x i32> %a, <4 x i32> %b, i32 %idx) #0 {
-; GFX942-VGPRCD-SDAG-LABEL: test_smfmac_i32_32x32x32_fp8_fp8:
-; GFX942-VGPRCD-SDAG:       ; %bb.0: ; %bb
-; GFX942-VGPRCD-SDAG-NEXT:    s_load_dwordx8 s[16:23], s[4:5], 0x2c
-; GFX942-VGPRCD-SDAG-NEXT:    s_load_dwordx2 s[24:25], s[4:5], 0x24
-; GFX942-VGPRCD-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX942-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v22, s16
-; GFX942-VGPRCD-SDAG-NEXT:    s_load_dwordx16 s[0:15], s[24:25], 0x0
-; GFX942-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v23, s17
-; GFX942-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v18, s18
-; GFX942-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v19, s19
-; GFX942-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v20, s20
-; GFX942-VGPRCD-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX942-VGPRCD-SDAG-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX942-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v21, s21
-; GFX942-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v16, s22
-; GFX942-VGPRCD-SDAG-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
-; GFX942-VGPRCD-SDAG-NEXT:    v_mov_b64_e32 v[4:5], s[4:5]
-; GFX942-VGPRCD-SDAG-NEXT:    v_mov_b64_e32 v[6:7], s[6:7]
-; GFX942-VGPRCD-SDAG-NEXT:    v_mov_b64_e32 v[8:9], s[8:9]
-; GFX942-VGPRCD-SDAG-NEXT:    v_mov_b64_e32 v[10:11], s[10:11]
-; GFX942-VGPRCD-SDAG-NEXT:    v_mov_b64_e32 v[12:13], s[12:13]
-; GFX942-VGPRCD-SDAG-NEXT:    v_mov_b64_e32 v[14:15], s[14:15]
-; GFX942-VGPRCD-SDAG-NEXT:    s_nop 1
-; GFX942-VGPRCD-SDAG-NEXT:    v_smfmac_f32_32x32x32_fp8_fp8 v[0:15], v[22:23], v[18:21], v16 cbsz:1 abid:2
-; GFX942-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v16, 0
-; GFX942-VGPRCD-SDAG-NEXT:    s_nop 7
-; GFX942-VGPRCD-SDAG-NEXT:    s_nop 1
-; GFX942-VGPRCD-SDAG-NEXT:    global_store_dwordx4 v16, v[12:15], s[24:25] offset:48
-; GFX942-VGPRCD-SDAG-NEXT:    global_store_dwordx4 v16, v[8:11], s[24:25] offset:32
-; GFX942-VGPRCD-SDAG-NEXT:    global_store_dwordx4 v16, v[4:7], s[24:25] offset:16
-; GFX942-VGPRCD-SDAG-NEXT:    global_store_dwordx4 v16, v[0:3], s[24:25]
-; GFX942-VGPRCD-SDAG-NEXT:    s_endpgm
+; GFX942-SDAG-LABEL: test_smfmac_i32_32x32x32_fp8_fp8:
+; GFX942-SDAG:       ; %bb.0: ; %bb
+; GFX942-SDAG-NEXT:    s_load_dwordx8 s[16:23], s[4:5], 0x2c
+; GFX942-SDAG-NEXT:    s_load_dwordx2 s[24:25], s[4:5], 0x24
+; GFX942-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v22, s16
+; GFX942-SDAG-NEXT:    s_load_dwordx16 s[0:15], s[24:25], 0x0
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v23, s17
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v18, s18
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v19, s19
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v20, s20
+; GFX942-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-SDAG-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v21, s21
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v16, s22
+; GFX942-SDAG-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
+; GFX942-SDAG-NEXT:    v_mov_b64_e32 v[4:5], s[4:5]
+; GFX942-SDAG-NEXT:    v_mov_b64_e32 v[6:7], s[6:7]
+; GFX942-SDAG-NEXT:    v_mov_b64_e32 v[8:9], s[8:9]
+; GFX942-SDAG-NEXT:    v_mov_b64_e32 v[10:11], s[10:11]
+; GFX942-SDAG-NEXT:    v_mov_b64_e32 v[12:13], s[12:13]
+; GFX942-SDAG-NEXT:    v_mov_b64_e32 v[14:15], s[14:15]
+; GFX942-SDAG-NEXT:    s_nop 1
+; GFX942-SDAG-NEXT:    v_smfmac_f32_32x32x32_fp8_fp8 v[0:15], v[22:23], v[18:21], v16 cbsz:1 abid:2
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v16, 0
+; GFX942-SDAG-NEXT:    s_nop 7
+; GFX942-SDAG-NEXT:    s_nop 1
+; GFX942-SDAG-NEXT:    global_store_dwordx4 v16, v[12:15], s[24:25] offset:48
+; GFX942-SDAG-NEXT:    global_store_dwordx4 v16, v[8:11], s[24:25] offset:32
+; GFX942-SDAG-NEXT:    global_store_dwordx4 v16, v[4:7], s[24:25] offset:16
+; GFX942-SDAG-NEXT:    global_store_dwordx4 v16, v[0:3], s[24:25]
+; GFX942-SDAG-NEXT:    s_endpgm
+;
+; GFX942-GISEL-LABEL: test_smfmac_i32_32x32x32_fp8_fp8:
+; GFX942-GISEL:       ; %bb.0: ; %bb
+; GFX942-GISEL-NEXT:    s_load_dwordx4 s[16:19], s[4:5], 0x2c
+; GFX942-GISEL-NEXT:    s_load_dwordx2 s[24:25], s[4:5], 0x24
+; GFX942-GISEL-NEXT:    s_load_dwordx2 s[22:23], s[4:5], 0x3c
+; GFX942-GISEL-NEXT:    s_load_dword s26, s[4:5], 0x44
+; GFX942-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-GISEL-NEXT:    v_mov_b64_e32 v[20:21], s[16:17]
+; GFX942-GISEL-NEXT:    s_load_dwordx16 s[0:15], s[24:25], 0x0
+; GFX942-GISEL-NEXT:    s_mov_b32 s20, s18
+; GFX942-GISEL-NEXT:    s_mov_b32 s21, s19
+; GFX942-GISEL-NEXT:    v_mov_b64_e32 v[16:17], s[20:21]
+; GFX942-GISEL-NEXT:    v_mov_b32_e32 v22, s26
+; GFX942-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-GISEL-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
+; GFX942-GISEL-NEXT:    v_mov_b64_e32 v[18:19], s[22:23]
+; GFX942-GISEL-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
+; GFX942-GISEL-NEXT:    v_mov_b64_e32 v[4:5], s[4:5]
+; GFX942-GISEL-NEXT:    v_mov_b64_e32 v[6:7], s[6:7]
+; GFX942-GISEL-NEXT:    v_mov_b64_e32 v[8:9], s[8:9]
+; GFX942-GISEL-NEXT:    v_mov_b64_e32 v[10:11], s[10:11]
+; GFX942-GISEL-NEXT:    v_mov_b64_e32 v[12:13], s[12:13]
+; GFX942-GISEL-NEXT:    v_mov_b64_e32 v[14:15], s[14:15]
+; GFX942-GISEL-NEXT:    s_nop 1
+; GFX942-GISEL-NEXT:    v_smfmac_f32_32x32x32_fp8_fp8 v[0:15], v[20:21], v[16:19], v22 cbsz:1 abid:2
+; GFX942-GISEL-NEXT:    v_mov_b32_e32 v16, 0
+; GFX942-GISEL-NEXT:    s_nop 7
+; GFX942-GISEL-NEXT:    s_nop 1
+; GFX942-GISEL-NEXT:    global_store_dwordx4 v16, v[0:3], s[24:25]
+; GFX942-GISEL-NEXT:    global_store_dwordx4 v16, v[4:7], s[24:25] offset:16
+; GFX942-GISEL-NEXT:    global_store_dwordx4 v16, v[8:11], s[24:25] offset:32
+; GFX942-GISEL-NEXT:    global_store_dwordx4 v16, v[12:15], s[24:25] offset:48
+; GFX942-GISEL-NEXT:    s_endpgm
 ;
-; GFX942-VGPRCD-GISEL-LABEL: test_smfmac_i32_32x32x32_fp8_fp8:
-; GFX942-VGPRCD-GISEL:       ; %bb.0: ; %bb
-; GFX942-VGPRCD-GISEL-NEXT:    s_load_dwordx4 s[16:19], s[4:5], 0x2c
-; GFX942-VGPRCD-GISEL-NEXT:    s_load_dwordx2 s[24:25], s[4:5], 0x24
-; GFX942-VGPRCD-GISEL-NEXT:    s_load_dwordx2 s[22:23], s[4:5], 0x3c
-; GFX942-VGPRCD-GISEL-NEXT:    s_load_dword s26, s[4:5], 0x44
-; GFX942-VGPRCD-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX942-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[20:21], s[16:17]
-; GFX942-VGPRCD-GISEL-NEXT:    s_load_dwordx16 s[0:15], s[24:25], 0x0
-; GFX942-VGPRCD-GISEL-NEXT:    s_mov_b32 s20, s18
-; GFX942-VGPRCD-GISEL-NEXT:    s_mov_b32 s21, s19
-; GFX942-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[16:17], s[20:21]
-; GFX942-VGPRCD-GISEL-NEXT:    v_mov_b32_e32 v22, s26
-; GFX942-VGPRCD-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX942-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX942-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[18:19], s[22:23]
-; GFX942-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
-; GFX942-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[4:5], s[4:5]
-; GFX942-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[6:7], s[6:7]
-; GFX942-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[8:9], s[8:9]
-; GFX942-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[10:11], s[10:11]
-; GFX942-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[12:13], s[12:13]
-; GFX942-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[14:15], s[14:15]
-; GFX942-VGPRCD-GISEL-NEXT:    s_nop 1
-; GFX942-VGPRCD-GISEL-NEXT:    v_smfmac_f32_32x32x32_fp8_fp8 v[0:15], v[20:21], v[16:19], v22 cbsz:1 abid:2
-; GFX942-VGPRCD-GISEL-NEXT:    v_mov_b32_e32 v16, 0
-; GFX942-VGPRCD-GISEL-NEXT:    s_nop 7
-; GFX942-VGPRCD-GISEL-NEXT:    s_nop 1
-; GFX942-VGPRCD-GISEL-NEXT:    global_store_dwordx4 v16, v[0:3], s[24:25]
-; GFX942-VGPRCD-GISEL-NEXT:    global_store_dwordx4 v16, v[4:7], s[24:25] offset:16
-; GFX942-VGPRCD-GISEL-NEXT:    global_store_dwordx4 v16, v[8:11], s[24:25] offset:32
-; GFX942-VGPRCD-GISEL-NEXT:    global_store_dwordx4 v16, v[12:15], s[24:25] offset:48
-; GFX942-VGPRCD-GISEL-NEXT:    s_endpgm
+; GFX950-SDAG-LABEL: test_smfmac_i32_32x32x32_fp8_fp8:
+; GFX950-SDAG:       ; %bb.0: ; %bb
+; GFX950-SDAG-NEXT:    s_load_dwordx8 s[16:23], s[4:5], 0x2c
+; GFX950-SDAG-NEXT:    s_load_dwordx2 s[24:25], s[4:5], 0x24
+; GFX950-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v22, s16
+; GFX950-SDAG-NEXT:    s_load_dwordx16 s[0:15], s[24:25], 0x0
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v23, s17
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v18, s18
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v19, s19
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v20, s20
+; GFX950-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX950-SDAG-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v21, s21
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v16, s22
+; GFX950-SDAG-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
+; GFX950-SDAG-NEXT:    v_mov_b64_e32 v[4:5], s[4:5]
+; GFX950-SDAG-NEXT:    v_mov_b64_e32 v[6:7], s[6:7]
+; GFX950-SDAG-NEXT:    v_mov_b64_e32 v[8:9], s[8:9]
+; GFX950-SDAG-NEXT:    v_mov_b64_e32 v[10:11], s[10:11]
+; GFX950-SDAG-NEXT:    v_mov_b64_e32 v[12:13], s[12:13]
+; GFX950-SDAG-NEXT:    v_mov_b64_e32 v[14:15], s[14:15]
+; GFX950-SDAG-NEXT:    s_nop 1
+; GFX950-SDAG-NEXT:    v_smfmac_f32_32x32x32_fp8_fp8 v[0:15], v[22:23], v[18:21], v16 cbsz:1 abid:2
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v16, 0
+; GFX950-SDAG-NEXT:    s_nop 7
+; GFX950-SDAG-NEXT:    s_nop 2
+; GFX950-SDAG-NEXT:    global_store_dwordx4 v16, v[12:15], s[24:25] offset:48
+; GFX950-SDAG-NEXT:    global_store_dwordx4 v16, v[8:11], s[24:25] offset:32
+; GFX950-SDAG-NEXT:    global_store_dwordx4 v16, v[4:7], s[24:25] offset:16
+; GFX950-SDAG-NEXT:    global_store_dwordx4 v16, v[0:3], s[24:25]
+; GFX950-SDAG-NEXT:    s_endpgm
 ;
+; GFX950-GISEL-LABEL: test_smfmac_i32_32x32x32_fp8_fp8:
+; GFX950-GISEL:       ; %bb.0: ; %bb
+; GFX950-GISEL-NEXT:    s_load_dwordx4 s[16:19], s[4:5], 0x2c
+; GFX950-GISEL-NEXT:    s_load_dwordx2 s[24:25], s[4:5], 0x24
+; GFX950-GISEL-NEXT:    s_load_dwordx2 s[22:23], s[4:5], 0x3c
+; GFX950-GISEL-NEXT:    s_load_dword s26, s[4:5], 0x44
+; GFX950-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX950-GISEL-NEXT:    v_mov_b64_e32 v[20:21], s[16:17]
+; GFX950-GISEL-NEXT:    s_load_dwordx16 s[0:15], s[24:25], 0x0
+; GFX950-GISEL-NEXT:    s_mov_b32 s20, s18
+; GFX950-GISEL-NEXT:    s_mov_b32 s21, s19
+; GFX950-GISEL-NEXT:    v_mov_b64_e32 v[16:17], s[20:21]
+; GFX950-GISEL-NEXT:    v_mov_b32_e32 v22, s26
+; GFX950-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX950-GISEL-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
+; GFX950-GISEL-NEXT:    v_mov_b64_e32 v[18:19], s[22:23]
+; GFX950-GISEL-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
+; GFX950-GISEL-NEXT:    v_mov_b64_e32 v[4:5], s[4:5]
+; GFX950-GISEL-NEXT:    v_mov_b64_e32 v[6:7], s[6:7]
+; GFX950-GISEL-NEXT:    v_mov_b64_e32 v[8:9], s[8:9]
+; GFX950-GISEL-NEXT:    v_mov_b64_e32 v[10:11], s[10:11]
+; GFX950-GISEL-NEXT:    v_mov_b64_e32 v[12:13], s[12:13]
+; GFX950-GISEL-NEXT:    v_mov_b64_e32 v[14:15], s[14:15]
+; GFX950-GISEL-NEXT:    s_nop 1
+; GFX950-GISEL-NEXT:    v_smfmac_f32_32x32x32_fp8_fp8 v[0:15], v[20:21], v[16:19], v22 cbsz:1 abid:2
+; GFX950-GISEL-NEXT:    v_mov_b32_e32 v16, 0
+; GFX950-GISEL-NEXT:    s_nop 7
+; GFX950-GISEL-NEXT:    s_nop 2
+; GFX950-GISEL-NEXT:    global_store_dwordx4 v16, v[0:3], s[24:25]
+; GFX950-GISEL-NEXT:    global_store_dwordx4 v16, v[4:7], s[24:25] offset:16
+; GFX950-GISEL-NEXT:    global_store_dwordx4 v16, v[8:11], s[24:25] offset:32
+; GFX950-GISEL-NEXT:    global_store_dwordx4 v16, v[12:15], s[24:25] offset:48
+; GFX950-GISEL-NEXT:    s_endpgm
 ; GFX942-AGPRCD-SDAG-LABEL: test_smfmac_i32_32x32x32_fp8_fp8:
 ; GFX942-AGPRCD-SDAG:       ; %bb.0: ; %bb
 ; GFX942-AGPRCD-SDAG-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
@@ -4722,7 +4730,6 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_fp8_fp8(ptr addrspace(1) %ar
 ; GFX942-AGPRCD-SDAG-NEXT:    global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
 ; GFX942-AGPRCD-SDAG-NEXT:    global_store_dwordx4 v0, a[0:3], s[0:1]
 ; GFX942-AGPRCD-SDAG-NEXT:    s_endpgm
-;
 ; GFX942-AGPRCD-GISEL-LABEL: test_smfmac_i32_32x32x32_fp8_fp8:
 ; GFX942-AGPRCD-GISEL:       ; %bb.0: ; %bb
 ; GFX942-AGPRCD-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
@@ -4765,74 +4772,6 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_fp8_fp8(ptr addrspace(1) %ar
 ; GFX942-AGPRCD-GISEL-NEXT:    global_store_dwordx4 v0, a[8:11], s[0:1] offset:32
 ; GFX942-AGPRCD-GISEL-NEXT:    global_store_dwordx4 v0, a[12:15], s[0:1] offset:48
 ; GFX942-AGPRCD-GISEL-NEXT:    s_endpgm
-;
-; GFX950-VGPRCD-SDAG-LABEL: test_smfmac_i32_32x32x32_fp8_fp8:
-; GFX950-VGPRCD-SDAG:       ; %bb.0: ; %bb
-; GFX950-VGPRCD-SDAG-NEXT:    s_load_dwordx8 s[16:23], s[4:5], 0x2c
-; GFX950-VGPRCD-SDAG-NEXT:    s_load_dwordx2 s[24:25], s[4:5], 0x24
-; GFX950-VGPRCD-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX950-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v22, s16
-; GFX950-VGPRCD-SDAG-NEXT:    s_load_dwordx16 s[0:15], s[24:25], 0x0
-; GFX950-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v23, s17
-; GFX950-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v18, s18
-; GFX950-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v19, s19
-; GFX950-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v20, s20
-; GFX950-VGPRCD-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX950-VGPRCD-SDAG-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX950-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v21, s21
-; GFX950-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v16, s22
-; GFX950-VGPRCD-SDAG-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
-; GFX950-VGPRCD-SDAG-NEXT:    v_mov_b64_e32 v[4:5], s[4:5]
-; GFX950-VGPRCD-SDAG-NEXT:    v_mov_b64_e32 v[6:7], s[6:7]
-; GFX950-VGPRCD-SDAG-NEXT:    v_mov_b64_e32 v[8:9], s[8:9]
-; GFX950-VGPRCD-SDAG-NEXT:    v_mov_b64_e32 v[10:11], s[10:11]
-; GFX950-VGPRCD-SDAG-NEXT:    v_mov_b64_e32 v[12:13], s[12:13]
-; GFX950-VGPRCD-SDAG-NEXT:    v_mov_b64_e32 v[14:15], s[14:15]
-; GFX950-VGPRCD-SDAG-NEXT:    s_nop 1
-; GFX950-VGPRCD-SDAG-NEXT:    v_smfmac_f32_32x32x32_fp8_fp8 v[0:15], v[22:23], v[18:21], v16 cbsz:1 abid:2
-; GFX950-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v16, 0
-; GFX950-VGPRCD-SDAG-NEXT:    s_nop 7
-; GFX950-VGPRCD-SDAG-NEXT:    s_nop 2
-; GFX950-VGPRCD-SDAG-NEXT:    global_store_dwordx4 v16, v[12:15], s[24:25] offset:48
-; GFX950-VGPRCD-SDAG-NEXT:    global_store_dwordx4 v16, v[8:11], s[24:25] offset:32
-; GFX950-VGPRCD-SDAG-NEXT:    global_store_dwordx4 v16, v[4:7], s[24:25] offset:16
-; GFX950-VGPRCD-SDAG-NEXT:    global_store_dwordx4 v16, v[0:3], s[24:25]
-; GFX950-VGPRCD-SDAG-NEXT:    s_endpgm
-;
-; GFX950-VGPRCD-GISEL-LABEL: test_smfmac_i32_32x32x32_fp8_fp8:
-; GFX950-VGPRCD-GISEL:       ; %bb.0: ; %bb
-; GFX950-VGPRCD-GISEL-NEXT:    s_load_dwordx4 s[16:19], s[4:5], 0x2c
-; GFX950-VGPRCD-GISEL-NEXT:    s_load_dwordx2 s[24:25], s[4:5], 0x24
-; GFX950-VGPRCD-GISEL-NEXT:    s_load_dwordx2 s[22:23], s[4:5], 0x3c
-; GFX950-VGPRCD-GISEL-NEXT:    s_load_dword s26, s[4:5], 0x44
-; GFX950-VGPRCD-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX950-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[20:21], s[16:17]
-; GFX950-VGPRCD-GISEL-NEXT:    s_load_dwordx16 s[0:15], s[24:25], 0x0
-; GFX950-VGPRCD-GISEL-NEXT:    s_mov_b32 s20, s18
-; GFX950-VGPRCD-GISEL-NEXT:    s_mov_b32 s21, s19
-; GFX950-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[16:17], s[20:21]
-; GFX950-VGPRCD-GISEL-NEXT:    v_mov_b32_e32 v22, s26
-; GFX950-VGPRCD-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX950-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX950-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[18:19], s[22:23]
-; GFX950-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
-; GFX950-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[4:5], s[4:5]
-; GFX950-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[6:7], s[6:7]
-; GFX950-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[8:9], s[8:9]
-; GFX950-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[10:11], s[10:11]
-; GFX950-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[12:13], s[12:13]
-; GFX950-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[14:15], s[14:15]
-; GFX950-VGPRCD-GISEL-NEXT:    s_nop 1
-; GFX950-VGPRCD-GISEL-NEXT:    v_smfmac_f32_32x32x32_fp8_fp8 v[0:15], v[20:21], v[16:19], v22 cbsz:1 abid:2
-; GFX950-VGPRCD-GISEL-NEXT:    v_mov_b32_e32 v16, 0
-; GFX950-VGPRCD-GISEL-NEXT:    s_nop 7
-; GFX950-VGPRCD-GISEL-NEXT:    s_nop 2
-; GFX950-VGPRCD-GISEL-NEXT:    global_store_dwordx4 v16, v[0:3], s[24:25]
-; GFX950-VGPRCD-GISEL-NEXT:    global_store_dwordx4 v16, v[4:7], s[24:25] offset:16
-; GFX950-VGPRCD-GISEL-NEXT:    global_store_dwordx4 v16, v[8:11], s[24:25] offset:32
-; GFX950-VGPRCD-GISEL-NEXT:    global_store_dwordx4 v16, v[12:15], s[24:25] offset:48
-; GFX950-VGPRCD-GISEL-NEXT:    s_endpgm
-;
 ; GFX950-AGPRCD-SDAG-LABEL: test_smfmac_i32_32x32x32_fp8_fp8:
 ; GFX950-AGPRCD-SDAG:       ; %bb.0: ; %bb
 ; GFX950-AGPRCD-SDAG-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
@@ -4874,7 +4813,6 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_fp8_fp8(ptr addrspace(1) %ar
 ; GFX950-AGPRCD-SDAG-NEXT:    global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
 ; GFX950-AGPRCD-SDAG-NEXT:    global_store_dwordx4 v0, a[0:3], s[0:1]
 ; GFX950-AGPRCD-SDAG-NEXT:    s_endpgm
-;
 ; GFX950-AGPRCD-GISEL-LABEL: test_smfmac_i32_32x32x32_fp8_fp8:
 ; GFX950-AGPRCD-GISEL:       ; %bb.0: ; %bb
 ; GFX950-AGPRCD-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
@@ -4928,5 +4866,9 @@ attributes #0 = { "amdgpu-flat-work-group-size"="1,256" }
 ;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
 ; GFX942: {{.*}}
 ; GFX942-VGPRCD: {{.*}}
+; GFX942-VGPRCD-GISEL: {{.*}}
+; GFX942-VGPRCD-SDAG: {{.*}}
 ; GFX950: {{.*}}
 ; GFX950-VGPRCD: {{.*}}
+; GFX950-VGPRCD-GISEL: {{.*}}
+; GFX950-VGPRCD-SDAG: {{.*}}

>From f772c314a2f9f3e9ff850704364eb767b99d696f Mon Sep 17 00:00:00 2001
From: Jay Foad <jay.foad at amd.com>
Date: Tue, 16 Sep 2025 11:58:51 +0100
Subject: [PATCH 2/2] [AMDGPU] Use larger immediate values in S_NOP

The S_NOP instruction has an immediate operand which is one less than
the number of cycles to delay for. The maximum value that may be encoded
in this field was increased in GFX8 and again in GFX12.
---
 llvm/lib/Target/AMDGPU/GCNSubtarget.cpp       |   8 +
 llvm/lib/Target/AMDGPU/GCNSubtarget.h         |   4 +
 llvm/lib/Target/AMDGPU/SIInstrInfo.cpp        |   3 +-
 .../GlobalISel/llvm.amdgcn.mfma.gfx90a.ll     |  24 +-
 llvm/test/CodeGen/AMDGPU/acc-ldst.ll          |   9 +-
 .../AMDGPU/agpr-copy-no-free-registers.ll     |  15 +-
 .../CodeGen/AMDGPU/amdgpu-snop-padding.mir    |  33 +-
 .../CodeGen/AMDGPU/llvm.amdgcn.mfma.bf16.ll   |  24 +-
 .../CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx90a.ll | 174 +++------
 .../CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx942.ll | 144 +++----
 .../AMDGPU/llvm.amdgcn.mfma.gfx950.bf16.ll    |  12 +-
 .../CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.ll | 108 ++----
 .../CodeGen/AMDGPU/llvm.amdgcn.mfma.i8.ll     |  12 +-
 llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.ll  | 276 +++++--------
 ....amdgcn.mfma.scale.f32.16x16x128.f8f6f4.ll | 243 ++++--------
 ...m.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.ll | 363 ++++++------------
 .../AMDGPU/llvm.amdgcn.mfma.xf32.gfx942.ll    |  12 +-
 ...vm.amdgcn.sched.group.barrier.iterative.ll |  90 ++---
 .../AMDGPU/llvm.amdgcn.sched.group.barrier.ll |  36 +-
 .../AMDGPU/llvm.amdgcn.smfmac.gfx950.ll       | 123 ++----
 .../CodeGen/AMDGPU/mai-hazards-gfx90a.mir     | 120 ++----
 .../CodeGen/AMDGPU/mai-hazards-gfx942.mir     | 358 ++++++-----------
 .../AMDGPU/mai-hazards-mfma-scale.gfx950.mir  |  27 +-
 llvm/test/CodeGen/AMDGPU/mai-hazards.mir      |  12 +-
 llvm/test/CodeGen/AMDGPU/mfma-cd-select.ll    |  27 +-
 llvm/test/CodeGen/AMDGPU/mfma-loop.ll         |  99 ++---
 .../AMDGPU/mfma-no-register-aliasing.ll       |  60 +--
 .../AMDGPU/neighboring-mfma-padding.mir       |  33 +-
 .../AMDGPU/rewrite-vgpr-mfma-to-agpr-phi.ll   |   6 +-
 .../AMDGPU/rewrite-vgpr-mfma-to-agpr.ll       |  24 +-
 llvm/test/CodeGen/AMDGPU/spill-agpr.ll        |   6 +-
 .../test/CodeGen/AMDGPU/vni8-across-blocks.ll |   3 +-
 32 files changed, 840 insertions(+), 1648 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.cpp b/llvm/lib/Target/AMDGPU/GCNSubtarget.cpp
index 7b94ea3ffbf1f..771e13d37e5e2 100644
--- a/llvm/lib/Target/AMDGPU/GCNSubtarget.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.cpp
@@ -688,6 +688,14 @@ unsigned GCNSubtarget::getNSAThreshold(const MachineFunction &MF) const {
   return NSAThreshold;
 }
 
+unsigned GCNSubtarget::getSNopBits() const {
+  if (getGeneration() >= AMDGPUSubtarget::GFX12)
+    return 7;
+  if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
+    return 4;
+  return 3;
+}
+
 GCNUserSGPRUsageInfo::GCNUserSGPRUsageInfo(const Function &F,
                                            const GCNSubtarget &ST)
     : ST(ST) {
diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
index cbd6f64976d21..00339db9d990e 100644
--- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h
+++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
@@ -1839,6 +1839,10 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
   /// \returns true if the subtarget requires a wait for xcnt before atomic
   /// flat/global stores & rmw.
   bool requiresWaitXCntBeforeAtomicStores() const { return GFX1250Insts; }
+
+  /// \returns the number of significant bits in the immediate field of the
+  /// S_NOP instruction.
+  unsigned getSNopBits() const;
 };
 
 class GCNUserSGPRUsageInfo {
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index 5106478a95b43..ee3e8dc58b468 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -1932,8 +1932,9 @@ void SIInstrInfo::insertNoops(MachineBasicBlock &MBB,
                               MachineBasicBlock::iterator MI,
                               unsigned Quantity) const {
   DebugLoc DL = MBB.findDebugLoc(MI);
+  unsigned MaxSNopCount = 1u << ST.getSNopBits();
   while (Quantity > 0) {
-    unsigned Arg = std::min(Quantity, 8u);
+    unsigned Arg = std::min(Quantity, MaxSNopCount);
     Quantity -= Arg;
     BuildMI(MBB, MI, DL, get(AMDGPU::S_NOP)).addImm(Arg - 1);
   }
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.mfma.gfx90a.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.mfma.gfx90a.ll
index 393a462954003..5720b882f4e73 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.mfma.gfx90a.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.mfma.gfx90a.ll
@@ -58,8 +58,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x4bf16_1k(ptr addrspace(1) %arg) #
 ; GCN-NEXT:    s_nop 1
 ; GCN-NEXT:    v_mfma_f32_32x32x4bf16_1k a[0:31], v[0:1], v[2:3], a[0:31] cbsz:1 abid:2 blgp:3
 ; GCN-NEXT:    v_mov_b32_e32 v0, 0
-; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    s_nop 7
+; GCN-NEXT:    s_nop 15
 ; GCN-NEXT:    s_nop 1
 ; GCN-NEXT:    global_store_dwordx4 v0, a[0:3], s[34:35]
 ; GCN-NEXT:    global_store_dwordx4 v0, a[4:7], s[34:35] offset:16
@@ -109,8 +108,7 @@ define amdgpu_kernel void @test_mfma_f32_16x16x4bf16_1k(ptr addrspace(1) %arg) #
 ; GCN-NEXT:    s_nop 1
 ; GCN-NEXT:    v_mfma_f32_16x16x4bf16_1k a[0:15], v[0:1], v[2:3], a[0:15] cbsz:1 abid:2 blgp:3
 ; GCN-NEXT:    v_mov_b32_e32 v0, 0
-; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    s_nop 1
+; GCN-NEXT:    s_nop 9
 ; GCN-NEXT:    global_store_dwordx4 v0, a[0:3], s[16:17]
 ; GCN-NEXT:    global_store_dwordx4 v0, a[4:7], s[16:17] offset:16
 ; GCN-NEXT:    global_store_dwordx4 v0, a[8:11], s[16:17] offset:32
@@ -185,8 +183,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x8bf16_1k(ptr addrspace(1) %arg) #
 ; GCN-NEXT:    s_nop 1
 ; GCN-NEXT:    v_mfma_f32_32x32x8bf16_1k a[0:15], v[0:1], v[2:3], a[0:15] cbsz:1 abid:2 blgp:3
 ; GCN-NEXT:    v_mov_b32_e32 v0, 0
-; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    s_nop 7
+; GCN-NEXT:    s_nop 15
 ; GCN-NEXT:    s_nop 1
 ; GCN-NEXT:    global_store_dwordx4 v0, a[0:3], s[16:17]
 ; GCN-NEXT:    global_store_dwordx4 v0, a[4:7], s[16:17] offset:16
@@ -220,8 +217,7 @@ define amdgpu_kernel void @test_mfma_f32_16x16x16bf16_1k(ptr addrspace(1) %arg)
 ; GCN-NEXT:    s_nop 1
 ; GCN-NEXT:    v_mfma_f32_16x16x16bf16_1k a[0:3], v[0:1], v[2:3], a[0:3] cbsz:1 abid:2 blgp:3
 ; GCN-NEXT:    v_mov_b32_e32 v0, 0
-; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    s_nop 1
+; GCN-NEXT:    s_nop 9
 ; GCN-NEXT:    global_store_dwordx4 v0, a[0:3], s[6:7]
 ; GCN-NEXT:    s_endpgm
 bb:
@@ -277,8 +273,7 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64(ptr addrspace(1) %arg, doubl
 ; GCN-NEXT:    s_nop 1
 ; GCN-NEXT:    v_mfma_f64_16x16x4f64 a[0:7], v[0:1], v[2:3], a[0:7] cbsz:1 abid:2 blgp:3
 ; GCN-NEXT:    v_mov_b32_e32 v0, 0
-; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    s_nop 7
+; GCN-NEXT:    s_nop 15
 ; GCN-NEXT:    s_nop 0
 ; GCN-NEXT:    global_store_dwordx4 v0, a[0:3], s[8:9]
 ; GCN-NEXT:    global_store_dwordx4 v0, a[4:7], s[8:9] offset:16
@@ -302,8 +297,7 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_splat_imm(ptr addrspace(1) %
 ; GCN-NEXT:    v_mfma_f64_16x16x4f64 a[0:7], v[0:1], v[2:3], 0
 ; GCN-NEXT:    v_mfma_f64_16x16x4f64 a[0:7], v[0:1], v[2:3], a[0:7] cbsz:1 abid:2 blgp:3
 ; GCN-NEXT:    v_mov_b32_e32 v0, 0
-; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    s_nop 7
+; GCN-NEXT:    s_nop 15
 ; GCN-NEXT:    s_nop 0
 ; GCN-NEXT:    global_store_dwordx4 v0, a[0:3], s[0:1]
 ; GCN-NEXT:    global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
@@ -336,8 +330,7 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_imm(ptr addrspace(1) %arg, d
 ; GCN-NEXT:    s_nop 1
 ; GCN-NEXT:    v_mfma_f64_16x16x4f64 a[0:7], v[0:1], v[2:3], a[0:7]
 ; GCN-NEXT:    v_mov_b32_e32 v0, 0
-; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    s_nop 7
+; GCN-NEXT:    s_nop 15
 ; GCN-NEXT:    s_nop 0
 ; GCN-NEXT:    global_store_dwordx4 v0, a[0:3], s[0:1]
 ; GCN-NEXT:    global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
@@ -369,8 +362,7 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_splat_lit(ptr addrspace(1) %
 ; GCN-NEXT:    s_nop 1
 ; GCN-NEXT:    v_mfma_f64_16x16x4f64 a[0:7], v[0:1], v[2:3], a[0:7]
 ; GCN-NEXT:    v_mov_b32_e32 v0, 0
-; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    s_nop 7
+; GCN-NEXT:    s_nop 15
 ; GCN-NEXT:    s_nop 0
 ; GCN-NEXT:    global_store_dwordx4 v0, a[0:3], s[0:1]
 ; GCN-NEXT:    global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
diff --git a/llvm/test/CodeGen/AMDGPU/acc-ldst.ll b/llvm/test/CodeGen/AMDGPU/acc-ldst.ll
index 002ccd6060681..635d2a2d16a76 100644
--- a/llvm/test/CodeGen/AMDGPU/acc-ldst.ll
+++ b/llvm/test/CodeGen/AMDGPU/acc-ldst.ll
@@ -9,8 +9,7 @@ declare i32 @llvm.amdgcn.workitem.id.x()
 ; GCN-COUNT-8: global_load_dwordx4 a[{{[0-9:]+}}], v{{[0-9:]+}}, s[{{[0-9:]+}}]
 ; GCN-NOT:     v_accvgpr_write
 ; GCN:         v_mfma_f32_32x32x1f32
-; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    s_nop 7
+; GCN-NEXT:    s_nop 15
 ; GCN-NEXT:    s_nop 2
 ; GCN-NOT:     v_accvgpr_read
 ; GCN-COUNT-8: global_store_dwordx4 v{{[0-9:]+}}, a[{{[0-9:]+}}], s[{{[0-9:]+}}]
@@ -28,8 +27,7 @@ bb:
 ; GCN:      global_load_dword a{{[0-9]+}}, v{{[0-9:]+}}, s[{{[0-9:]+}}]
 ; GCN-NOT:  v_accvgpr_read
 ; GCN:      v_mfma_f32_32x32x1f32 a[[[N:[0-9]+]]:
-; GCN-NEXT: s_nop 7
-; GCN-NEXT: s_nop 7
+; GCN-NEXT: s_nop 15
 ; GCN-NEXT: s_nop 2
 ; GCN-NOT:  v_accvgpr_read
 ; GCN-NEXT: global_store_dword v{{[0-9:]+}}, a[[N]], s[{{[0-9:]+}}]
@@ -80,8 +78,7 @@ bb:
 ; GCN-COUNT-8:  global_load_dwordx4 v[{{[0-9:]+}}], v{{[0-9:]+}}, s[{{[0-9:]+}}]
 ; GCN-COUNT-32: v_accvgpr_write
 ; GCN:          v_mfma_f32_32x32x1f32
-; GCN-NEXT:     s_nop 7
-; GCN-NEXT:     s_nop 7
+; GCN-NEXT:     s_nop 15
 ; GCN-NEXT:     s_nop 2
 ; GCN-NOT:      v_accvgpr_read
 ; GCN-COUNT-8:  global_store_dwordx4 v{{[0-9:]+}}, a[{{[0-9:]+}}]
diff --git a/llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll b/llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll
index c226dae3d64a9..9e240238c1066 100644
--- a/llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll
+++ b/llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll
@@ -63,8 +63,7 @@ define void @no_free_vgprs_at_agpr_to_agpr_copy(float %v0, float %v1) #0 {
 ; GFX908-NEXT:    v_accvgpr_write_b32 a16, v39
 ; GFX908-NEXT:    s_nop 0
 ; GFX908-NEXT:    v_mfma_f32_16x16x1f32 a[0:15], v33, v32, a[16:31]
-; GFX908-NEXT:    s_nop 7
-; GFX908-NEXT:    s_nop 1
+; GFX908-NEXT:    s_nop 9
 ; GFX908-NEXT:    v_accvgpr_read_b32 v39, a0 ; Reload Reuse
 ; GFX908-NEXT:    v_accvgpr_read_b32 v38, a11 ; Reload Reuse
 ; GFX908-NEXT:    v_accvgpr_read_b32 v37, a12 ; Reload Reuse
@@ -181,8 +180,7 @@ define void @no_free_vgprs_at_agpr_to_agpr_copy(float %v0, float %v1) #0 {
 ; GFX90A-NEXT:    v_accvgpr_mov_b32 a16, a0
 ; GFX90A-NEXT:    s_nop 1
 ; GFX90A-NEXT:    v_mfma_f32_16x16x1f32 a[0:15], v33, v32, a[16:31]
-; GFX90A-NEXT:    s_nop 7
-; GFX90A-NEXT:    s_nop 2
+; GFX90A-NEXT:    s_nop 10
 ; GFX90A-NEXT:    buffer_store_dword a0, off, s[0:3], s32 ; 4-byte Folded Spill
 ; GFX90A-NEXT:    s_nop 0
 ; GFX90A-NEXT:    buffer_store_dword a1, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
@@ -487,8 +485,7 @@ define void @v32_asm_def_use(float %v0, float %v1) #4 {
 ; GFX90A-NEXT:    ; copy
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    v_accvgpr_write_b32 a32, v35 ; Reload Reuse
-; GFX90A-NEXT:    s_nop 7
-; GFX90A-NEXT:    s_nop 1
+; GFX90A-NEXT:    s_nop 9
 ; GFX90A-NEXT:    v_accvgpr_mov_b32 a3, a2
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use a3 v[0:31]
@@ -965,8 +962,7 @@ define void @no_free_vgprs_at_sgpr_to_agpr_copy(float %v0, float %v1) #0 {
 ; GFX908-NEXT:    v_accvgpr_write_b32 a16, v39
 ; GFX908-NEXT:    s_nop 0
 ; GFX908-NEXT:    v_mfma_f32_16x16x1f32 a[0:15], v33, v32, a[16:31]
-; GFX908-NEXT:    s_nop 7
-; GFX908-NEXT:    s_nop 1
+; GFX908-NEXT:    s_nop 9
 ; GFX908-NEXT:    v_accvgpr_read_b32 v39, a0 ; Reload Reuse
 ; GFX908-NEXT:    v_accvgpr_read_b32 v38, a11 ; Reload Reuse
 ; GFX908-NEXT:    v_accvgpr_read_b32 v37, a12 ; Reload Reuse
@@ -1084,8 +1080,7 @@ define void @no_free_vgprs_at_sgpr_to_agpr_copy(float %v0, float %v1) #0 {
 ; GFX90A-NEXT:    v_accvgpr_read_b32 v34, a32 ; Reload Reuse
 ; GFX90A-NEXT:    s_nop 0
 ; GFX90A-NEXT:    v_mfma_f32_16x16x1f32 a[0:15], v33, v32, a[16:31]
-; GFX90A-NEXT:    s_nop 7
-; GFX90A-NEXT:    s_nop 2
+; GFX90A-NEXT:    s_nop 10
 ; GFX90A-NEXT:    buffer_store_dword a0, off, s[0:3], s32 ; 4-byte Folded Spill
 ; GFX90A-NEXT:    s_nop 0
 ; GFX90A-NEXT:    buffer_store_dword a1, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-snop-padding.mir b/llvm/test/CodeGen/AMDGPU/amdgpu-snop-padding.mir
index 22c913496b734..b5c3e3214f125 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-snop-padding.mir
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-snop-padding.mir
@@ -63,52 +63,41 @@ body:             |
   ; GCN16-NEXT:   successors: %bb.1(0x80000000)
   ; GCN16-NEXT:   liveins: $sgpr6, $sgpr10_sgpr11
   ; GCN16-NEXT: {{  $}}
-  ; GCN16-NEXT:   S_NOP 7
-  ; GCN16-NEXT:   S_NOP 7
+  ; GCN16-NEXT:   S_NOP 15
   ; GCN16-NEXT:   S_BRANCH %bb.1
   ; GCN16-NEXT: {{  $}}
   ; GCN16-NEXT: bb.1:
   ; GCN16-NEXT:   successors: %bb.3(0x40000000), %bb.2(0x40000000)
   ; GCN16-NEXT:   liveins: $sgpr6, $sgpr10_sgpr11
   ; GCN16-NEXT: {{  $}}
-  ; GCN16-NEXT:   S_NOP 7
-  ; GCN16-NEXT:   S_NOP 7
+  ; GCN16-NEXT:   S_NOP 15
   ; GCN16-NEXT:   [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 10, implicit $exec
-  ; GCN16-NEXT:   S_NOP 7
-  ; GCN16-NEXT:   S_NOP 7
+  ; GCN16-NEXT:   S_NOP 15
   ; GCN16-NEXT:   S_CBRANCH_EXECZ %bb.3, implicit $exec
   ; GCN16-NEXT: {{  $}}
   ; GCN16-NEXT: bb.2:
   ; GCN16-NEXT:   successors: %bb.3(0x80000000)
   ; GCN16-NEXT:   liveins: $sgpr6, $sgpr10_sgpr11
   ; GCN16-NEXT: {{  $}}
-  ; GCN16-NEXT:   S_NOP 7
-  ; GCN16-NEXT:   S_NOP 7
+  ; GCN16-NEXT:   S_NOP 15
   ; GCN16-NEXT:   SI_SPILL_S32_SAVE killed $sgpr6, %stack.0, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr32
-  ; GCN16-NEXT:   S_NOP 7
-  ; GCN16-NEXT:   S_NOP 7
+  ; GCN16-NEXT:   S_NOP 15
   ; GCN16-NEXT:   S_NOP 0
-  ; GCN16-NEXT:   S_NOP 7
-  ; GCN16-NEXT:   S_NOP 7
+  ; GCN16-NEXT:   S_NOP 15
   ; GCN16-NEXT:   renamable $sgpr6 = SI_SPILL_S32_RESTORE %stack.0, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr32
-  ; GCN16-NEXT:   S_NOP 7
-  ; GCN16-NEXT:   S_NOP 7
+  ; GCN16-NEXT:   S_NOP 15
   ; GCN16-NEXT:   [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 20, implicit $exec
-  ; GCN16-NEXT:   S_NOP 7
-  ; GCN16-NEXT:   S_NOP 7
+  ; GCN16-NEXT:   S_NOP 15
   ; GCN16-NEXT:   S_BRANCH %bb.3
   ; GCN16-NEXT: {{  $}}
   ; GCN16-NEXT: bb.3:
   ; GCN16-NEXT:   liveins: $sgpr10_sgpr11
   ; GCN16-NEXT: {{  $}}
-  ; GCN16-NEXT:   S_NOP 7
-  ; GCN16-NEXT:   S_NOP 7
+  ; GCN16-NEXT:   S_NOP 15
   ; GCN16-NEXT:   $sgpr5 = V_READFIRSTLANE_B32 [[V_MOV_B32_e32_]], implicit $exec
-  ; GCN16-NEXT:   S_NOP 7
-  ; GCN16-NEXT:   S_NOP 7
+  ; GCN16-NEXT:   S_NOP 15
   ; GCN16-NEXT:   S_STORE_DWORD_IMM $sgpr5, $sgpr10_sgpr11, 0, 0
-  ; GCN16-NEXT:   S_NOP 7
-  ; GCN16-NEXT:   S_NOP 7
+  ; GCN16-NEXT:   S_NOP 15
   ; GCN16-NEXT:   SI_RETURN
   bb.0:
     liveins: $sgpr6, $sgpr10_sgpr11
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.bf16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.bf16.ll
index 303ea50dc16cc..12a998ad82cd2 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.bf16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.bf16.ll
@@ -87,8 +87,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x2bf16(ptr addrspace(1) %arg) #0 {
 ; GFX908-NEXT:    v_mov_b32_e32 v0, 2
 ; GFX908-NEXT:    s_nop 1
 ; GFX908-NEXT:    v_mfma_f32_32x32x2bf16 a[0:31], v3, v0, a[0:31] cbsz:1 abid:2 blgp:3
-; GFX908-NEXT:    s_nop 7
-; GFX908-NEXT:    s_nop 7
+; GFX908-NEXT:    s_nop 15
 ; GFX908-NEXT:    s_nop 1
 ; GFX908-NEXT:    v_accvgpr_read_b32 v3, a27
 ; GFX908-NEXT:    v_accvgpr_read_b32 v2, a26
@@ -191,8 +190,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x2bf16(ptr addrspace(1) %arg) #0 {
 ; GFX90A-NEXT:    v_accvgpr_write_b32 a31, s15
 ; GFX90A-NEXT:    s_nop 1
 ; GFX90A-NEXT:    v_mfma_f32_32x32x2bf16 a[0:31], v1, v2, a[0:31] cbsz:1 abid:2 blgp:3
-; GFX90A-NEXT:    s_nop 7
-; GFX90A-NEXT:    s_nop 7
+; GFX90A-NEXT:    s_nop 15
 ; GFX90A-NEXT:    s_nop 2
 ; GFX90A-NEXT:    global_store_dwordx4 v0, a[24:27], s[34:35] offset:96
 ; GFX90A-NEXT:    global_store_dwordx4 v0, a[28:31], s[34:35] offset:112
@@ -256,8 +254,7 @@ define amdgpu_kernel void @test_mfma_f32_16x16x2bf16(ptr addrspace(1) %arg) #0 {
 ; GFX908-NEXT:    v_mov_b32_e32 v1, 2
 ; GFX908-NEXT:    s_nop 1
 ; GFX908-NEXT:    v_mfma_f32_16x16x2bf16 a[0:15], v0, v1, a[0:15] cbsz:1 abid:2 blgp:3
-; GFX908-NEXT:    s_nop 7
-; GFX908-NEXT:    s_nop 1
+; GFX908-NEXT:    s_nop 9
 ; GFX908-NEXT:    v_accvgpr_read_b32 v3, a15
 ; GFX908-NEXT:    v_accvgpr_read_b32 v2, a14
 ; GFX908-NEXT:    v_accvgpr_read_b32 v1, a13
@@ -308,8 +305,7 @@ define amdgpu_kernel void @test_mfma_f32_16x16x2bf16(ptr addrspace(1) %arg) #0 {
 ; GFX90A-NEXT:    s_nop 1
 ; GFX90A-NEXT:    v_mfma_f32_16x16x2bf16 a[0:15], v0, v1, a[0:15] cbsz:1 abid:2 blgp:3
 ; GFX90A-NEXT:    v_mov_b32_e32 v0, 0
-; GFX90A-NEXT:    s_nop 7
-; GFX90A-NEXT:    s_nop 1
+; GFX90A-NEXT:    s_nop 9
 ; GFX90A-NEXT:    global_store_dwordx4 v0, a[12:15], s[16:17] offset:48
 ; GFX90A-NEXT:    global_store_dwordx4 v0, a[8:11], s[16:17] offset:32
 ; GFX90A-NEXT:    global_store_dwordx4 v0, a[4:7], s[16:17] offset:16
@@ -424,8 +420,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x4bf16(ptr addrspace(1) %arg) #0 {
 ; GFX908-NEXT:    v_mov_b32_e32 v1, 2
 ; GFX908-NEXT:    s_nop 1
 ; GFX908-NEXT:    v_mfma_f32_32x32x4bf16 a[0:15], v0, v1, a[0:15] cbsz:1 abid:2 blgp:3
-; GFX908-NEXT:    s_nop 7
-; GFX908-NEXT:    s_nop 7
+; GFX908-NEXT:    s_nop 15
 ; GFX908-NEXT:    s_nop 1
 ; GFX908-NEXT:    v_accvgpr_read_b32 v3, a15
 ; GFX908-NEXT:    v_accvgpr_read_b32 v2, a14
@@ -476,8 +471,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x4bf16(ptr addrspace(1) %arg) #0 {
 ; GFX90A-NEXT:    s_nop 1
 ; GFX90A-NEXT:    v_mfma_f32_32x32x4bf16 a[0:15], v0, v1, a[0:15] cbsz:1 abid:2 blgp:3
 ; GFX90A-NEXT:    v_mov_b32_e32 v0, 0
-; GFX90A-NEXT:    s_nop 7
-; GFX90A-NEXT:    s_nop 7
+; GFX90A-NEXT:    s_nop 15
 ; GFX90A-NEXT:    s_nop 1
 ; GFX90A-NEXT:    global_store_dwordx4 v0, a[12:15], s[16:17] offset:48
 ; GFX90A-NEXT:    global_store_dwordx4 v0, a[8:11], s[16:17] offset:32
@@ -513,8 +507,7 @@ define amdgpu_kernel void @test_mfma_f32_16x16x8bf16(ptr addrspace(1) %arg) #0 {
 ; GFX908-NEXT:    v_accvgpr_write_b32 a3, v5
 ; GFX908-NEXT:    s_nop 0
 ; GFX908-NEXT:    v_mfma_f32_16x16x8bf16 a[0:3], v0, v1, a[0:3] cbsz:1 abid:2 blgp:3
-; GFX908-NEXT:    s_nop 7
-; GFX908-NEXT:    s_nop 1
+; GFX908-NEXT:    s_nop 9
 ; GFX908-NEXT:    v_accvgpr_read_b32 v0, a0
 ; GFX908-NEXT:    v_accvgpr_read_b32 v1, a1
 ; GFX908-NEXT:    v_accvgpr_read_b32 v2, a2
@@ -538,8 +531,7 @@ define amdgpu_kernel void @test_mfma_f32_16x16x8bf16(ptr addrspace(1) %arg) #0 {
 ; GFX90A-NEXT:    v_accvgpr_write_b32 a3, s3
 ; GFX90A-NEXT:    s_nop 1
 ; GFX90A-NEXT:    v_mfma_f32_16x16x8bf16 a[0:3], v0, v2, a[0:3] cbsz:1 abid:2 blgp:3
-; GFX90A-NEXT:    s_nop 7
-; GFX90A-NEXT:    s_nop 2
+; GFX90A-NEXT:    s_nop 10
 ; GFX90A-NEXT:    global_store_dwordx4 v1, a[0:3], s[6:7]
 ; GFX90A-NEXT:    s_endpgm
 bb:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx90a.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx90a.ll
index ff77d5ccbe312..5ab8706f28f5f 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx90a.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx90a.ll
@@ -59,8 +59,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x4bf16_1k(ptr addrspace(1) %arg) #
 ; GFX90A-NEXT:    v_accvgpr_write_b32 a31, s15
 ; GFX90A-NEXT:    s_nop 1
 ; GFX90A-NEXT:    v_mfma_f32_32x32x4bf16_1k a[0:31], v[2:3], v[0:1], a[0:31] cbsz:1 abid:2 blgp:3
-; GFX90A-NEXT:    s_nop 7
-; GFX90A-NEXT:    s_nop 7
+; GFX90A-NEXT:    s_nop 15
 ; GFX90A-NEXT:    s_nop 2
 ; GFX90A-NEXT:    global_store_dwordx4 v1, a[24:27], s[34:35] offset:96
 ; GFX90A-NEXT:    global_store_dwordx4 v1, a[28:31], s[34:35] offset:112
@@ -117,8 +116,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x4bf16_1k(ptr addrspace(1) %arg) #
 ; GFX942-NEXT:    v_accvgpr_write_b32 a31, s15
 ; GFX942-NEXT:    s_nop 1
 ; GFX942-NEXT:    v_mfma_f32_32x32x4_2b_bf16 a[0:31], v[2:3], v[0:1], a[0:31] cbsz:1 abid:2 blgp:3
-; GFX942-NEXT:    s_nop 7
-; GFX942-NEXT:    s_nop 7
+; GFX942-NEXT:    s_nop 15
 ; GFX942-NEXT:    s_nop 2
 ; GFX942-NEXT:    global_store_dwordx4 v1, a[24:27], s[34:35] offset:96
 ; GFX942-NEXT:    global_store_dwordx4 v1, a[28:31], s[34:35] offset:112
@@ -175,8 +173,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x4bf16_1k(ptr addrspace(1) %arg) #
 ; GFX90A-VGPR-NEXT:    v_mov_b32_e32 v31, s15
 ; GFX90A-VGPR-NEXT:    s_nop 1
 ; GFX90A-VGPR-NEXT:    v_mfma_f32_32x32x4bf16_1k v[0:31], v[34:35], v[32:33], v[0:31] cbsz:1 abid:2 blgp:3
-; GFX90A-VGPR-NEXT:    s_nop 7
-; GFX90A-VGPR-NEXT:    s_nop 7
+; GFX90A-VGPR-NEXT:    s_nop 15
 ; GFX90A-VGPR-NEXT:    s_nop 2
 ; GFX90A-VGPR-NEXT:    global_store_dwordx4 v33, v[24:27], s[34:35] offset:96
 ; GFX90A-VGPR-NEXT:    global_store_dwordx4 v33, v[28:31], s[34:35] offset:112
@@ -233,8 +230,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x4bf16_1k(ptr addrspace(1) %arg) #
 ; GFX942-VGPR-NEXT:    v_mov_b32_e32 v31, s15
 ; GFX942-VGPR-NEXT:    s_nop 1
 ; GFX942-VGPR-NEXT:    v_mfma_f32_32x32x4_2b_bf16 v[0:31], v[34:35], v[32:33], v[0:31] cbsz:1 abid:2 blgp:3
-; GFX942-VGPR-NEXT:    s_nop 7
-; GFX942-VGPR-NEXT:    s_nop 7
+; GFX942-VGPR-NEXT:    s_nop 15
 ; GFX942-VGPR-NEXT:    s_nop 2
 ; GFX942-VGPR-NEXT:    global_store_dwordx4 v33, v[24:27], s[34:35] offset:96
 ; GFX942-VGPR-NEXT:    global_store_dwordx4 v33, v[28:31], s[34:35] offset:112
@@ -283,8 +279,7 @@ define amdgpu_kernel void @test_mfma_f32_16x16x4bf16_1k(ptr addrspace(1) %arg) #
 ; GFX90A-NEXT:    v_accvgpr_write_b32 a15, s15
 ; GFX90A-NEXT:    s_nop 1
 ; GFX90A-NEXT:    v_mfma_f32_16x16x4bf16_1k a[0:15], v[2:3], v[0:1], a[0:15] cbsz:1 abid:2 blgp:3
-; GFX90A-NEXT:    s_nop 7
-; GFX90A-NEXT:    s_nop 2
+; GFX90A-NEXT:    s_nop 10
 ; GFX90A-NEXT:    global_store_dwordx4 v1, a[12:15], s[16:17] offset:48
 ; GFX90A-NEXT:    global_store_dwordx4 v1, a[8:11], s[16:17] offset:32
 ; GFX90A-NEXT:    global_store_dwordx4 v1, a[4:7], s[16:17] offset:16
@@ -319,8 +314,7 @@ define amdgpu_kernel void @test_mfma_f32_16x16x4bf16_1k(ptr addrspace(1) %arg) #
 ; GFX942-NEXT:    v_accvgpr_write_b32 a15, s15
 ; GFX942-NEXT:    s_nop 1
 ; GFX942-NEXT:    v_mfma_f32_16x16x4_4b_bf16 a[0:15], v[2:3], v[0:1], a[0:15] cbsz:1 abid:2 blgp:3
-; GFX942-NEXT:    s_nop 7
-; GFX942-NEXT:    s_nop 2
+; GFX942-NEXT:    s_nop 10
 ; GFX942-NEXT:    global_store_dwordx4 v1, a[12:15], s[16:17] offset:48
 ; GFX942-NEXT:    global_store_dwordx4 v1, a[8:11], s[16:17] offset:32
 ; GFX942-NEXT:    global_store_dwordx4 v1, a[4:7], s[16:17] offset:16
@@ -347,8 +341,7 @@ define amdgpu_kernel void @test_mfma_f32_16x16x4bf16_1k(ptr addrspace(1) %arg) #
 ; GFX90A-VGPR-NEXT:    v_pk_mov_b32 v[14:15], s[14:15], s[14:15] op_sel:[0,1]
 ; GFX90A-VGPR-NEXT:    s_nop 1
 ; GFX90A-VGPR-NEXT:    v_mfma_f32_16x16x4bf16_1k v[0:15], v[18:19], v[16:17], v[0:15] cbsz:1 abid:2 blgp:3
-; GFX90A-VGPR-NEXT:    s_nop 7
-; GFX90A-VGPR-NEXT:    s_nop 2
+; GFX90A-VGPR-NEXT:    s_nop 10
 ; GFX90A-VGPR-NEXT:    global_store_dwordx4 v17, v[12:15], s[16:17] offset:48
 ; GFX90A-VGPR-NEXT:    global_store_dwordx4 v17, v[8:11], s[16:17] offset:32
 ; GFX90A-VGPR-NEXT:    global_store_dwordx4 v17, v[4:7], s[16:17] offset:16
@@ -375,8 +368,7 @@ define amdgpu_kernel void @test_mfma_f32_16x16x4bf16_1k(ptr addrspace(1) %arg) #
 ; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[14:15], s[14:15]
 ; GFX942-VGPR-NEXT:    s_nop 1
 ; GFX942-VGPR-NEXT:    v_mfma_f32_16x16x4_4b_bf16 v[0:15], v[18:19], v[16:17], v[0:15] cbsz:1 abid:2 blgp:3
-; GFX942-VGPR-NEXT:    s_nop 7
-; GFX942-VGPR-NEXT:    s_nop 2
+; GFX942-VGPR-NEXT:    s_nop 10
 ; GFX942-VGPR-NEXT:    global_store_dwordx4 v17, v[12:15], s[16:17] offset:48
 ; GFX942-VGPR-NEXT:    global_store_dwordx4 v17, v[8:11], s[16:17] offset:32
 ; GFX942-VGPR-NEXT:    global_store_dwordx4 v17, v[4:7], s[16:17] offset:16
@@ -505,8 +497,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x8bf16_1k(ptr addrspace(1) %arg) #
 ; GFX90A-NEXT:    v_accvgpr_write_b32 a15, s15
 ; GFX90A-NEXT:    s_nop 1
 ; GFX90A-NEXT:    v_mfma_f32_32x32x8bf16_1k a[0:15], v[2:3], v[0:1], a[0:15] cbsz:1 abid:2 blgp:3
-; GFX90A-NEXT:    s_nop 7
-; GFX90A-NEXT:    s_nop 7
+; GFX90A-NEXT:    s_nop 15
 ; GFX90A-NEXT:    s_nop 2
 ; GFX90A-NEXT:    global_store_dwordx4 v1, a[12:15], s[16:17] offset:48
 ; GFX90A-NEXT:    global_store_dwordx4 v1, a[8:11], s[16:17] offset:32
@@ -542,8 +533,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x8bf16_1k(ptr addrspace(1) %arg) #
 ; GFX942-NEXT:    v_accvgpr_write_b32 a15, s15
 ; GFX942-NEXT:    s_nop 1
 ; GFX942-NEXT:    v_mfma_f32_32x32x8_bf16 a[0:15], v[2:3], v[0:1], a[0:15] cbsz:1 abid:2 blgp:3
-; GFX942-NEXT:    s_nop 7
-; GFX942-NEXT:    s_nop 2
+; GFX942-NEXT:    s_nop 10
 ; GFX942-NEXT:    global_store_dwordx4 v1, a[12:15], s[16:17] offset:48
 ; GFX942-NEXT:    global_store_dwordx4 v1, a[8:11], s[16:17] offset:32
 ; GFX942-NEXT:    global_store_dwordx4 v1, a[4:7], s[16:17] offset:16
@@ -570,8 +560,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x8bf16_1k(ptr addrspace(1) %arg) #
 ; GFX90A-VGPR-NEXT:    v_pk_mov_b32 v[14:15], s[14:15], s[14:15] op_sel:[0,1]
 ; GFX90A-VGPR-NEXT:    s_nop 1
 ; GFX90A-VGPR-NEXT:    v_mfma_f32_32x32x8bf16_1k v[0:15], v[18:19], v[16:17], v[0:15] cbsz:1 abid:2 blgp:3
-; GFX90A-VGPR-NEXT:    s_nop 7
-; GFX90A-VGPR-NEXT:    s_nop 7
+; GFX90A-VGPR-NEXT:    s_nop 15
 ; GFX90A-VGPR-NEXT:    s_nop 2
 ; GFX90A-VGPR-NEXT:    global_store_dwordx4 v17, v[12:15], s[16:17] offset:48
 ; GFX90A-VGPR-NEXT:    global_store_dwordx4 v17, v[8:11], s[16:17] offset:32
@@ -599,8 +588,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x8bf16_1k(ptr addrspace(1) %arg) #
 ; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[14:15], s[14:15]
 ; GFX942-VGPR-NEXT:    s_nop 1
 ; GFX942-VGPR-NEXT:    v_mfma_f32_32x32x8_bf16 v[0:15], v[18:19], v[16:17], v[0:15] cbsz:1 abid:2 blgp:3
-; GFX942-VGPR-NEXT:    s_nop 7
-; GFX942-VGPR-NEXT:    s_nop 2
+; GFX942-VGPR-NEXT:    s_nop 10
 ; GFX942-VGPR-NEXT:    global_store_dwordx4 v17, v[12:15], s[16:17] offset:48
 ; GFX942-VGPR-NEXT:    global_store_dwordx4 v17, v[8:11], s[16:17] offset:32
 ; GFX942-VGPR-NEXT:    global_store_dwordx4 v17, v[4:7], s[16:17] offset:16
@@ -632,8 +620,7 @@ define amdgpu_kernel void @test_mfma_f32_16x16x16bf16_1k(ptr addrspace(1) %arg)
 ; GFX90A-NEXT:    v_accvgpr_write_b32 a3, s3
 ; GFX90A-NEXT:    s_nop 1
 ; GFX90A-NEXT:    v_mfma_f32_16x16x16bf16_1k a[0:3], v[2:3], v[0:1], a[0:3] cbsz:1 abid:2 blgp:3
-; GFX90A-NEXT:    s_nop 7
-; GFX90A-NEXT:    s_nop 2
+; GFX90A-NEXT:    s_nop 10
 ; GFX90A-NEXT:    global_store_dwordx4 v1, a[0:3], s[6:7]
 ; GFX90A-NEXT:    s_endpgm
 ;
@@ -671,8 +658,7 @@ define amdgpu_kernel void @test_mfma_f32_16x16x16bf16_1k(ptr addrspace(1) %arg)
 ; GFX90A-VGPR-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
 ; GFX90A-VGPR-NEXT:    s_nop 1
 ; GFX90A-VGPR-NEXT:    v_mfma_f32_16x16x16bf16_1k v[0:3], v[6:7], v[4:5], v[0:3] cbsz:1 abid:2 blgp:3
-; GFX90A-VGPR-NEXT:    s_nop 7
-; GFX90A-VGPR-NEXT:    s_nop 2
+; GFX90A-VGPR-NEXT:    s_nop 10
 ; GFX90A-VGPR-NEXT:    global_store_dwordx4 v5, v[0:3], s[6:7]
 ; GFX90A-VGPR-NEXT:    s_endpgm
 ;
@@ -795,8 +781,7 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64(ptr addrspace(1) %arg, doubl
 ; GFX90A-NEXT:    s_nop 1
 ; GFX90A-NEXT:    v_mfma_f64_16x16x4f64 a[0:7], v[2:3], v[0:1], a[0:7] cbsz:1 abid:2 blgp:3
 ; GFX90A-NEXT:    v_mov_b32_e32 v0, 0
-; GFX90A-NEXT:    s_nop 7
-; GFX90A-NEXT:    s_nop 7
+; GFX90A-NEXT:    s_nop 15
 ; GFX90A-NEXT:    s_nop 0
 ; GFX90A-NEXT:    global_store_dwordx4 v0, a[4:7], s[8:9] offset:16
 ; GFX90A-NEXT:    global_store_dwordx4 v0, a[0:3], s[8:9]
@@ -823,8 +808,7 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64(ptr addrspace(1) %arg, doubl
 ; GFX942-NEXT:    s_nop 1
 ; GFX942-NEXT:    v_mfma_f64_16x16x4_f64 a[0:7], v[2:3], v[0:1], a[0:7] cbsz:1 abid:2 neg:[1,1,0]
 ; GFX942-NEXT:    v_mov_b32_e32 v0, 0
-; GFX942-NEXT:    s_nop 7
-; GFX942-NEXT:    s_nop 7
+; GFX942-NEXT:    s_nop 15
 ; GFX942-NEXT:    s_nop 0
 ; GFX942-NEXT:    global_store_dwordx4 v0, a[4:7], s[8:9] offset:16
 ; GFX942-NEXT:    global_store_dwordx4 v0, a[0:3], s[8:9]
@@ -847,8 +831,7 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64(ptr addrspace(1) %arg, doubl
 ; GFX90A-VGPR-NEXT:    s_nop 1
 ; GFX90A-VGPR-NEXT:    v_mfma_f64_16x16x4f64 v[0:7], v[10:11], v[8:9], v[0:7] cbsz:1 abid:2 blgp:3
 ; GFX90A-VGPR-NEXT:    v_mov_b32_e32 v8, 0
-; GFX90A-VGPR-NEXT:    s_nop 7
-; GFX90A-VGPR-NEXT:    s_nop 7
+; GFX90A-VGPR-NEXT:    s_nop 15
 ; GFX90A-VGPR-NEXT:    s_nop 0
 ; GFX90A-VGPR-NEXT:    global_store_dwordx4 v8, v[4:7], s[8:9] offset:16
 ; GFX90A-VGPR-NEXT:    global_store_dwordx4 v8, v[0:3], s[8:9]
@@ -871,8 +854,7 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64(ptr addrspace(1) %arg, doubl
 ; GFX942-VGPR-NEXT:    s_nop 1
 ; GFX942-VGPR-NEXT:    v_mfma_f64_16x16x4_f64 v[0:7], v[10:11], v[8:9], v[0:7] cbsz:1 abid:2 neg:[1,1,0]
 ; GFX942-VGPR-NEXT:    v_mov_b32_e32 v8, 0
-; GFX942-VGPR-NEXT:    s_nop 7
-; GFX942-VGPR-NEXT:    s_nop 7
+; GFX942-VGPR-NEXT:    s_nop 15
 ; GFX942-VGPR-NEXT:    s_nop 0
 ; GFX942-VGPR-NEXT:    global_store_dwordx4 v8, v[4:7], s[8:9] offset:16
 ; GFX942-VGPR-NEXT:    global_store_dwordx4 v8, v[0:3], s[8:9]
@@ -896,8 +878,7 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_splat_imm_0(ptr addrspace(1)
 ; GFX90A-NEXT:    v_mfma_f64_16x16x4f64 a[0:7], v[0:1], v[2:3], 0
 ; GFX90A-NEXT:    v_mfma_f64_16x16x4f64 a[0:7], v[0:1], v[2:3], a[0:7] cbsz:1 abid:2 blgp:3
 ; GFX90A-NEXT:    v_mov_b32_e32 v0, 0
-; GFX90A-NEXT:    s_nop 7
-; GFX90A-NEXT:    s_nop 7
+; GFX90A-NEXT:    s_nop 15
 ; GFX90A-NEXT:    s_nop 0
 ; GFX90A-NEXT:    global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
 ; GFX90A-NEXT:    global_store_dwordx4 v0, a[0:3], s[0:1]
@@ -914,8 +895,7 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_splat_imm_0(ptr addrspace(1)
 ; GFX942-NEXT:    v_mfma_f64_16x16x4_f64 a[0:7], v[0:1], v[2:3], 0
 ; GFX942-NEXT:    v_mfma_f64_16x16x4_f64 a[0:7], v[0:1], v[2:3], a[0:7] cbsz:1 abid:2 neg:[1,1,0]
 ; GFX942-NEXT:    v_mov_b32_e32 v0, 0
-; GFX942-NEXT:    s_nop 7
-; GFX942-NEXT:    s_nop 7
+; GFX942-NEXT:    s_nop 15
 ; GFX942-NEXT:    s_nop 0
 ; GFX942-NEXT:    global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
 ; GFX942-NEXT:    global_store_dwordx4 v0, a[0:3], s[0:1]
@@ -932,8 +912,7 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_splat_imm_0(ptr addrspace(1)
 ; GFX90A-VGPR-NEXT:    v_mfma_f64_16x16x4f64 v[0:7], v[8:9], v[10:11], 0
 ; GFX90A-VGPR-NEXT:    v_mfma_f64_16x16x4f64 v[0:7], v[8:9], v[10:11], v[0:7] cbsz:1 abid:2 blgp:3
 ; GFX90A-VGPR-NEXT:    v_mov_b32_e32 v8, 0
-; GFX90A-VGPR-NEXT:    s_nop 7
-; GFX90A-VGPR-NEXT:    s_nop 7
+; GFX90A-VGPR-NEXT:    s_nop 15
 ; GFX90A-VGPR-NEXT:    s_nop 0
 ; GFX90A-VGPR-NEXT:    global_store_dwordx4 v8, v[4:7], s[0:1] offset:16
 ; GFX90A-VGPR-NEXT:    global_store_dwordx4 v8, v[0:3], s[0:1]
@@ -950,8 +929,7 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_splat_imm_0(ptr addrspace(1)
 ; GFX942-VGPR-NEXT:    v_mfma_f64_16x16x4_f64 v[0:7], v[8:9], v[10:11], 0
 ; GFX942-VGPR-NEXT:    v_mfma_f64_16x16x4_f64 v[0:7], v[8:9], v[10:11], v[0:7] cbsz:1 abid:2 neg:[1,1,0]
 ; GFX942-VGPR-NEXT:    v_mov_b32_e32 v8, 0
-; GFX942-VGPR-NEXT:    s_nop 7
-; GFX942-VGPR-NEXT:    s_nop 7
+; GFX942-VGPR-NEXT:    s_nop 15
 ; GFX942-VGPR-NEXT:    s_nop 0
 ; GFX942-VGPR-NEXT:    global_store_dwordx4 v8, v[4:7], s[0:1] offset:16
 ; GFX942-VGPR-NEXT:    global_store_dwordx4 v8, v[0:3], s[0:1]
@@ -975,8 +953,7 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_splat_imm_int_neg1(ptr addrs
 ; GFX90A-NEXT:    v_mfma_f64_16x16x4f64 a[0:7], v[0:1], v[2:3], -1
 ; GFX90A-NEXT:    v_mfma_f64_16x16x4f64 a[0:7], v[0:1], v[2:3], a[0:7] cbsz:1 abid:2 blgp:3
 ; GFX90A-NEXT:    v_mov_b32_e32 v0, 0
-; GFX90A-NEXT:    s_nop 7
-; GFX90A-NEXT:    s_nop 7
+; GFX90A-NEXT:    s_nop 15
 ; GFX90A-NEXT:    s_nop 0
 ; GFX90A-NEXT:    global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
 ; GFX90A-NEXT:    global_store_dwordx4 v0, a[0:3], s[0:1]
@@ -993,8 +970,7 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_splat_imm_int_neg1(ptr addrs
 ; GFX942-NEXT:    v_mfma_f64_16x16x4_f64 a[0:7], v[0:1], v[2:3], -1
 ; GFX942-NEXT:    v_mfma_f64_16x16x4_f64 a[0:7], v[0:1], v[2:3], a[0:7] cbsz:1 abid:2 neg:[1,1,0]
 ; GFX942-NEXT:    v_mov_b32_e32 v0, 0
-; GFX942-NEXT:    s_nop 7
-; GFX942-NEXT:    s_nop 7
+; GFX942-NEXT:    s_nop 15
 ; GFX942-NEXT:    s_nop 0
 ; GFX942-NEXT:    global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
 ; GFX942-NEXT:    global_store_dwordx4 v0, a[0:3], s[0:1]
@@ -1011,8 +987,7 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_splat_imm_int_neg1(ptr addrs
 ; GFX90A-VGPR-NEXT:    v_mfma_f64_16x16x4f64 v[0:7], v[8:9], v[10:11], -1
 ; GFX90A-VGPR-NEXT:    v_mfma_f64_16x16x4f64 v[0:7], v[8:9], v[10:11], v[0:7] cbsz:1 abid:2 blgp:3
 ; GFX90A-VGPR-NEXT:    v_mov_b32_e32 v8, 0
-; GFX90A-VGPR-NEXT:    s_nop 7
-; GFX90A-VGPR-NEXT:    s_nop 7
+; GFX90A-VGPR-NEXT:    s_nop 15
 ; GFX90A-VGPR-NEXT:    s_nop 0
 ; GFX90A-VGPR-NEXT:    global_store_dwordx4 v8, v[4:7], s[0:1] offset:16
 ; GFX90A-VGPR-NEXT:    global_store_dwordx4 v8, v[0:3], s[0:1]
@@ -1029,8 +1004,7 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_splat_imm_int_neg1(ptr addrs
 ; GFX942-VGPR-NEXT:    v_mfma_f64_16x16x4_f64 v[0:7], v[8:9], v[10:11], -1
 ; GFX942-VGPR-NEXT:    v_mfma_f64_16x16x4_f64 v[0:7], v[8:9], v[10:11], v[0:7] cbsz:1 abid:2 neg:[1,1,0]
 ; GFX942-VGPR-NEXT:    v_mov_b32_e32 v8, 0
-; GFX942-VGPR-NEXT:    s_nop 7
-; GFX942-VGPR-NEXT:    s_nop 7
+; GFX942-VGPR-NEXT:    s_nop 15
 ; GFX942-VGPR-NEXT:    s_nop 0
 ; GFX942-VGPR-NEXT:    global_store_dwordx4 v8, v[4:7], s[0:1] offset:16
 ; GFX942-VGPR-NEXT:    global_store_dwordx4 v8, v[0:3], s[0:1]
@@ -1054,8 +1028,7 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_splat_imm_1(ptr addrspace(1)
 ; GFX90A-NEXT:    v_mfma_f64_16x16x4f64 a[0:7], v[0:1], v[2:3], 1.0
 ; GFX90A-NEXT:    v_mfma_f64_16x16x4f64 a[0:7], v[0:1], v[2:3], a[0:7] cbsz:1 abid:2 blgp:3
 ; GFX90A-NEXT:    v_mov_b32_e32 v0, 0
-; GFX90A-NEXT:    s_nop 7
-; GFX90A-NEXT:    s_nop 7
+; GFX90A-NEXT:    s_nop 15
 ; GFX90A-NEXT:    s_nop 0
 ; GFX90A-NEXT:    global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
 ; GFX90A-NEXT:    global_store_dwordx4 v0, a[0:3], s[0:1]
@@ -1072,8 +1045,7 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_splat_imm_1(ptr addrspace(1)
 ; GFX942-NEXT:    v_mfma_f64_16x16x4_f64 a[0:7], v[0:1], v[2:3], 1.0
 ; GFX942-NEXT:    v_mfma_f64_16x16x4_f64 a[0:7], v[0:1], v[2:3], a[0:7] cbsz:1 abid:2 neg:[1,1,0]
 ; GFX942-NEXT:    v_mov_b32_e32 v0, 0
-; GFX942-NEXT:    s_nop 7
-; GFX942-NEXT:    s_nop 7
+; GFX942-NEXT:    s_nop 15
 ; GFX942-NEXT:    s_nop 0
 ; GFX942-NEXT:    global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
 ; GFX942-NEXT:    global_store_dwordx4 v0, a[0:3], s[0:1]
@@ -1090,8 +1062,7 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_splat_imm_1(ptr addrspace(1)
 ; GFX90A-VGPR-NEXT:    v_mfma_f64_16x16x4f64 v[0:7], v[8:9], v[10:11], 1.0
 ; GFX90A-VGPR-NEXT:    v_mfma_f64_16x16x4f64 v[0:7], v[8:9], v[10:11], v[0:7] cbsz:1 abid:2 blgp:3
 ; GFX90A-VGPR-NEXT:    v_mov_b32_e32 v8, 0
-; GFX90A-VGPR-NEXT:    s_nop 7
-; GFX90A-VGPR-NEXT:    s_nop 7
+; GFX90A-VGPR-NEXT:    s_nop 15
 ; GFX90A-VGPR-NEXT:    s_nop 0
 ; GFX90A-VGPR-NEXT:    global_store_dwordx4 v8, v[4:7], s[0:1] offset:16
 ; GFX90A-VGPR-NEXT:    global_store_dwordx4 v8, v[0:3], s[0:1]
@@ -1108,8 +1079,7 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_splat_imm_1(ptr addrspace(1)
 ; GFX942-VGPR-NEXT:    v_mfma_f64_16x16x4_f64 v[0:7], v[8:9], v[10:11], 1.0
 ; GFX942-VGPR-NEXT:    v_mfma_f64_16x16x4_f64 v[0:7], v[8:9], v[10:11], v[0:7] cbsz:1 abid:2 neg:[1,1,0]
 ; GFX942-VGPR-NEXT:    v_mov_b32_e32 v8, 0
-; GFX942-VGPR-NEXT:    s_nop 7
-; GFX942-VGPR-NEXT:    s_nop 7
+; GFX942-VGPR-NEXT:    s_nop 15
 ; GFX942-VGPR-NEXT:    s_nop 0
 ; GFX942-VGPR-NEXT:    global_store_dwordx4 v8, v[4:7], s[0:1] offset:16
 ; GFX942-VGPR-NEXT:    global_store_dwordx4 v8, v[0:3], s[0:1]
@@ -1133,8 +1103,7 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_splat_imm_neg1(ptr addrspace
 ; GFX90A-NEXT:    v_mfma_f64_16x16x4f64 a[0:7], v[0:1], v[2:3], -1.0
 ; GFX90A-NEXT:    v_mfma_f64_16x16x4f64 a[0:7], v[0:1], v[2:3], a[0:7] cbsz:1 abid:2 blgp:3
 ; GFX90A-NEXT:    v_mov_b32_e32 v0, 0
-; GFX90A-NEXT:    s_nop 7
-; GFX90A-NEXT:    s_nop 7
+; GFX90A-NEXT:    s_nop 15
 ; GFX90A-NEXT:    s_nop 0
 ; GFX90A-NEXT:    global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
 ; GFX90A-NEXT:    global_store_dwordx4 v0, a[0:3], s[0:1]
@@ -1151,8 +1120,7 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_splat_imm_neg1(ptr addrspace
 ; GFX942-NEXT:    v_mfma_f64_16x16x4_f64 a[0:7], v[0:1], v[2:3], -1.0
 ; GFX942-NEXT:    v_mfma_f64_16x16x4_f64 a[0:7], v[0:1], v[2:3], a[0:7] cbsz:1 abid:2 neg:[1,1,0]
 ; GFX942-NEXT:    v_mov_b32_e32 v0, 0
-; GFX942-NEXT:    s_nop 7
-; GFX942-NEXT:    s_nop 7
+; GFX942-NEXT:    s_nop 15
 ; GFX942-NEXT:    s_nop 0
 ; GFX942-NEXT:    global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
 ; GFX942-NEXT:    global_store_dwordx4 v0, a[0:3], s[0:1]
@@ -1169,8 +1137,7 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_splat_imm_neg1(ptr addrspace
 ; GFX90A-VGPR-NEXT:    v_mfma_f64_16x16x4f64 v[0:7], v[8:9], v[10:11], -1.0
 ; GFX90A-VGPR-NEXT:    v_mfma_f64_16x16x4f64 v[0:7], v[8:9], v[10:11], v[0:7] cbsz:1 abid:2 blgp:3
 ; GFX90A-VGPR-NEXT:    v_mov_b32_e32 v8, 0
-; GFX90A-VGPR-NEXT:    s_nop 7
-; GFX90A-VGPR-NEXT:    s_nop 7
+; GFX90A-VGPR-NEXT:    s_nop 15
 ; GFX90A-VGPR-NEXT:    s_nop 0
 ; GFX90A-VGPR-NEXT:    global_store_dwordx4 v8, v[4:7], s[0:1] offset:16
 ; GFX90A-VGPR-NEXT:    global_store_dwordx4 v8, v[0:3], s[0:1]
@@ -1187,8 +1154,7 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_splat_imm_neg1(ptr addrspace
 ; GFX942-VGPR-NEXT:    v_mfma_f64_16x16x4_f64 v[0:7], v[8:9], v[10:11], -1.0
 ; GFX942-VGPR-NEXT:    v_mfma_f64_16x16x4_f64 v[0:7], v[8:9], v[10:11], v[0:7] cbsz:1 abid:2 neg:[1,1,0]
 ; GFX942-VGPR-NEXT:    v_mov_b32_e32 v8, 0
-; GFX942-VGPR-NEXT:    s_nop 7
-; GFX942-VGPR-NEXT:    s_nop 7
+; GFX942-VGPR-NEXT:    s_nop 15
 ; GFX942-VGPR-NEXT:    s_nop 0
 ; GFX942-VGPR-NEXT:    global_store_dwordx4 v8, v[4:7], s[0:1] offset:16
 ; GFX942-VGPR-NEXT:    global_store_dwordx4 v8, v[0:3], s[0:1]
@@ -1212,8 +1178,7 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_splat_imm_int_64(ptr addrspa
 ; GFX90A-NEXT:    v_mfma_f64_16x16x4f64 a[0:7], v[0:1], v[2:3], 64
 ; GFX90A-NEXT:    v_mfma_f64_16x16x4f64 a[0:7], v[0:1], v[2:3], a[0:7] cbsz:1 abid:2 blgp:3
 ; GFX90A-NEXT:    v_mov_b32_e32 v0, 0
-; GFX90A-NEXT:    s_nop 7
-; GFX90A-NEXT:    s_nop 7
+; GFX90A-NEXT:    s_nop 15
 ; GFX90A-NEXT:    s_nop 0
 ; GFX90A-NEXT:    global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
 ; GFX90A-NEXT:    global_store_dwordx4 v0, a[0:3], s[0:1]
@@ -1230,8 +1195,7 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_splat_imm_int_64(ptr addrspa
 ; GFX942-NEXT:    v_mfma_f64_16x16x4_f64 a[0:7], v[0:1], v[2:3], 64
 ; GFX942-NEXT:    v_mfma_f64_16x16x4_f64 a[0:7], v[0:1], v[2:3], a[0:7] cbsz:1 abid:2 neg:[1,1,0]
 ; GFX942-NEXT:    v_mov_b32_e32 v0, 0
-; GFX942-NEXT:    s_nop 7
-; GFX942-NEXT:    s_nop 7
+; GFX942-NEXT:    s_nop 15
 ; GFX942-NEXT:    s_nop 0
 ; GFX942-NEXT:    global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
 ; GFX942-NEXT:    global_store_dwordx4 v0, a[0:3], s[0:1]
@@ -1248,8 +1212,7 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_splat_imm_int_64(ptr addrspa
 ; GFX90A-VGPR-NEXT:    v_mfma_f64_16x16x4f64 v[0:7], v[8:9], v[10:11], 64
 ; GFX90A-VGPR-NEXT:    v_mfma_f64_16x16x4f64 v[0:7], v[8:9], v[10:11], v[0:7] cbsz:1 abid:2 blgp:3
 ; GFX90A-VGPR-NEXT:    v_mov_b32_e32 v8, 0
-; GFX90A-VGPR-NEXT:    s_nop 7
-; GFX90A-VGPR-NEXT:    s_nop 7
+; GFX90A-VGPR-NEXT:    s_nop 15
 ; GFX90A-VGPR-NEXT:    s_nop 0
 ; GFX90A-VGPR-NEXT:    global_store_dwordx4 v8, v[4:7], s[0:1] offset:16
 ; GFX90A-VGPR-NEXT:    global_store_dwordx4 v8, v[0:3], s[0:1]
@@ -1266,8 +1229,7 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_splat_imm_int_64(ptr addrspa
 ; GFX942-VGPR-NEXT:    v_mfma_f64_16x16x4_f64 v[0:7], v[8:9], v[10:11], 64
 ; GFX942-VGPR-NEXT:    v_mfma_f64_16x16x4_f64 v[0:7], v[8:9], v[10:11], v[0:7] cbsz:1 abid:2 neg:[1,1,0]
 ; GFX942-VGPR-NEXT:    v_mov_b32_e32 v8, 0
-; GFX942-VGPR-NEXT:    s_nop 7
-; GFX942-VGPR-NEXT:    s_nop 7
+; GFX942-VGPR-NEXT:    s_nop 15
 ; GFX942-VGPR-NEXT:    s_nop 0
 ; GFX942-VGPR-NEXT:    global_store_dwordx4 v8, v[4:7], s[0:1] offset:16
 ; GFX942-VGPR-NEXT:    global_store_dwordx4 v8, v[0:3], s[0:1]
@@ -1299,8 +1261,7 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_splat_imm_int_64_in_high_bit
 ; GFX90A-NEXT:    v_mfma_f64_16x16x4f64 a[0:7], v[0:1], v[2:3], a[0:7]
 ; GFX90A-NEXT:    v_mfma_f64_16x16x4f64 a[0:7], v[0:1], v[2:3], a[0:7] cbsz:1 abid:2 blgp:3
 ; GFX90A-NEXT:    v_mov_b32_e32 v0, 0
-; GFX90A-NEXT:    s_nop 7
-; GFX90A-NEXT:    s_nop 7
+; GFX90A-NEXT:    s_nop 15
 ; GFX90A-NEXT:    s_nop 0
 ; GFX90A-NEXT:    global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
 ; GFX90A-NEXT:    global_store_dwordx4 v0, a[0:3], s[0:1]
@@ -1325,8 +1286,7 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_splat_imm_int_64_in_high_bit
 ; GFX942-NEXT:    v_mfma_f64_16x16x4_f64 a[0:7], v[0:1], v[2:3], a[0:7]
 ; GFX942-NEXT:    v_mfma_f64_16x16x4_f64 a[0:7], v[0:1], v[2:3], a[0:7] cbsz:1 abid:2 neg:[1,1,0]
 ; GFX942-NEXT:    v_mov_b32_e32 v0, 0
-; GFX942-NEXT:    s_nop 7
-; GFX942-NEXT:    s_nop 7
+; GFX942-NEXT:    s_nop 15
 ; GFX942-NEXT:    s_nop 0
 ; GFX942-NEXT:    global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
 ; GFX942-NEXT:    global_store_dwordx4 v0, a[0:3], s[0:1]
@@ -1354,8 +1314,7 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_splat_imm_int_64_in_high_bit
 ; GFX90A-VGPR-NEXT:    s_nop 1
 ; GFX90A-VGPR-NEXT:    v_mfma_f64_16x16x4f64 v[2:9], v[10:11], v[12:13], v[2:9]
 ; GFX90A-VGPR-NEXT:    v_mfma_f64_16x16x4f64 v[2:9], v[10:11], v[12:13], v[2:9] cbsz:1 abid:2 blgp:3
-; GFX90A-VGPR-NEXT:    s_nop 7
-; GFX90A-VGPR-NEXT:    s_nop 7
+; GFX90A-VGPR-NEXT:    s_nop 15
 ; GFX90A-VGPR-NEXT:    s_nop 1
 ; GFX90A-VGPR-NEXT:    global_store_dwordx4 v0, v[6:9], s[0:1] offset:16
 ; GFX90A-VGPR-NEXT:    global_store_dwordx4 v0, v[2:5], s[0:1]
@@ -1383,8 +1342,7 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_splat_imm_int_64_in_high_bit
 ; GFX942-VGPR-NEXT:    s_nop 1
 ; GFX942-VGPR-NEXT:    v_mfma_f64_16x16x4_f64 v[2:9], v[10:11], v[12:13], v[2:9]
 ; GFX942-VGPR-NEXT:    v_mfma_f64_16x16x4_f64 v[2:9], v[10:11], v[12:13], v[2:9] cbsz:1 abid:2 neg:[1,1,0]
-; GFX942-VGPR-NEXT:    s_nop 7
-; GFX942-VGPR-NEXT:    s_nop 7
+; GFX942-VGPR-NEXT:    s_nop 15
 ; GFX942-VGPR-NEXT:    s_nop 1
 ; GFX942-VGPR-NEXT:    global_store_dwordx4 v0, v[6:9], s[0:1] offset:16
 ; GFX942-VGPR-NEXT:    global_store_dwordx4 v0, v[2:5], s[0:1]
@@ -1416,8 +1374,7 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_splat_imm_int_64_in_high_and
 ; GFX90A-NEXT:    v_mfma_f64_16x16x4f64 a[0:7], v[0:1], v[2:3], a[0:7]
 ; GFX90A-NEXT:    v_mfma_f64_16x16x4f64 a[0:7], v[0:1], v[2:3], a[0:7] cbsz:1 abid:2 blgp:3
 ; GFX90A-NEXT:    v_mov_b32_e32 v0, 0
-; GFX90A-NEXT:    s_nop 7
-; GFX90A-NEXT:    s_nop 7
+; GFX90A-NEXT:    s_nop 15
 ; GFX90A-NEXT:    s_nop 0
 ; GFX90A-NEXT:    global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
 ; GFX90A-NEXT:    global_store_dwordx4 v0, a[0:3], s[0:1]
@@ -1442,8 +1399,7 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_splat_imm_int_64_in_high_and
 ; GFX942-NEXT:    v_mfma_f64_16x16x4_f64 a[0:7], v[0:1], v[2:3], a[0:7]
 ; GFX942-NEXT:    v_mfma_f64_16x16x4_f64 a[0:7], v[0:1], v[2:3], a[0:7] cbsz:1 abid:2 neg:[1,1,0]
 ; GFX942-NEXT:    v_mov_b32_e32 v0, 0
-; GFX942-NEXT:    s_nop 7
-; GFX942-NEXT:    s_nop 7
+; GFX942-NEXT:    s_nop 15
 ; GFX942-NEXT:    s_nop 0
 ; GFX942-NEXT:    global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
 ; GFX942-NEXT:    global_store_dwordx4 v0, a[0:3], s[0:1]
@@ -1468,8 +1424,7 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_splat_imm_int_64_in_high_and
 ; GFX90A-VGPR-NEXT:    v_mfma_f64_16x16x4f64 v[0:7], v[8:9], v[10:11], v[0:7]
 ; GFX90A-VGPR-NEXT:    v_mfma_f64_16x16x4f64 v[0:7], v[8:9], v[10:11], v[0:7] cbsz:1 abid:2 blgp:3
 ; GFX90A-VGPR-NEXT:    v_mov_b32_e32 v8, 0
-; GFX90A-VGPR-NEXT:    s_nop 7
-; GFX90A-VGPR-NEXT:    s_nop 7
+; GFX90A-VGPR-NEXT:    s_nop 15
 ; GFX90A-VGPR-NEXT:    s_nop 0
 ; GFX90A-VGPR-NEXT:    global_store_dwordx4 v8, v[4:7], s[0:1] offset:16
 ; GFX90A-VGPR-NEXT:    global_store_dwordx4 v8, v[0:3], s[0:1]
@@ -1494,8 +1449,7 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_splat_imm_int_64_in_high_and
 ; GFX942-VGPR-NEXT:    v_mfma_f64_16x16x4_f64 v[0:7], v[8:9], v[10:11], v[0:7]
 ; GFX942-VGPR-NEXT:    v_mfma_f64_16x16x4_f64 v[0:7], v[8:9], v[10:11], v[0:7] cbsz:1 abid:2 neg:[1,1,0]
 ; GFX942-VGPR-NEXT:    v_mov_b32_e32 v8, 0
-; GFX942-VGPR-NEXT:    s_nop 7
-; GFX942-VGPR-NEXT:    s_nop 7
+; GFX942-VGPR-NEXT:    s_nop 15
 ; GFX942-VGPR-NEXT:    s_nop 0
 ; GFX942-VGPR-NEXT:    global_store_dwordx4 v8, v[4:7], s[0:1] offset:16
 ; GFX942-VGPR-NEXT:    global_store_dwordx4 v8, v[0:3], s[0:1]
@@ -1527,8 +1481,7 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_splat_imm_f32_1_in_high_and_
 ; GFX90A-NEXT:    v_mfma_f64_16x16x4f64 a[0:7], v[0:1], v[2:3], a[0:7]
 ; GFX90A-NEXT:    v_mfma_f64_16x16x4f64 a[0:7], v[0:1], v[2:3], a[0:7] cbsz:1 abid:2 blgp:3
 ; GFX90A-NEXT:    v_mov_b32_e32 v0, 0
-; GFX90A-NEXT:    s_nop 7
-; GFX90A-NEXT:    s_nop 7
+; GFX90A-NEXT:    s_nop 15
 ; GFX90A-NEXT:    s_nop 0
 ; GFX90A-NEXT:    global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
 ; GFX90A-NEXT:    global_store_dwordx4 v0, a[0:3], s[0:1]
@@ -1553,8 +1506,7 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_splat_imm_f32_1_in_high_and_
 ; GFX942-NEXT:    v_mfma_f64_16x16x4_f64 a[0:7], v[0:1], v[2:3], a[0:7]
 ; GFX942-NEXT:    v_mfma_f64_16x16x4_f64 a[0:7], v[0:1], v[2:3], a[0:7] cbsz:1 abid:2 neg:[1,1,0]
 ; GFX942-NEXT:    v_mov_b32_e32 v0, 0
-; GFX942-NEXT:    s_nop 7
-; GFX942-NEXT:    s_nop 7
+; GFX942-NEXT:    s_nop 15
 ; GFX942-NEXT:    s_nop 0
 ; GFX942-NEXT:    global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
 ; GFX942-NEXT:    global_store_dwordx4 v0, a[0:3], s[0:1]
@@ -1579,8 +1531,7 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_splat_imm_f32_1_in_high_and_
 ; GFX90A-VGPR-NEXT:    v_mfma_f64_16x16x4f64 v[0:7], v[8:9], v[10:11], v[0:7]
 ; GFX90A-VGPR-NEXT:    v_mfma_f64_16x16x4f64 v[0:7], v[8:9], v[10:11], v[0:7] cbsz:1 abid:2 blgp:3
 ; GFX90A-VGPR-NEXT:    v_mov_b32_e32 v8, 0
-; GFX90A-VGPR-NEXT:    s_nop 7
-; GFX90A-VGPR-NEXT:    s_nop 7
+; GFX90A-VGPR-NEXT:    s_nop 15
 ; GFX90A-VGPR-NEXT:    s_nop 0
 ; GFX90A-VGPR-NEXT:    global_store_dwordx4 v8, v[4:7], s[0:1] offset:16
 ; GFX90A-VGPR-NEXT:    global_store_dwordx4 v8, v[0:3], s[0:1]
@@ -1605,8 +1556,7 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_splat_imm_f32_1_in_high_and_
 ; GFX942-VGPR-NEXT:    v_mfma_f64_16x16x4_f64 v[0:7], v[8:9], v[10:11], v[0:7]
 ; GFX942-VGPR-NEXT:    v_mfma_f64_16x16x4_f64 v[0:7], v[8:9], v[10:11], v[0:7] cbsz:1 abid:2 neg:[1,1,0]
 ; GFX942-VGPR-NEXT:    v_mov_b32_e32 v8, 0
-; GFX942-VGPR-NEXT:    s_nop 7
-; GFX942-VGPR-NEXT:    s_nop 7
+; GFX942-VGPR-NEXT:    s_nop 15
 ; GFX942-VGPR-NEXT:    s_nop 0
 ; GFX942-VGPR-NEXT:    global_store_dwordx4 v8, v[4:7], s[0:1] offset:16
 ; GFX942-VGPR-NEXT:    global_store_dwordx4 v8, v[0:3], s[0:1]
@@ -1639,8 +1589,7 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_imm(ptr addrspace(1) %arg, d
 ; GFX90A-NEXT:    s_nop 1
 ; GFX90A-NEXT:    v_mfma_f64_16x16x4f64 a[0:7], v[2:3], v[0:1], a[0:7]
 ; GFX90A-NEXT:    v_mov_b32_e32 v0, 0
-; GFX90A-NEXT:    s_nop 7
-; GFX90A-NEXT:    s_nop 7
+; GFX90A-NEXT:    s_nop 15
 ; GFX90A-NEXT:    s_nop 0
 ; GFX90A-NEXT:    global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
 ; GFX90A-NEXT:    global_store_dwordx4 v0, a[0:3], s[0:1]
@@ -1666,8 +1615,7 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_imm(ptr addrspace(1) %arg, d
 ; GFX942-NEXT:    s_nop 1
 ; GFX942-NEXT:    v_mfma_f64_16x16x4_f64 a[0:7], v[2:3], v[0:1], a[0:7]
 ; GFX942-NEXT:    v_mov_b32_e32 v0, 0
-; GFX942-NEXT:    s_nop 7
-; GFX942-NEXT:    s_nop 7
+; GFX942-NEXT:    s_nop 15
 ; GFX942-NEXT:    s_nop 0
 ; GFX942-NEXT:    global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
 ; GFX942-NEXT:    global_store_dwordx4 v0, a[0:3], s[0:1]
@@ -1695,8 +1643,7 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_imm(ptr addrspace(1) %arg, d
 ; GFX90A-VGPR-NEXT:    v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1]
 ; GFX90A-VGPR-NEXT:    s_nop 1
 ; GFX90A-VGPR-NEXT:    v_mfma_f64_16x16x4f64 v[2:9], v[12:13], v[10:11], v[2:9]
-; GFX90A-VGPR-NEXT:    s_nop 7
-; GFX90A-VGPR-NEXT:    s_nop 7
+; GFX90A-VGPR-NEXT:    s_nop 15
 ; GFX90A-VGPR-NEXT:    s_nop 1
 ; GFX90A-VGPR-NEXT:    global_store_dwordx4 v0, v[6:9], s[0:1] offset:16
 ; GFX90A-VGPR-NEXT:    global_store_dwordx4 v0, v[2:5], s[0:1]
@@ -1724,8 +1671,7 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_imm(ptr addrspace(1) %arg, d
 ; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[2:3], v[0:1]
 ; GFX942-VGPR-NEXT:    s_nop 1
 ; GFX942-VGPR-NEXT:    v_mfma_f64_16x16x4_f64 v[2:9], v[12:13], v[10:11], v[2:9]
-; GFX942-VGPR-NEXT:    s_nop 7
-; GFX942-VGPR-NEXT:    s_nop 7
+; GFX942-VGPR-NEXT:    s_nop 15
 ; GFX942-VGPR-NEXT:    s_nop 1
 ; GFX942-VGPR-NEXT:    global_store_dwordx4 v0, v[6:9], s[0:1] offset:16
 ; GFX942-VGPR-NEXT:    global_store_dwordx4 v0, v[2:5], s[0:1]
@@ -1757,8 +1703,7 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_splat_lit(ptr addrspace(1) %
 ; GFX90A-NEXT:    s_nop 1
 ; GFX90A-NEXT:    v_mfma_f64_16x16x4f64 a[0:7], v[2:3], v[0:1], a[0:7]
 ; GFX90A-NEXT:    v_mov_b32_e32 v0, 0
-; GFX90A-NEXT:    s_nop 7
-; GFX90A-NEXT:    s_nop 7
+; GFX90A-NEXT:    s_nop 15
 ; GFX90A-NEXT:    s_nop 0
 ; GFX90A-NEXT:    global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
 ; GFX90A-NEXT:    global_store_dwordx4 v0, a[0:3], s[0:1]
@@ -1784,8 +1729,7 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_splat_lit(ptr addrspace(1) %
 ; GFX942-NEXT:    s_nop 1
 ; GFX942-NEXT:    v_mfma_f64_16x16x4_f64 a[0:7], v[2:3], v[0:1], a[0:7]
 ; GFX942-NEXT:    v_mov_b32_e32 v0, 0
-; GFX942-NEXT:    s_nop 7
-; GFX942-NEXT:    s_nop 7
+; GFX942-NEXT:    s_nop 15
 ; GFX942-NEXT:    s_nop 0
 ; GFX942-NEXT:    global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
 ; GFX942-NEXT:    global_store_dwordx4 v0, a[0:3], s[0:1]
@@ -1813,8 +1757,7 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_splat_lit(ptr addrspace(1) %
 ; GFX90A-VGPR-NEXT:    v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1]
 ; GFX90A-VGPR-NEXT:    s_nop 1
 ; GFX90A-VGPR-NEXT:    v_mfma_f64_16x16x4f64 v[2:9], v[12:13], v[10:11], v[2:9]
-; GFX90A-VGPR-NEXT:    s_nop 7
-; GFX90A-VGPR-NEXT:    s_nop 7
+; GFX90A-VGPR-NEXT:    s_nop 15
 ; GFX90A-VGPR-NEXT:    s_nop 1
 ; GFX90A-VGPR-NEXT:    global_store_dwordx4 v0, v[6:9], s[0:1] offset:16
 ; GFX90A-VGPR-NEXT:    global_store_dwordx4 v0, v[2:5], s[0:1]
@@ -1842,8 +1785,7 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_splat_lit(ptr addrspace(1) %
 ; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[2:3], v[0:1]
 ; GFX942-VGPR-NEXT:    s_nop 1
 ; GFX942-VGPR-NEXT:    v_mfma_f64_16x16x4_f64 v[2:9], v[12:13], v[10:11], v[2:9]
-; GFX942-VGPR-NEXT:    s_nop 7
-; GFX942-VGPR-NEXT:    s_nop 7
+; GFX942-VGPR-NEXT:    s_nop 15
 ; GFX942-VGPR-NEXT:    s_nop 1
 ; GFX942-VGPR-NEXT:    global_store_dwordx4 v0, v[6:9], s[0:1] offset:16
 ; GFX942-VGPR-NEXT:    global_store_dwordx4 v0, v[2:5], s[0:1]
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx942.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx942.ll
index 7fd27f5314833..dc4c929124fec 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx942.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx942.ll
@@ -191,8 +191,7 @@ define amdgpu_kernel void @test_mfma_i32_32x32x16i8(ptr addrspace(1) %arg) #0 {
 ; GFX942-SDAG-NEXT:    s_nop 1
 ; GFX942-SDAG-NEXT:    v_mfma_i32_32x32x16_i8 a[0:15], v[0:1], v[2:3], a[0:15] cbsz:1 abid:2 blgp:3
 ; GFX942-SDAG-NEXT:    v_mov_b32_e32 v0, 0
-; GFX942-SDAG-NEXT:    s_nop 7
-; GFX942-SDAG-NEXT:    s_nop 1
+; GFX942-SDAG-NEXT:    s_nop 9
 ; GFX942-SDAG-NEXT:    global_store_dwordx4 v0, a[12:15], s[16:17] offset:48
 ; GFX942-SDAG-NEXT:    global_store_dwordx4 v0, a[8:11], s[16:17] offset:32
 ; GFX942-SDAG-NEXT:    global_store_dwordx4 v0, a[4:7], s[16:17] offset:16
@@ -228,8 +227,7 @@ define amdgpu_kernel void @test_mfma_i32_32x32x16i8(ptr addrspace(1) %arg) #0 {
 ; GFX942-GISEL-NEXT:    s_nop 1
 ; GFX942-GISEL-NEXT:    v_mfma_i32_32x32x16_i8 a[0:15], v[0:1], v[2:3], a[0:15] cbsz:1 abid:2 blgp:3
 ; GFX942-GISEL-NEXT:    v_mov_b32_e32 v0, 0
-; GFX942-GISEL-NEXT:    s_nop 7
-; GFX942-GISEL-NEXT:    s_nop 1
+; GFX942-GISEL-NEXT:    s_nop 9
 ; GFX942-GISEL-NEXT:    global_store_dwordx4 v0, a[0:3], s[16:17]
 ; GFX942-GISEL-NEXT:    global_store_dwordx4 v0, a[4:7], s[16:17] offset:16
 ; GFX942-GISEL-NEXT:    global_store_dwordx4 v0, a[8:11], s[16:17] offset:32
@@ -265,8 +263,7 @@ define amdgpu_kernel void @test_mfma_i32_32x32x16i8(ptr addrspace(1) %arg) #0 {
 ; GFX950-SDAG-NEXT:    s_nop 1
 ; GFX950-SDAG-NEXT:    v_mfma_i32_32x32x16_i8 a[0:15], v[0:1], v[2:3], a[0:15] cbsz:1 abid:2 blgp:3
 ; GFX950-SDAG-NEXT:    v_mov_b32_e32 v0, 0
-; GFX950-SDAG-NEXT:    s_nop 7
-; GFX950-SDAG-NEXT:    s_nop 2
+; GFX950-SDAG-NEXT:    s_nop 10
 ; GFX950-SDAG-NEXT:    global_store_dwordx4 v0, a[12:15], s[16:17] offset:48
 ; GFX950-SDAG-NEXT:    global_store_dwordx4 v0, a[8:11], s[16:17] offset:32
 ; GFX950-SDAG-NEXT:    global_store_dwordx4 v0, a[4:7], s[16:17] offset:16
@@ -302,8 +299,7 @@ define amdgpu_kernel void @test_mfma_i32_32x32x16i8(ptr addrspace(1) %arg) #0 {
 ; GFX950-GISEL-NEXT:    s_nop 1
 ; GFX950-GISEL-NEXT:    v_mfma_i32_32x32x16_i8 a[0:15], v[0:1], v[2:3], a[0:15] cbsz:1 abid:2 blgp:3
 ; GFX950-GISEL-NEXT:    v_mov_b32_e32 v0, 0
-; GFX950-GISEL-NEXT:    s_nop 7
-; GFX950-GISEL-NEXT:    s_nop 2
+; GFX950-GISEL-NEXT:    s_nop 10
 ; GFX950-GISEL-NEXT:    global_store_dwordx4 v0, a[0:3], s[16:17]
 ; GFX950-GISEL-NEXT:    global_store_dwordx4 v0, a[4:7], s[16:17] offset:16
 ; GFX950-GISEL-NEXT:    global_store_dwordx4 v0, a[8:11], s[16:17] offset:32
@@ -870,8 +866,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_bf8_bf8(ptr addrspace(1) %arg)
 ; GFX942-SDAG-NEXT:    s_nop 1
 ; GFX942-SDAG-NEXT:    v_mfma_f32_32x32x16_bf8_bf8 a[0:15], v[0:1], v[2:3], a[0:15] cbsz:1 abid:2 blgp:3
 ; GFX942-SDAG-NEXT:    v_mov_b32_e32 v0, 0
-; GFX942-SDAG-NEXT:    s_nop 7
-; GFX942-SDAG-NEXT:    s_nop 1
+; GFX942-SDAG-NEXT:    s_nop 9
 ; GFX942-SDAG-NEXT:    global_store_dwordx4 v0, a[12:15], s[16:17] offset:48
 ; GFX942-SDAG-NEXT:    global_store_dwordx4 v0, a[8:11], s[16:17] offset:32
 ; GFX942-SDAG-NEXT:    global_store_dwordx4 v0, a[4:7], s[16:17] offset:16
@@ -907,8 +902,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_bf8_bf8(ptr addrspace(1) %arg)
 ; GFX942-GISEL-NEXT:    s_nop 1
 ; GFX942-GISEL-NEXT:    v_mfma_f32_32x32x16_bf8_bf8 a[0:15], v[0:1], v[2:3], a[0:15] cbsz:1 abid:2 blgp:3
 ; GFX942-GISEL-NEXT:    v_mov_b32_e32 v0, 0
-; GFX942-GISEL-NEXT:    s_nop 7
-; GFX942-GISEL-NEXT:    s_nop 1
+; GFX942-GISEL-NEXT:    s_nop 9
 ; GFX942-GISEL-NEXT:    global_store_dwordx4 v0, a[0:3], s[16:17]
 ; GFX942-GISEL-NEXT:    global_store_dwordx4 v0, a[4:7], s[16:17] offset:16
 ; GFX942-GISEL-NEXT:    global_store_dwordx4 v0, a[8:11], s[16:17] offset:32
@@ -944,8 +938,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_bf8_bf8(ptr addrspace(1) %arg)
 ; GFX950-SDAG-NEXT:    s_nop 1
 ; GFX950-SDAG-NEXT:    v_mfma_f32_32x32x16_bf8_bf8 a[0:15], v[0:1], v[2:3], a[0:15] cbsz:1 abid:2 blgp:3
 ; GFX950-SDAG-NEXT:    v_mov_b32_e32 v0, 0
-; GFX950-SDAG-NEXT:    s_nop 7
-; GFX950-SDAG-NEXT:    s_nop 2
+; GFX950-SDAG-NEXT:    s_nop 10
 ; GFX950-SDAG-NEXT:    global_store_dwordx4 v0, a[12:15], s[16:17] offset:48
 ; GFX950-SDAG-NEXT:    global_store_dwordx4 v0, a[8:11], s[16:17] offset:32
 ; GFX950-SDAG-NEXT:    global_store_dwordx4 v0, a[4:7], s[16:17] offset:16
@@ -981,8 +974,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_bf8_bf8(ptr addrspace(1) %arg)
 ; GFX950-GISEL-NEXT:    s_nop 1
 ; GFX950-GISEL-NEXT:    v_mfma_f32_32x32x16_bf8_bf8 a[0:15], v[0:1], v[2:3], a[0:15] cbsz:1 abid:2 blgp:3
 ; GFX950-GISEL-NEXT:    v_mov_b32_e32 v0, 0
-; GFX950-GISEL-NEXT:    s_nop 7
-; GFX950-GISEL-NEXT:    s_nop 2
+; GFX950-GISEL-NEXT:    s_nop 10
 ; GFX950-GISEL-NEXT:    global_store_dwordx4 v0, a[0:3], s[16:17]
 ; GFX950-GISEL-NEXT:    global_store_dwordx4 v0, a[4:7], s[16:17] offset:16
 ; GFX950-GISEL-NEXT:    global_store_dwordx4 v0, a[8:11], s[16:17] offset:32
@@ -1025,8 +1017,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_bf8_fp8(ptr addrspace(1) %arg)
 ; GFX942-SDAG-NEXT:    s_nop 1
 ; GFX942-SDAG-NEXT:    v_mfma_f32_32x32x16_bf8_fp8 a[0:15], v[0:1], v[2:3], a[0:15] cbsz:1 abid:2 blgp:3
 ; GFX942-SDAG-NEXT:    v_mov_b32_e32 v0, 0
-; GFX942-SDAG-NEXT:    s_nop 7
-; GFX942-SDAG-NEXT:    s_nop 1
+; GFX942-SDAG-NEXT:    s_nop 9
 ; GFX942-SDAG-NEXT:    global_store_dwordx4 v0, a[12:15], s[16:17] offset:48
 ; GFX942-SDAG-NEXT:    global_store_dwordx4 v0, a[8:11], s[16:17] offset:32
 ; GFX942-SDAG-NEXT:    global_store_dwordx4 v0, a[4:7], s[16:17] offset:16
@@ -1062,8 +1053,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_bf8_fp8(ptr addrspace(1) %arg)
 ; GFX942-GISEL-NEXT:    s_nop 1
 ; GFX942-GISEL-NEXT:    v_mfma_f32_32x32x16_bf8_fp8 a[0:15], v[0:1], v[2:3], a[0:15] cbsz:1 abid:2 blgp:3
 ; GFX942-GISEL-NEXT:    v_mov_b32_e32 v0, 0
-; GFX942-GISEL-NEXT:    s_nop 7
-; GFX942-GISEL-NEXT:    s_nop 1
+; GFX942-GISEL-NEXT:    s_nop 9
 ; GFX942-GISEL-NEXT:    global_store_dwordx4 v0, a[0:3], s[16:17]
 ; GFX942-GISEL-NEXT:    global_store_dwordx4 v0, a[4:7], s[16:17] offset:16
 ; GFX942-GISEL-NEXT:    global_store_dwordx4 v0, a[8:11], s[16:17] offset:32
@@ -1099,8 +1089,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_bf8_fp8(ptr addrspace(1) %arg)
 ; GFX950-SDAG-NEXT:    s_nop 1
 ; GFX950-SDAG-NEXT:    v_mfma_f32_32x32x16_bf8_fp8 a[0:15], v[0:1], v[2:3], a[0:15] cbsz:1 abid:2 blgp:3
 ; GFX950-SDAG-NEXT:    v_mov_b32_e32 v0, 0
-; GFX950-SDAG-NEXT:    s_nop 7
-; GFX950-SDAG-NEXT:    s_nop 2
+; GFX950-SDAG-NEXT:    s_nop 10
 ; GFX950-SDAG-NEXT:    global_store_dwordx4 v0, a[12:15], s[16:17] offset:48
 ; GFX950-SDAG-NEXT:    global_store_dwordx4 v0, a[8:11], s[16:17] offset:32
 ; GFX950-SDAG-NEXT:    global_store_dwordx4 v0, a[4:7], s[16:17] offset:16
@@ -1136,8 +1125,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_bf8_fp8(ptr addrspace(1) %arg)
 ; GFX950-GISEL-NEXT:    s_nop 1
 ; GFX950-GISEL-NEXT:    v_mfma_f32_32x32x16_bf8_fp8 a[0:15], v[0:1], v[2:3], a[0:15] cbsz:1 abid:2 blgp:3
 ; GFX950-GISEL-NEXT:    v_mov_b32_e32 v0, 0
-; GFX950-GISEL-NEXT:    s_nop 7
-; GFX950-GISEL-NEXT:    s_nop 2
+; GFX950-GISEL-NEXT:    s_nop 10
 ; GFX950-GISEL-NEXT:    global_store_dwordx4 v0, a[0:3], s[16:17]
 ; GFX950-GISEL-NEXT:    global_store_dwordx4 v0, a[4:7], s[16:17] offset:16
 ; GFX950-GISEL-NEXT:    global_store_dwordx4 v0, a[8:11], s[16:17] offset:32
@@ -1180,8 +1168,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_fp8_bf8(ptr addrspace(1) %arg)
 ; GFX942-SDAG-NEXT:    s_nop 1
 ; GFX942-SDAG-NEXT:    v_mfma_f32_32x32x16_fp8_bf8 a[0:15], v[0:1], v[2:3], a[0:15] cbsz:1 abid:2 blgp:3
 ; GFX942-SDAG-NEXT:    v_mov_b32_e32 v0, 0
-; GFX942-SDAG-NEXT:    s_nop 7
-; GFX942-SDAG-NEXT:    s_nop 1
+; GFX942-SDAG-NEXT:    s_nop 9
 ; GFX942-SDAG-NEXT:    global_store_dwordx4 v0, a[12:15], s[16:17] offset:48
 ; GFX942-SDAG-NEXT:    global_store_dwordx4 v0, a[8:11], s[16:17] offset:32
 ; GFX942-SDAG-NEXT:    global_store_dwordx4 v0, a[4:7], s[16:17] offset:16
@@ -1217,8 +1204,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_fp8_bf8(ptr addrspace(1) %arg)
 ; GFX942-GISEL-NEXT:    s_nop 1
 ; GFX942-GISEL-NEXT:    v_mfma_f32_32x32x16_fp8_bf8 a[0:15], v[0:1], v[2:3], a[0:15] cbsz:1 abid:2 blgp:3
 ; GFX942-GISEL-NEXT:    v_mov_b32_e32 v0, 0
-; GFX942-GISEL-NEXT:    s_nop 7
-; GFX942-GISEL-NEXT:    s_nop 1
+; GFX942-GISEL-NEXT:    s_nop 9
 ; GFX942-GISEL-NEXT:    global_store_dwordx4 v0, a[0:3], s[16:17]
 ; GFX942-GISEL-NEXT:    global_store_dwordx4 v0, a[4:7], s[16:17] offset:16
 ; GFX942-GISEL-NEXT:    global_store_dwordx4 v0, a[8:11], s[16:17] offset:32
@@ -1254,8 +1240,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_fp8_bf8(ptr addrspace(1) %arg)
 ; GFX950-SDAG-NEXT:    s_nop 1
 ; GFX950-SDAG-NEXT:    v_mfma_f32_32x32x16_fp8_bf8 a[0:15], v[0:1], v[2:3], a[0:15] cbsz:1 abid:2 blgp:3
 ; GFX950-SDAG-NEXT:    v_mov_b32_e32 v0, 0
-; GFX950-SDAG-NEXT:    s_nop 7
-; GFX950-SDAG-NEXT:    s_nop 2
+; GFX950-SDAG-NEXT:    s_nop 10
 ; GFX950-SDAG-NEXT:    global_store_dwordx4 v0, a[12:15], s[16:17] offset:48
 ; GFX950-SDAG-NEXT:    global_store_dwordx4 v0, a[8:11], s[16:17] offset:32
 ; GFX950-SDAG-NEXT:    global_store_dwordx4 v0, a[4:7], s[16:17] offset:16
@@ -1291,8 +1276,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_fp8_bf8(ptr addrspace(1) %arg)
 ; GFX950-GISEL-NEXT:    s_nop 1
 ; GFX950-GISEL-NEXT:    v_mfma_f32_32x32x16_fp8_bf8 a[0:15], v[0:1], v[2:3], a[0:15] cbsz:1 abid:2 blgp:3
 ; GFX950-GISEL-NEXT:    v_mov_b32_e32 v0, 0
-; GFX950-GISEL-NEXT:    s_nop 7
-; GFX950-GISEL-NEXT:    s_nop 2
+; GFX950-GISEL-NEXT:    s_nop 10
 ; GFX950-GISEL-NEXT:    global_store_dwordx4 v0, a[0:3], s[16:17]
 ; GFX950-GISEL-NEXT:    global_store_dwordx4 v0, a[4:7], s[16:17] offset:16
 ; GFX950-GISEL-NEXT:    global_store_dwordx4 v0, a[8:11], s[16:17] offset:32
@@ -1335,8 +1319,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_fp8_fp8(ptr addrspace(1) %arg)
 ; GFX942-SDAG-NEXT:    s_nop 1
 ; GFX942-SDAG-NEXT:    v_mfma_f32_32x32x16_fp8_fp8 a[0:15], v[0:1], v[2:3], a[0:15] cbsz:1 abid:2 blgp:3
 ; GFX942-SDAG-NEXT:    v_mov_b32_e32 v0, 0
-; GFX942-SDAG-NEXT:    s_nop 7
-; GFX942-SDAG-NEXT:    s_nop 1
+; GFX942-SDAG-NEXT:    s_nop 9
 ; GFX942-SDAG-NEXT:    global_store_dwordx4 v0, a[12:15], s[16:17] offset:48
 ; GFX942-SDAG-NEXT:    global_store_dwordx4 v0, a[8:11], s[16:17] offset:32
 ; GFX942-SDAG-NEXT:    global_store_dwordx4 v0, a[4:7], s[16:17] offset:16
@@ -1372,8 +1355,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_fp8_fp8(ptr addrspace(1) %arg)
 ; GFX942-GISEL-NEXT:    s_nop 1
 ; GFX942-GISEL-NEXT:    v_mfma_f32_32x32x16_fp8_fp8 a[0:15], v[0:1], v[2:3], a[0:15] cbsz:1 abid:2 blgp:3
 ; GFX942-GISEL-NEXT:    v_mov_b32_e32 v0, 0
-; GFX942-GISEL-NEXT:    s_nop 7
-; GFX942-GISEL-NEXT:    s_nop 1
+; GFX942-GISEL-NEXT:    s_nop 9
 ; GFX942-GISEL-NEXT:    global_store_dwordx4 v0, a[0:3], s[16:17]
 ; GFX942-GISEL-NEXT:    global_store_dwordx4 v0, a[4:7], s[16:17] offset:16
 ; GFX942-GISEL-NEXT:    global_store_dwordx4 v0, a[8:11], s[16:17] offset:32
@@ -1409,8 +1391,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_fp8_fp8(ptr addrspace(1) %arg)
 ; GFX950-SDAG-NEXT:    s_nop 1
 ; GFX950-SDAG-NEXT:    v_mfma_f32_32x32x16_fp8_fp8 a[0:15], v[0:1], v[2:3], a[0:15] cbsz:1 abid:2 blgp:3
 ; GFX950-SDAG-NEXT:    v_mov_b32_e32 v0, 0
-; GFX950-SDAG-NEXT:    s_nop 7
-; GFX950-SDAG-NEXT:    s_nop 2
+; GFX950-SDAG-NEXT:    s_nop 10
 ; GFX950-SDAG-NEXT:    global_store_dwordx4 v0, a[12:15], s[16:17] offset:48
 ; GFX950-SDAG-NEXT:    global_store_dwordx4 v0, a[8:11], s[16:17] offset:32
 ; GFX950-SDAG-NEXT:    global_store_dwordx4 v0, a[4:7], s[16:17] offset:16
@@ -1446,8 +1427,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_fp8_fp8(ptr addrspace(1) %arg)
 ; GFX950-GISEL-NEXT:    s_nop 1
 ; GFX950-GISEL-NEXT:    v_mfma_f32_32x32x16_fp8_fp8 a[0:15], v[0:1], v[2:3], a[0:15] cbsz:1 abid:2 blgp:3
 ; GFX950-GISEL-NEXT:    v_mov_b32_e32 v0, 0
-; GFX950-GISEL-NEXT:    s_nop 7
-; GFX950-GISEL-NEXT:    s_nop 2
+; GFX950-GISEL-NEXT:    s_nop 10
 ; GFX950-GISEL-NEXT:    global_store_dwordx4 v0, a[0:3], s[16:17]
 ; GFX950-GISEL-NEXT:    global_store_dwordx4 v0, a[4:7], s[16:17] offset:16
 ; GFX950-GISEL-NEXT:    global_store_dwordx4 v0, a[8:11], s[16:17] offset:32
@@ -1614,8 +1594,7 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x16_f16(ptr addrspace(1) %arg, <
 ; GFX942-SDAG-NEXT:    s_nop 1
 ; GFX942-SDAG-NEXT:    v_smfmac_f32_32x32x16_f16 v[0:15], v[20:21], v[16:19], v22 cbsz:1 abid:2
 ; GFX942-SDAG-NEXT:    v_mov_b32_e32 v16, 0
-; GFX942-SDAG-NEXT:    s_nop 7
-; GFX942-SDAG-NEXT:    s_nop 1
+; GFX942-SDAG-NEXT:    s_nop 9
 ; GFX942-SDAG-NEXT:    global_store_dwordx4 v16, v[12:15], s[16:17] offset:48
 ; GFX942-SDAG-NEXT:    global_store_dwordx4 v16, v[8:11], s[16:17] offset:32
 ; GFX942-SDAG-NEXT:    global_store_dwordx4 v16, v[4:7], s[16:17] offset:16
@@ -1644,8 +1623,7 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x16_f16(ptr addrspace(1) %arg, <
 ; GFX942-GISEL-NEXT:    s_nop 1
 ; GFX942-GISEL-NEXT:    v_smfmac_f32_32x32x16_f16 v[0:15], v[20:21], v[16:19], v22 cbsz:1 abid:2
 ; GFX942-GISEL-NEXT:    v_mov_b32_e32 v16, 0
-; GFX942-GISEL-NEXT:    s_nop 7
-; GFX942-GISEL-NEXT:    s_nop 1
+; GFX942-GISEL-NEXT:    s_nop 9
 ; GFX942-GISEL-NEXT:    global_store_dwordx4 v16, v[0:3], s[16:17]
 ; GFX942-GISEL-NEXT:    global_store_dwordx4 v16, v[4:7], s[16:17] offset:16
 ; GFX942-GISEL-NEXT:    global_store_dwordx4 v16, v[8:11], s[16:17] offset:32
@@ -1674,8 +1652,7 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x16_f16(ptr addrspace(1) %arg, <
 ; GFX950-SDAG-NEXT:    s_nop 1
 ; GFX950-SDAG-NEXT:    v_smfmac_f32_32x32x16_f16 v[0:15], v[20:21], v[16:19], v22 cbsz:1 abid:2
 ; GFX950-SDAG-NEXT:    v_mov_b32_e32 v16, 0
-; GFX950-SDAG-NEXT:    s_nop 7
-; GFX950-SDAG-NEXT:    s_nop 2
+; GFX950-SDAG-NEXT:    s_nop 10
 ; GFX950-SDAG-NEXT:    global_store_dwordx4 v16, v[12:15], s[16:17] offset:48
 ; GFX950-SDAG-NEXT:    global_store_dwordx4 v16, v[8:11], s[16:17] offset:32
 ; GFX950-SDAG-NEXT:    global_store_dwordx4 v16, v[4:7], s[16:17] offset:16
@@ -1704,8 +1681,7 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x16_f16(ptr addrspace(1) %arg, <
 ; GFX950-GISEL-NEXT:    s_nop 1
 ; GFX950-GISEL-NEXT:    v_smfmac_f32_32x32x16_f16 v[0:15], v[20:21], v[16:19], v22 cbsz:1 abid:2
 ; GFX950-GISEL-NEXT:    v_mov_b32_e32 v16, 0
-; GFX950-GISEL-NEXT:    s_nop 7
-; GFX950-GISEL-NEXT:    s_nop 2
+; GFX950-GISEL-NEXT:    s_nop 10
 ; GFX950-GISEL-NEXT:    global_store_dwordx4 v16, v[0:3], s[16:17]
 ; GFX950-GISEL-NEXT:    global_store_dwordx4 v16, v[4:7], s[16:17] offset:16
 ; GFX950-GISEL-NEXT:    global_store_dwordx4 v16, v[8:11], s[16:17] offset:32
@@ -2020,8 +1996,7 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x16_bf16(ptr addrspace(1) %arg,
 ; GFX942-SDAG-NEXT:    s_nop 1
 ; GFX942-SDAG-NEXT:    v_smfmac_f32_32x32x16_bf16 v[0:15], v[20:21], v[16:19], v22 cbsz:1 abid:2
 ; GFX942-SDAG-NEXT:    v_mov_b32_e32 v16, 0
-; GFX942-SDAG-NEXT:    s_nop 7
-; GFX942-SDAG-NEXT:    s_nop 1
+; GFX942-SDAG-NEXT:    s_nop 9
 ; GFX942-SDAG-NEXT:    global_store_dwordx4 v16, v[12:15], s[16:17] offset:48
 ; GFX942-SDAG-NEXT:    global_store_dwordx4 v16, v[8:11], s[16:17] offset:32
 ; GFX942-SDAG-NEXT:    global_store_dwordx4 v16, v[4:7], s[16:17] offset:16
@@ -2050,8 +2025,7 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x16_bf16(ptr addrspace(1) %arg,
 ; GFX942-GISEL-NEXT:    s_nop 1
 ; GFX942-GISEL-NEXT:    v_smfmac_f32_32x32x16_bf16 v[0:15], v[20:21], v[16:19], v22 cbsz:1 abid:2
 ; GFX942-GISEL-NEXT:    v_mov_b32_e32 v16, 0
-; GFX942-GISEL-NEXT:    s_nop 7
-; GFX942-GISEL-NEXT:    s_nop 1
+; GFX942-GISEL-NEXT:    s_nop 9
 ; GFX942-GISEL-NEXT:    global_store_dwordx4 v16, v[0:3], s[16:17]
 ; GFX942-GISEL-NEXT:    global_store_dwordx4 v16, v[4:7], s[16:17] offset:16
 ; GFX942-GISEL-NEXT:    global_store_dwordx4 v16, v[8:11], s[16:17] offset:32
@@ -2080,8 +2054,7 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x16_bf16(ptr addrspace(1) %arg,
 ; GFX950-SDAG-NEXT:    s_nop 1
 ; GFX950-SDAG-NEXT:    v_smfmac_f32_32x32x16_bf16 v[0:15], v[20:21], v[16:19], v22 cbsz:1 abid:2
 ; GFX950-SDAG-NEXT:    v_mov_b32_e32 v16, 0
-; GFX950-SDAG-NEXT:    s_nop 7
-; GFX950-SDAG-NEXT:    s_nop 2
+; GFX950-SDAG-NEXT:    s_nop 10
 ; GFX950-SDAG-NEXT:    global_store_dwordx4 v16, v[12:15], s[16:17] offset:48
 ; GFX950-SDAG-NEXT:    global_store_dwordx4 v16, v[8:11], s[16:17] offset:32
 ; GFX950-SDAG-NEXT:    global_store_dwordx4 v16, v[4:7], s[16:17] offset:16
@@ -2110,8 +2083,7 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x16_bf16(ptr addrspace(1) %arg,
 ; GFX950-GISEL-NEXT:    s_nop 1
 ; GFX950-GISEL-NEXT:    v_smfmac_f32_32x32x16_bf16 v[0:15], v[20:21], v[16:19], v22 cbsz:1 abid:2
 ; GFX950-GISEL-NEXT:    v_mov_b32_e32 v16, 0
-; GFX950-GISEL-NEXT:    s_nop 7
-; GFX950-GISEL-NEXT:    s_nop 2
+; GFX950-GISEL-NEXT:    s_nop 10
 ; GFX950-GISEL-NEXT:    global_store_dwordx4 v16, v[0:3], s[16:17]
 ; GFX950-GISEL-NEXT:    global_store_dwordx4 v16, v[4:7], s[16:17] offset:16
 ; GFX950-GISEL-NEXT:    global_store_dwordx4 v16, v[8:11], s[16:17] offset:32
@@ -2509,8 +2481,7 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_i8(ptr addrspace(1) %arg, <2
 ; GFX942-SDAG-NEXT:    s_nop 1
 ; GFX942-SDAG-NEXT:    v_smfmac_i32_32x32x32_i8 v[0:15], v[22:23], v[18:21], v16 cbsz:1 abid:2
 ; GFX942-SDAG-NEXT:    v_mov_b32_e32 v16, 0
-; GFX942-SDAG-NEXT:    s_nop 7
-; GFX942-SDAG-NEXT:    s_nop 1
+; GFX942-SDAG-NEXT:    s_nop 9
 ; GFX942-SDAG-NEXT:    global_store_dwordx4 v16, v[12:15], s[24:25] offset:48
 ; GFX942-SDAG-NEXT:    global_store_dwordx4 v16, v[8:11], s[24:25] offset:32
 ; GFX942-SDAG-NEXT:    global_store_dwordx4 v16, v[4:7], s[24:25] offset:16
@@ -2543,8 +2514,7 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_i8(ptr addrspace(1) %arg, <2
 ; GFX942-GISEL-NEXT:    s_nop 1
 ; GFX942-GISEL-NEXT:    v_smfmac_i32_32x32x32_i8 v[0:15], v[20:21], v[16:19], v22 cbsz:1 abid:2
 ; GFX942-GISEL-NEXT:    v_mov_b32_e32 v16, 0
-; GFX942-GISEL-NEXT:    s_nop 7
-; GFX942-GISEL-NEXT:    s_nop 1
+; GFX942-GISEL-NEXT:    s_nop 9
 ; GFX942-GISEL-NEXT:    global_store_dwordx4 v16, v[0:3], s[24:25]
 ; GFX942-GISEL-NEXT:    global_store_dwordx4 v16, v[4:7], s[24:25] offset:16
 ; GFX942-GISEL-NEXT:    global_store_dwordx4 v16, v[8:11], s[24:25] offset:32
@@ -2576,8 +2546,7 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_i8(ptr addrspace(1) %arg, <2
 ; GFX950-SDAG-NEXT:    s_nop 1
 ; GFX950-SDAG-NEXT:    v_smfmac_i32_32x32x32_i8 v[0:15], v[22:23], v[18:21], v16 cbsz:1 abid:2
 ; GFX950-SDAG-NEXT:    v_mov_b32_e32 v16, 0
-; GFX950-SDAG-NEXT:    s_nop 7
-; GFX950-SDAG-NEXT:    s_nop 2
+; GFX950-SDAG-NEXT:    s_nop 10
 ; GFX950-SDAG-NEXT:    global_store_dwordx4 v16, v[12:15], s[24:25] offset:48
 ; GFX950-SDAG-NEXT:    global_store_dwordx4 v16, v[8:11], s[24:25] offset:32
 ; GFX950-SDAG-NEXT:    global_store_dwordx4 v16, v[4:7], s[24:25] offset:16
@@ -2610,8 +2579,7 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_i8(ptr addrspace(1) %arg, <2
 ; GFX950-GISEL-NEXT:    s_nop 1
 ; GFX950-GISEL-NEXT:    v_smfmac_i32_32x32x32_i8 v[0:15], v[20:21], v[16:19], v22 cbsz:1 abid:2
 ; GFX950-GISEL-NEXT:    v_mov_b32_e32 v16, 0
-; GFX950-GISEL-NEXT:    s_nop 7
-; GFX950-GISEL-NEXT:    s_nop 2
+; GFX950-GISEL-NEXT:    s_nop 10
 ; GFX950-GISEL-NEXT:    global_store_dwordx4 v16, v[0:3], s[24:25]
 ; GFX950-GISEL-NEXT:    global_store_dwordx4 v16, v[4:7], s[24:25] offset:16
 ; GFX950-GISEL-NEXT:    global_store_dwordx4 v16, v[8:11], s[24:25] offset:32
@@ -3660,8 +3628,7 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_bf8_bf8(ptr addrspace(1) %ar
 ; GFX942-SDAG-NEXT:    s_nop 1
 ; GFX942-SDAG-NEXT:    v_smfmac_f32_32x32x32_bf8_bf8 v[0:15], v[22:23], v[18:21], v16 cbsz:1 abid:2
 ; GFX942-SDAG-NEXT:    v_mov_b32_e32 v16, 0
-; GFX942-SDAG-NEXT:    s_nop 7
-; GFX942-SDAG-NEXT:    s_nop 1
+; GFX942-SDAG-NEXT:    s_nop 9
 ; GFX942-SDAG-NEXT:    global_store_dwordx4 v16, v[12:15], s[24:25] offset:48
 ; GFX942-SDAG-NEXT:    global_store_dwordx4 v16, v[8:11], s[24:25] offset:32
 ; GFX942-SDAG-NEXT:    global_store_dwordx4 v16, v[4:7], s[24:25] offset:16
@@ -3694,8 +3661,7 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_bf8_bf8(ptr addrspace(1) %ar
 ; GFX942-GISEL-NEXT:    s_nop 1
 ; GFX942-GISEL-NEXT:    v_smfmac_f32_32x32x32_bf8_bf8 v[0:15], v[20:21], v[16:19], v22 cbsz:1 abid:2
 ; GFX942-GISEL-NEXT:    v_mov_b32_e32 v16, 0
-; GFX942-GISEL-NEXT:    s_nop 7
-; GFX942-GISEL-NEXT:    s_nop 1
+; GFX942-GISEL-NEXT:    s_nop 9
 ; GFX942-GISEL-NEXT:    global_store_dwordx4 v16, v[0:3], s[24:25]
 ; GFX942-GISEL-NEXT:    global_store_dwordx4 v16, v[4:7], s[24:25] offset:16
 ; GFX942-GISEL-NEXT:    global_store_dwordx4 v16, v[8:11], s[24:25] offset:32
@@ -3727,8 +3693,7 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_bf8_bf8(ptr addrspace(1) %ar
 ; GFX950-SDAG-NEXT:    s_nop 1
 ; GFX950-SDAG-NEXT:    v_smfmac_f32_32x32x32_bf8_bf8 v[0:15], v[22:23], v[18:21], v16 cbsz:1 abid:2
 ; GFX950-SDAG-NEXT:    v_mov_b32_e32 v16, 0
-; GFX950-SDAG-NEXT:    s_nop 7
-; GFX950-SDAG-NEXT:    s_nop 2
+; GFX950-SDAG-NEXT:    s_nop 10
 ; GFX950-SDAG-NEXT:    global_store_dwordx4 v16, v[12:15], s[24:25] offset:48
 ; GFX950-SDAG-NEXT:    global_store_dwordx4 v16, v[8:11], s[24:25] offset:32
 ; GFX950-SDAG-NEXT:    global_store_dwordx4 v16, v[4:7], s[24:25] offset:16
@@ -3761,8 +3726,7 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_bf8_bf8(ptr addrspace(1) %ar
 ; GFX950-GISEL-NEXT:    s_nop 1
 ; GFX950-GISEL-NEXT:    v_smfmac_f32_32x32x32_bf8_bf8 v[0:15], v[20:21], v[16:19], v22 cbsz:1 abid:2
 ; GFX950-GISEL-NEXT:    v_mov_b32_e32 v16, 0
-; GFX950-GISEL-NEXT:    s_nop 7
-; GFX950-GISEL-NEXT:    s_nop 2
+; GFX950-GISEL-NEXT:    s_nop 10
 ; GFX950-GISEL-NEXT:    global_store_dwordx4 v16, v[0:3], s[24:25]
 ; GFX950-GISEL-NEXT:    global_store_dwordx4 v16, v[4:7], s[24:25] offset:16
 ; GFX950-GISEL-NEXT:    global_store_dwordx4 v16, v[8:11], s[24:25] offset:32
@@ -3967,8 +3931,7 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_bf8_fp8(ptr addrspace(1) %ar
 ; GFX942-SDAG-NEXT:    s_nop 1
 ; GFX942-SDAG-NEXT:    v_smfmac_f32_32x32x32_bf8_fp8 v[0:15], v[22:23], v[18:21], v16 cbsz:1 abid:2
 ; GFX942-SDAG-NEXT:    v_mov_b32_e32 v16, 0
-; GFX942-SDAG-NEXT:    s_nop 7
-; GFX942-SDAG-NEXT:    s_nop 1
+; GFX942-SDAG-NEXT:    s_nop 9
 ; GFX942-SDAG-NEXT:    global_store_dwordx4 v16, v[12:15], s[24:25] offset:48
 ; GFX942-SDAG-NEXT:    global_store_dwordx4 v16, v[8:11], s[24:25] offset:32
 ; GFX942-SDAG-NEXT:    global_store_dwordx4 v16, v[4:7], s[24:25] offset:16
@@ -4001,8 +3964,7 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_bf8_fp8(ptr addrspace(1) %ar
 ; GFX942-GISEL-NEXT:    s_nop 1
 ; GFX942-GISEL-NEXT:    v_smfmac_f32_32x32x32_bf8_fp8 v[0:15], v[20:21], v[16:19], v22 cbsz:1 abid:2
 ; GFX942-GISEL-NEXT:    v_mov_b32_e32 v16, 0
-; GFX942-GISEL-NEXT:    s_nop 7
-; GFX942-GISEL-NEXT:    s_nop 1
+; GFX942-GISEL-NEXT:    s_nop 9
 ; GFX942-GISEL-NEXT:    global_store_dwordx4 v16, v[0:3], s[24:25]
 ; GFX942-GISEL-NEXT:    global_store_dwordx4 v16, v[4:7], s[24:25] offset:16
 ; GFX942-GISEL-NEXT:    global_store_dwordx4 v16, v[8:11], s[24:25] offset:32
@@ -4034,8 +3996,7 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_bf8_fp8(ptr addrspace(1) %ar
 ; GFX950-SDAG-NEXT:    s_nop 1
 ; GFX950-SDAG-NEXT:    v_smfmac_f32_32x32x32_bf8_fp8 v[0:15], v[22:23], v[18:21], v16 cbsz:1 abid:2
 ; GFX950-SDAG-NEXT:    v_mov_b32_e32 v16, 0
-; GFX950-SDAG-NEXT:    s_nop 7
-; GFX950-SDAG-NEXT:    s_nop 2
+; GFX950-SDAG-NEXT:    s_nop 10
 ; GFX950-SDAG-NEXT:    global_store_dwordx4 v16, v[12:15], s[24:25] offset:48
 ; GFX950-SDAG-NEXT:    global_store_dwordx4 v16, v[8:11], s[24:25] offset:32
 ; GFX950-SDAG-NEXT:    global_store_dwordx4 v16, v[4:7], s[24:25] offset:16
@@ -4068,8 +4029,7 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_bf8_fp8(ptr addrspace(1) %ar
 ; GFX950-GISEL-NEXT:    s_nop 1
 ; GFX950-GISEL-NEXT:    v_smfmac_f32_32x32x32_bf8_fp8 v[0:15], v[20:21], v[16:19], v22 cbsz:1 abid:2
 ; GFX950-GISEL-NEXT:    v_mov_b32_e32 v16, 0
-; GFX950-GISEL-NEXT:    s_nop 7
-; GFX950-GISEL-NEXT:    s_nop 2
+; GFX950-GISEL-NEXT:    s_nop 10
 ; GFX950-GISEL-NEXT:    global_store_dwordx4 v16, v[0:3], s[24:25]
 ; GFX950-GISEL-NEXT:    global_store_dwordx4 v16, v[4:7], s[24:25] offset:16
 ; GFX950-GISEL-NEXT:    global_store_dwordx4 v16, v[8:11], s[24:25] offset:32
@@ -4274,8 +4234,7 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_fp8_bf8(ptr addrspace(1) %ar
 ; GFX942-SDAG-NEXT:    s_nop 1
 ; GFX942-SDAG-NEXT:    v_smfmac_f32_32x32x32_fp8_bf8 v[0:15], v[22:23], v[18:21], v16 cbsz:1 abid:2
 ; GFX942-SDAG-NEXT:    v_mov_b32_e32 v16, 0
-; GFX942-SDAG-NEXT:    s_nop 7
-; GFX942-SDAG-NEXT:    s_nop 1
+; GFX942-SDAG-NEXT:    s_nop 9
 ; GFX942-SDAG-NEXT:    global_store_dwordx4 v16, v[12:15], s[24:25] offset:48
 ; GFX942-SDAG-NEXT:    global_store_dwordx4 v16, v[8:11], s[24:25] offset:32
 ; GFX942-SDAG-NEXT:    global_store_dwordx4 v16, v[4:7], s[24:25] offset:16
@@ -4308,8 +4267,7 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_fp8_bf8(ptr addrspace(1) %ar
 ; GFX942-GISEL-NEXT:    s_nop 1
 ; GFX942-GISEL-NEXT:    v_smfmac_f32_32x32x32_fp8_bf8 v[0:15], v[20:21], v[16:19], v22 cbsz:1 abid:2
 ; GFX942-GISEL-NEXT:    v_mov_b32_e32 v16, 0
-; GFX942-GISEL-NEXT:    s_nop 7
-; GFX942-GISEL-NEXT:    s_nop 1
+; GFX942-GISEL-NEXT:    s_nop 9
 ; GFX942-GISEL-NEXT:    global_store_dwordx4 v16, v[0:3], s[24:25]
 ; GFX942-GISEL-NEXT:    global_store_dwordx4 v16, v[4:7], s[24:25] offset:16
 ; GFX942-GISEL-NEXT:    global_store_dwordx4 v16, v[8:11], s[24:25] offset:32
@@ -4341,8 +4299,7 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_fp8_bf8(ptr addrspace(1) %ar
 ; GFX950-SDAG-NEXT:    s_nop 1
 ; GFX950-SDAG-NEXT:    v_smfmac_f32_32x32x32_fp8_bf8 v[0:15], v[22:23], v[18:21], v16 cbsz:1 abid:2
 ; GFX950-SDAG-NEXT:    v_mov_b32_e32 v16, 0
-; GFX950-SDAG-NEXT:    s_nop 7
-; GFX950-SDAG-NEXT:    s_nop 2
+; GFX950-SDAG-NEXT:    s_nop 10
 ; GFX950-SDAG-NEXT:    global_store_dwordx4 v16, v[12:15], s[24:25] offset:48
 ; GFX950-SDAG-NEXT:    global_store_dwordx4 v16, v[8:11], s[24:25] offset:32
 ; GFX950-SDAG-NEXT:    global_store_dwordx4 v16, v[4:7], s[24:25] offset:16
@@ -4375,8 +4332,7 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_fp8_bf8(ptr addrspace(1) %ar
 ; GFX950-GISEL-NEXT:    s_nop 1
 ; GFX950-GISEL-NEXT:    v_smfmac_f32_32x32x32_fp8_bf8 v[0:15], v[20:21], v[16:19], v22 cbsz:1 abid:2
 ; GFX950-GISEL-NEXT:    v_mov_b32_e32 v16, 0
-; GFX950-GISEL-NEXT:    s_nop 7
-; GFX950-GISEL-NEXT:    s_nop 2
+; GFX950-GISEL-NEXT:    s_nop 10
 ; GFX950-GISEL-NEXT:    global_store_dwordx4 v16, v[0:3], s[24:25]
 ; GFX950-GISEL-NEXT:    global_store_dwordx4 v16, v[4:7], s[24:25] offset:16
 ; GFX950-GISEL-NEXT:    global_store_dwordx4 v16, v[8:11], s[24:25] offset:32
@@ -4581,8 +4537,7 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_fp8_fp8(ptr addrspace(1) %ar
 ; GFX942-SDAG-NEXT:    s_nop 1
 ; GFX942-SDAG-NEXT:    v_smfmac_f32_32x32x32_fp8_fp8 v[0:15], v[22:23], v[18:21], v16 cbsz:1 abid:2
 ; GFX942-SDAG-NEXT:    v_mov_b32_e32 v16, 0
-; GFX942-SDAG-NEXT:    s_nop 7
-; GFX942-SDAG-NEXT:    s_nop 1
+; GFX942-SDAG-NEXT:    s_nop 9
 ; GFX942-SDAG-NEXT:    global_store_dwordx4 v16, v[12:15], s[24:25] offset:48
 ; GFX942-SDAG-NEXT:    global_store_dwordx4 v16, v[8:11], s[24:25] offset:32
 ; GFX942-SDAG-NEXT:    global_store_dwordx4 v16, v[4:7], s[24:25] offset:16
@@ -4615,8 +4570,7 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_fp8_fp8(ptr addrspace(1) %ar
 ; GFX942-GISEL-NEXT:    s_nop 1
 ; GFX942-GISEL-NEXT:    v_smfmac_f32_32x32x32_fp8_fp8 v[0:15], v[20:21], v[16:19], v22 cbsz:1 abid:2
 ; GFX942-GISEL-NEXT:    v_mov_b32_e32 v16, 0
-; GFX942-GISEL-NEXT:    s_nop 7
-; GFX942-GISEL-NEXT:    s_nop 1
+; GFX942-GISEL-NEXT:    s_nop 9
 ; GFX942-GISEL-NEXT:    global_store_dwordx4 v16, v[0:3], s[24:25]
 ; GFX942-GISEL-NEXT:    global_store_dwordx4 v16, v[4:7], s[24:25] offset:16
 ; GFX942-GISEL-NEXT:    global_store_dwordx4 v16, v[8:11], s[24:25] offset:32
@@ -4648,8 +4602,7 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_fp8_fp8(ptr addrspace(1) %ar
 ; GFX950-SDAG-NEXT:    s_nop 1
 ; GFX950-SDAG-NEXT:    v_smfmac_f32_32x32x32_fp8_fp8 v[0:15], v[22:23], v[18:21], v16 cbsz:1 abid:2
 ; GFX950-SDAG-NEXT:    v_mov_b32_e32 v16, 0
-; GFX950-SDAG-NEXT:    s_nop 7
-; GFX950-SDAG-NEXT:    s_nop 2
+; GFX950-SDAG-NEXT:    s_nop 10
 ; GFX950-SDAG-NEXT:    global_store_dwordx4 v16, v[12:15], s[24:25] offset:48
 ; GFX950-SDAG-NEXT:    global_store_dwordx4 v16, v[8:11], s[24:25] offset:32
 ; GFX950-SDAG-NEXT:    global_store_dwordx4 v16, v[4:7], s[24:25] offset:16
@@ -4682,8 +4635,7 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_fp8_fp8(ptr addrspace(1) %ar
 ; GFX950-GISEL-NEXT:    s_nop 1
 ; GFX950-GISEL-NEXT:    v_smfmac_f32_32x32x32_fp8_fp8 v[0:15], v[20:21], v[16:19], v22 cbsz:1 abid:2
 ; GFX950-GISEL-NEXT:    v_mov_b32_e32 v16, 0
-; GFX950-GISEL-NEXT:    s_nop 7
-; GFX950-GISEL-NEXT:    s_nop 2
+; GFX950-GISEL-NEXT:    s_nop 10
 ; GFX950-GISEL-NEXT:    global_store_dwordx4 v16, v[0:3], s[24:25]
 ; GFX950-GISEL-NEXT:    global_store_dwordx4 v16, v[4:7], s[24:25] offset:16
 ; GFX950-GISEL-NEXT:    global_store_dwordx4 v16, v[8:11], s[24:25] offset:32
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.bf16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.bf16.ll
index 284ced1727b7e..033a35f69a0bd 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.bf16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.bf16.ll
@@ -178,8 +178,7 @@ define <16 x float> @test_mfma_f32_32x32x16_bf16__mac(<8 x bfloat> %arg0, <8 x b
 ; GCN-NEXT:    v_accvgpr_write_b32 a15, v23
 ; GCN-NEXT:    s_nop 1
 ; GCN-NEXT:    v_mfma_f32_32x32x16_bf16 a[0:15], v[0:3], v[4:7], a[0:15]
-; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    s_nop 3
+; GCN-NEXT:    s_nop 11
 ; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
 ; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
 ; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
@@ -223,8 +222,7 @@ define <16 x float> @test_mfma_f32_32x32x16_bf16__mac__flags(<8 x bfloat> %arg0,
 ; GCN-NEXT:    v_accvgpr_write_b32 a15, v23
 ; GCN-NEXT:    s_nop 1
 ; GCN-NEXT:    v_mfma_f32_32x32x16_bf16 a[0:15], v[0:3], v[4:7], a[0:15] cbsz:1 abid:1 blgp:1
-; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    s_nop 3
+; GCN-NEXT:    s_nop 11
 ; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
 ; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
 ; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
@@ -394,8 +392,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_bf16__vgprcd_mac(<8 x bfloat>
 ; GCN-NEXT:    s_nop 1
 ; GCN-NEXT:    v_mfma_f32_32x32x16_bf16 v[0:15], v[16:19], v[20:23], v[0:15]
 ; GCN-NEXT:    v_mov_b32_e32 v16, 0
-; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    s_nop 2
+; GCN-NEXT:    s_nop 10
 ; GCN-NEXT:    global_store_dwordx4 v16, v[12:15], s[0:1] offset:48
 ; GCN-NEXT:    global_store_dwordx4 v16, v[8:11], s[0:1] offset:32
 ; GCN-NEXT:    global_store_dwordx4 v16, v[4:7], s[0:1] offset:16
@@ -428,8 +425,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_bf16__vgprcd_mac_flags(<8 x bf
 ; GCN-NEXT:    s_nop 1
 ; GCN-NEXT:    v_mfma_f32_32x32x16_bf16 v[0:15], v[16:19], v[20:23], v[0:15] cbsz:3 abid:2 blgp:1
 ; GCN-NEXT:    v_mov_b32_e32 v16, 0
-; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    s_nop 2
+; GCN-NEXT:    s_nop 10
 ; GCN-NEXT:    global_store_dwordx4 v16, v[12:15], s[0:1] offset:48
 ; GCN-NEXT:    global_store_dwordx4 v16, v[8:11], s[0:1] offset:32
 ; GCN-NEXT:    global_store_dwordx4 v16, v[4:7], s[0:1] offset:16
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.ll
index 078a043b94604..753206206180a 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.ll
@@ -479,8 +479,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16(<8 x half> %arg0, <8 x hal
 ; GISEL-NEXT:    v_mov_b64_e32 v[0:1], s[12:13]
 ; GISEL-NEXT:    v_mov_b64_e32 v[2:3], s[14:15]
 ; GISEL-NEXT:    v_mov_b64_e32 v[10:11], s[10:11]
-; GISEL-NEXT:    s_nop 7
-; GISEL-NEXT:    s_nop 0
+; GISEL-NEXT:    s_nop 8
 ; GISEL-NEXT:    global_store_dwordx4 v[12:13], a[16:19], off sc0 sc1
 ; GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GISEL-NEXT:    global_store_dwordx4 v[14:15], a[20:23], off sc0 sc1
@@ -598,8 +597,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16(<8 x half> %arg0, <8 x hal
 ; VGPRRC-NEXT:    v_mov_b32_e32 v50, s18
 ; VGPRRC-NEXT:    v_mov_b32_e32 v51, s19
 ; VGPRRC-NEXT:    v_mov_b64_e32 v[46:47], 0
-; VGPRRC-NEXT:    s_nop 7
-; VGPRRC-NEXT:    s_nop 0
+; VGPRRC-NEXT:    s_nop 8
 ; VGPRRC-NEXT:    global_store_dwordx4 v[40:41], v[28:31], off sc0 sc1
 ; VGPRRC-NEXT:    s_waitcnt vmcnt(0)
 ; VGPRRC-NEXT:    global_store_dwordx4 v[42:43], v[24:27], off sc0 sc1
@@ -864,8 +862,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__flags(<8 x half> %arg0, <
 ; GISEL-NEXT:    v_mov_b64_e32 v[0:1], s[12:13]
 ; GISEL-NEXT:    v_mov_b64_e32 v[2:3], s[14:15]
 ; GISEL-NEXT:    v_mov_b64_e32 v[10:11], s[10:11]
-; GISEL-NEXT:    s_nop 7
-; GISEL-NEXT:    s_nop 0
+; GISEL-NEXT:    s_nop 8
 ; GISEL-NEXT:    global_store_dwordx4 v[12:13], a[16:19], off sc0 sc1
 ; GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GISEL-NEXT:    global_store_dwordx4 v[14:15], a[20:23], off sc0 sc1
@@ -983,8 +980,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__flags(<8 x half> %arg0, <
 ; VGPRRC-NEXT:    v_mov_b32_e32 v50, s18
 ; VGPRRC-NEXT:    v_mov_b32_e32 v51, s19
 ; VGPRRC-NEXT:    v_mov_b64_e32 v[46:47], 0
-; VGPRRC-NEXT:    s_nop 7
-; VGPRRC-NEXT:    s_nop 0
+; VGPRRC-NEXT:    s_nop 8
 ; VGPRRC-NEXT:    global_store_dwordx4 v[40:41], v[28:31], off sc0 sc1
 ; VGPRRC-NEXT:    s_waitcnt vmcnt(0)
 ; VGPRRC-NEXT:    global_store_dwordx4 v[42:43], v[24:27], off sc0 sc1
@@ -1169,8 +1165,7 @@ define <16 x float> @test_mfma_f32_32x32x16_f16__mac(<8 x half> %arg0, <8 x half
 ; GCN-NEXT:    v_accvgpr_write_b32 a15, v23
 ; GCN-NEXT:    s_nop 1
 ; GCN-NEXT:    v_mfma_f32_32x32x16_f16 a[0:15], v[0:3], v[4:7], a[0:15]
-; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    s_nop 3
+; GCN-NEXT:    s_nop 11
 ; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
 ; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
 ; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
@@ -1210,8 +1205,7 @@ define <16 x float> @test_mfma_f32_32x32x16_f16__mac(<8 x half> %arg0, <8 x half
 ; HEURRC-NEXT:    v_accvgpr_write_b32 a15, v23
 ; HEURRC-NEXT:    s_nop 1
 ; HEURRC-NEXT:    v_mfma_f32_32x32x16_f16 a[0:15], v[0:3], v[4:7], a[0:15]
-; HEURRC-NEXT:    s_nop 7
-; HEURRC-NEXT:    s_nop 3
+; HEURRC-NEXT:    s_nop 11
 ; HEURRC-NEXT:    v_accvgpr_read_b32 v0, a0
 ; HEURRC-NEXT:    v_accvgpr_read_b32 v1, a1
 ; HEURRC-NEXT:    v_accvgpr_read_b32 v2, a2
@@ -1234,8 +1228,7 @@ define <16 x float> @test_mfma_f32_32x32x16_f16__mac(<8 x half> %arg0, <8 x half
 ; VGPRRC:       ; %bb.0:
 ; VGPRRC-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VGPRRC-NEXT:    v_mfma_f32_32x32x16_f16 v[8:23], v[0:3], v[4:7], v[8:23]
-; VGPRRC-NEXT:    s_nop 7
-; VGPRRC-NEXT:    s_nop 3
+; VGPRRC-NEXT:    s_nop 11
 ; VGPRRC-NEXT:    v_mov_b32_e32 v0, v8
 ; VGPRRC-NEXT:    v_mov_b32_e32 v1, v9
 ; VGPRRC-NEXT:    v_mov_b32_e32 v2, v10
@@ -1342,8 +1335,7 @@ define <16 x float> @test_mfma_f32_32x32x16_f16__mac__flags(<8 x half> %arg0, <8
 ; GCN-NEXT:    v_accvgpr_write_b32 a15, v23
 ; GCN-NEXT:    s_nop 1
 ; GCN-NEXT:    v_mfma_f32_32x32x16_f16 a[0:15], v[0:3], v[4:7], a[0:15] cbsz:1 abid:1 blgp:1
-; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    s_nop 3
+; GCN-NEXT:    s_nop 11
 ; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
 ; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
 ; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
@@ -1383,8 +1375,7 @@ define <16 x float> @test_mfma_f32_32x32x16_f16__mac__flags(<8 x half> %arg0, <8
 ; HEURRC-NEXT:    v_accvgpr_write_b32 a15, v23
 ; HEURRC-NEXT:    s_nop 1
 ; HEURRC-NEXT:    v_mfma_f32_32x32x16_f16 a[0:15], v[0:3], v[4:7], a[0:15] cbsz:1 abid:1 blgp:1
-; HEURRC-NEXT:    s_nop 7
-; HEURRC-NEXT:    s_nop 3
+; HEURRC-NEXT:    s_nop 11
 ; HEURRC-NEXT:    v_accvgpr_read_b32 v0, a0
 ; HEURRC-NEXT:    v_accvgpr_read_b32 v1, a1
 ; HEURRC-NEXT:    v_accvgpr_read_b32 v2, a2
@@ -1407,8 +1398,7 @@ define <16 x float> @test_mfma_f32_32x32x16_f16__mac__flags(<8 x half> %arg0, <8
 ; VGPRRC:       ; %bb.0:
 ; VGPRRC-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VGPRRC-NEXT:    v_mfma_f32_32x32x16_f16 v[8:23], v[0:3], v[4:7], v[8:23] cbsz:1 abid:1 blgp:1
-; VGPRRC-NEXT:    s_nop 7
-; VGPRRC-NEXT:    s_nop 3
+; VGPRRC-NEXT:    s_nop 11
 ; VGPRRC-NEXT:    v_mov_b32_e32 v0, v8
 ; VGPRRC-NEXT:    v_mov_b32_e32 v1, v9
 ; VGPRRC-NEXT:    v_mov_b32_e32 v2, v10
@@ -2199,8 +2189,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__vgprcd_mac(<8 x half> %ar
 ; SDAG-NEXT:    s_nop 1
 ; SDAG-NEXT:    v_mfma_f32_32x32x16_f16 v[0:15], v[16:19], v[20:23], v[0:15]
 ; SDAG-NEXT:    v_mov_b32_e32 v16, 0
-; SDAG-NEXT:    s_nop 7
-; SDAG-NEXT:    s_nop 2
+; SDAG-NEXT:    s_nop 10
 ; SDAG-NEXT:    global_store_dwordx4 v16, v[12:15], s[0:1] offset:48
 ; SDAG-NEXT:    global_store_dwordx4 v16, v[8:11], s[0:1] offset:32
 ; SDAG-NEXT:    global_store_dwordx4 v16, v[4:7], s[0:1] offset:16
@@ -2228,8 +2217,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__vgprcd_mac(<8 x half> %ar
 ; GISEL-NEXT:    s_nop 1
 ; GISEL-NEXT:    v_mfma_f32_32x32x16_f16 v[0:15], v[16:19], v[20:23], v[0:15]
 ; GISEL-NEXT:    v_mov_b32_e32 v16, 0
-; GISEL-NEXT:    s_nop 7
-; GISEL-NEXT:    s_nop 2
+; GISEL-NEXT:    s_nop 10
 ; GISEL-NEXT:    global_store_dwordx4 v16, v[0:3], s[0:1]
 ; GISEL-NEXT:    global_store_dwordx4 v16, v[4:7], s[0:1] offset:16
 ; GISEL-NEXT:    global_store_dwordx4 v16, v[8:11], s[0:1] offset:32
@@ -2257,8 +2245,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__vgprcd_mac(<8 x half> %ar
 ; HEURRC-NEXT:    s_nop 1
 ; HEURRC-NEXT:    v_mfma_f32_32x32x16_f16 v[0:15], v[16:19], v[20:23], v[0:15]
 ; HEURRC-NEXT:    v_mov_b32_e32 v16, 0
-; HEURRC-NEXT:    s_nop 7
-; HEURRC-NEXT:    s_nop 2
+; HEURRC-NEXT:    s_nop 10
 ; HEURRC-NEXT:    global_store_dwordx4 v16, v[12:15], s[0:1] offset:48
 ; HEURRC-NEXT:    global_store_dwordx4 v16, v[8:11], s[0:1] offset:32
 ; HEURRC-NEXT:    global_store_dwordx4 v16, v[4:7], s[0:1] offset:16
@@ -2286,8 +2273,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__vgprcd_mac(<8 x half> %ar
 ; VGPRRC-NEXT:    s_nop 1
 ; VGPRRC-NEXT:    v_mfma_f32_32x32x16_f16 v[0:15], v[16:19], v[20:23], v[0:15]
 ; VGPRRC-NEXT:    v_mov_b32_e32 v16, 0
-; VGPRRC-NEXT:    s_nop 7
-; VGPRRC-NEXT:    s_nop 2
+; VGPRRC-NEXT:    s_nop 10
 ; VGPRRC-NEXT:    global_store_dwordx4 v16, v[12:15], s[0:1] offset:48
 ; VGPRRC-NEXT:    global_store_dwordx4 v16, v[8:11], s[0:1] offset:32
 ; VGPRRC-NEXT:    global_store_dwordx4 v16, v[4:7], s[0:1] offset:16
@@ -2384,8 +2370,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__vgprcd_mac_flags(<8 x hal
 ; SDAG-NEXT:    s_nop 1
 ; SDAG-NEXT:    v_mfma_f32_32x32x16_f16 v[0:15], v[16:19], v[20:23], v[0:15] cbsz:3 abid:2 blgp:1
 ; SDAG-NEXT:    v_mov_b32_e32 v16, 0
-; SDAG-NEXT:    s_nop 7
-; SDAG-NEXT:    s_nop 2
+; SDAG-NEXT:    s_nop 10
 ; SDAG-NEXT:    global_store_dwordx4 v16, v[12:15], s[0:1] offset:48
 ; SDAG-NEXT:    global_store_dwordx4 v16, v[8:11], s[0:1] offset:32
 ; SDAG-NEXT:    global_store_dwordx4 v16, v[4:7], s[0:1] offset:16
@@ -2413,8 +2398,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__vgprcd_mac_flags(<8 x hal
 ; GISEL-NEXT:    s_nop 1
 ; GISEL-NEXT:    v_mfma_f32_32x32x16_f16 v[0:15], v[16:19], v[20:23], v[0:15] cbsz:3 abid:2 blgp:1
 ; GISEL-NEXT:    v_mov_b32_e32 v16, 0
-; GISEL-NEXT:    s_nop 7
-; GISEL-NEXT:    s_nop 2
+; GISEL-NEXT:    s_nop 10
 ; GISEL-NEXT:    global_store_dwordx4 v16, v[0:3], s[0:1]
 ; GISEL-NEXT:    global_store_dwordx4 v16, v[4:7], s[0:1] offset:16
 ; GISEL-NEXT:    global_store_dwordx4 v16, v[8:11], s[0:1] offset:32
@@ -2442,8 +2426,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__vgprcd_mac_flags(<8 x hal
 ; HEURRC-NEXT:    s_nop 1
 ; HEURRC-NEXT:    v_mfma_f32_32x32x16_f16 v[0:15], v[16:19], v[20:23], v[0:15] cbsz:3 abid:2 blgp:1
 ; HEURRC-NEXT:    v_mov_b32_e32 v16, 0
-; HEURRC-NEXT:    s_nop 7
-; HEURRC-NEXT:    s_nop 2
+; HEURRC-NEXT:    s_nop 10
 ; HEURRC-NEXT:    global_store_dwordx4 v16, v[12:15], s[0:1] offset:48
 ; HEURRC-NEXT:    global_store_dwordx4 v16, v[8:11], s[0:1] offset:32
 ; HEURRC-NEXT:    global_store_dwordx4 v16, v[4:7], s[0:1] offset:16
@@ -2471,8 +2454,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__vgprcd_mac_flags(<8 x hal
 ; VGPRRC-NEXT:    s_nop 1
 ; VGPRRC-NEXT:    v_mfma_f32_32x32x16_f16 v[0:15], v[16:19], v[20:23], v[0:15] cbsz:3 abid:2 blgp:1
 ; VGPRRC-NEXT:    v_mov_b32_e32 v16, 0
-; VGPRRC-NEXT:    s_nop 7
-; VGPRRC-NEXT:    s_nop 2
+; VGPRRC-NEXT:    s_nop 10
 ; VGPRRC-NEXT:    global_store_dwordx4 v16, v[12:15], s[0:1] offset:48
 ; VGPRRC-NEXT:    global_store_dwordx4 v16, v[8:11], s[0:1] offset:32
 ; VGPRRC-NEXT:    global_store_dwordx4 v16, v[4:7], s[0:1] offset:16
@@ -3083,8 +3065,7 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8(<4 x i32> %arg0, <4 x i32>
 ; GISEL-NEXT:    v_mov_b64_e32 v[0:1], s[12:13]
 ; GISEL-NEXT:    v_mov_b64_e32 v[2:3], s[14:15]
 ; GISEL-NEXT:    v_mov_b64_e32 v[10:11], s[10:11]
-; GISEL-NEXT:    s_nop 7
-; GISEL-NEXT:    s_nop 0
+; GISEL-NEXT:    s_nop 8
 ; GISEL-NEXT:    global_store_dwordx4 v[12:13], a[16:19], off sc0 sc1
 ; GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GISEL-NEXT:    global_store_dwordx4 v[14:15], a[20:23], off sc0 sc1
@@ -3205,8 +3186,7 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8(<4 x i32> %arg0, <4 x i32>
 ; VGPRRC-NEXT:    v_mov_b64_e32 v[16:17], s[8:9]
 ; VGPRRC-NEXT:    s_nop 1
 ; VGPRRC-NEXT:    v_mfma_i32_32x32x32_i8 v[0:15], v[36:39], v[40:43], v[16:31]
-; VGPRRC-NEXT:    s_nop 7
-; VGPRRC-NEXT:    s_nop 3
+; VGPRRC-NEXT:    s_nop 11
 ; VGPRRC-NEXT:    global_store_dwordx4 v[32:33], v[12:15], off sc0 sc1
 ; VGPRRC-NEXT:    s_waitcnt vmcnt(0)
 ; VGPRRC-NEXT:    global_store_dwordx4 v[34:35], v[8:11], off sc0 sc1
@@ -3497,8 +3477,7 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__flags(<4 x i32> %arg0, <4
 ; GISEL-NEXT:    v_mov_b64_e32 v[0:1], s[12:13]
 ; GISEL-NEXT:    v_mov_b64_e32 v[2:3], s[14:15]
 ; GISEL-NEXT:    v_mov_b64_e32 v[10:11], s[10:11]
-; GISEL-NEXT:    s_nop 7
-; GISEL-NEXT:    s_nop 0
+; GISEL-NEXT:    s_nop 8
 ; GISEL-NEXT:    global_store_dwordx4 v[12:13], a[16:19], off sc0 sc1
 ; GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GISEL-NEXT:    global_store_dwordx4 v[14:15], a[20:23], off sc0 sc1
@@ -3619,8 +3598,7 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__flags(<4 x i32> %arg0, <4
 ; VGPRRC-NEXT:    v_mov_b64_e32 v[16:17], s[8:9]
 ; VGPRRC-NEXT:    s_nop 1
 ; VGPRRC-NEXT:    v_mfma_i32_32x32x32_i8 v[0:15], v[36:39], v[40:43], v[16:31] cbsz:2 abid:3 blgp:1
-; VGPRRC-NEXT:    s_nop 7
-; VGPRRC-NEXT:    s_nop 3
+; VGPRRC-NEXT:    s_nop 11
 ; VGPRRC-NEXT:    global_store_dwordx4 v[32:33], v[12:15], off sc0 sc1
 ; VGPRRC-NEXT:    s_waitcnt vmcnt(0)
 ; VGPRRC-NEXT:    global_store_dwordx4 v[34:35], v[8:11], off sc0 sc1
@@ -3827,8 +3805,7 @@ define <16 x i32> @test_mfma_i32_32x32x32_i8__mac(<4 x i32> %arg0, <4 x i32> %ar
 ; GCN-NEXT:    v_accvgpr_write_b32 a15, v23
 ; GCN-NEXT:    s_nop 1
 ; GCN-NEXT:    v_mfma_i32_32x32x32_i8 a[0:15], v[0:3], v[4:7], a[0:15]
-; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    s_nop 3
+; GCN-NEXT:    s_nop 11
 ; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
 ; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
 ; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
@@ -3868,8 +3845,7 @@ define <16 x i32> @test_mfma_i32_32x32x32_i8__mac(<4 x i32> %arg0, <4 x i32> %ar
 ; HEURRC-NEXT:    v_accvgpr_write_b32 a15, v23
 ; HEURRC-NEXT:    s_nop 1
 ; HEURRC-NEXT:    v_mfma_i32_32x32x32_i8 a[0:15], v[0:3], v[4:7], a[0:15]
-; HEURRC-NEXT:    s_nop 7
-; HEURRC-NEXT:    s_nop 3
+; HEURRC-NEXT:    s_nop 11
 ; HEURRC-NEXT:    v_accvgpr_read_b32 v0, a0
 ; HEURRC-NEXT:    v_accvgpr_read_b32 v1, a1
 ; HEURRC-NEXT:    v_accvgpr_read_b32 v2, a2
@@ -3892,8 +3868,7 @@ define <16 x i32> @test_mfma_i32_32x32x32_i8__mac(<4 x i32> %arg0, <4 x i32> %ar
 ; VGPRRC:       ; %bb.0:
 ; VGPRRC-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VGPRRC-NEXT:    v_mfma_i32_32x32x32_i8 v[8:23], v[0:3], v[4:7], v[8:23]
-; VGPRRC-NEXT:    s_nop 7
-; VGPRRC-NEXT:    s_nop 3
+; VGPRRC-NEXT:    s_nop 11
 ; VGPRRC-NEXT:    v_mov_b32_e32 v0, v8
 ; VGPRRC-NEXT:    v_mov_b32_e32 v1, v9
 ; VGPRRC-NEXT:    v_mov_b32_e32 v2, v10
@@ -4000,8 +3975,7 @@ define <16 x i32> @test_mfma_i32_32x32x32_i8__mac__flags(<4 x i32> %arg0, <4 x i
 ; GCN-NEXT:    v_accvgpr_write_b32 a15, v23
 ; GCN-NEXT:    s_nop 1
 ; GCN-NEXT:    v_mfma_i32_32x32x32_i8 a[0:15], v[0:3], v[4:7], a[0:15] cbsz:1 abid:1 blgp:1
-; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    s_nop 3
+; GCN-NEXT:    s_nop 11
 ; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
 ; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
 ; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
@@ -4041,8 +4015,7 @@ define <16 x i32> @test_mfma_i32_32x32x32_i8__mac__flags(<4 x i32> %arg0, <4 x i
 ; HEURRC-NEXT:    v_accvgpr_write_b32 a15, v23
 ; HEURRC-NEXT:    s_nop 1
 ; HEURRC-NEXT:    v_mfma_i32_32x32x32_i8 a[0:15], v[0:3], v[4:7], a[0:15] cbsz:1 abid:1 blgp:1
-; HEURRC-NEXT:    s_nop 7
-; HEURRC-NEXT:    s_nop 3
+; HEURRC-NEXT:    s_nop 11
 ; HEURRC-NEXT:    v_accvgpr_read_b32 v0, a0
 ; HEURRC-NEXT:    v_accvgpr_read_b32 v1, a1
 ; HEURRC-NEXT:    v_accvgpr_read_b32 v2, a2
@@ -4065,8 +4038,7 @@ define <16 x i32> @test_mfma_i32_32x32x32_i8__mac__flags(<4 x i32> %arg0, <4 x i
 ; VGPRRC:       ; %bb.0:
 ; VGPRRC-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VGPRRC-NEXT:    v_mfma_i32_32x32x32_i8 v[8:23], v[0:3], v[4:7], v[8:23] cbsz:1 abid:1 blgp:1
-; VGPRRC-NEXT:    s_nop 7
-; VGPRRC-NEXT:    s_nop 3
+; VGPRRC-NEXT:    s_nop 11
 ; VGPRRC-NEXT:    v_mov_b32_e32 v0, v8
 ; VGPRRC-NEXT:    v_mov_b32_e32 v1, v9
 ; VGPRRC-NEXT:    v_mov_b32_e32 v2, v10
@@ -4932,8 +4904,7 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__vgprcd_mac(<4 x i32> %arg0
 ; SDAG-NEXT:    s_nop 1
 ; SDAG-NEXT:    v_mfma_i32_32x32x32_i8 v[0:15], v[16:19], v[20:23], v[0:15]
 ; SDAG-NEXT:    v_mov_b32_e32 v16, 0
-; SDAG-NEXT:    s_nop 7
-; SDAG-NEXT:    s_nop 2
+; SDAG-NEXT:    s_nop 10
 ; SDAG-NEXT:    global_store_dwordx4 v16, v[12:15], s[0:1] offset:48
 ; SDAG-NEXT:    global_store_dwordx4 v16, v[8:11], s[0:1] offset:32
 ; SDAG-NEXT:    global_store_dwordx4 v16, v[4:7], s[0:1] offset:16
@@ -4961,8 +4932,7 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__vgprcd_mac(<4 x i32> %arg0
 ; GISEL-NEXT:    s_nop 1
 ; GISEL-NEXT:    v_mfma_i32_32x32x32_i8 v[0:15], v[16:19], v[20:23], v[0:15]
 ; GISEL-NEXT:    v_mov_b32_e32 v16, 0
-; GISEL-NEXT:    s_nop 7
-; GISEL-NEXT:    s_nop 2
+; GISEL-NEXT:    s_nop 10
 ; GISEL-NEXT:    global_store_dwordx4 v16, v[0:3], s[0:1]
 ; GISEL-NEXT:    global_store_dwordx4 v16, v[4:7], s[0:1] offset:16
 ; GISEL-NEXT:    global_store_dwordx4 v16, v[8:11], s[0:1] offset:32
@@ -4995,8 +4965,7 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__vgprcd_mac(<4 x i32> %arg0
 ; HEURRC-NEXT:    s_nop 1
 ; HEURRC-NEXT:    v_mfma_i32_32x32x32_i8 v[0:15], v[16:19], v[20:23], v[0:15]
 ; HEURRC-NEXT:    v_mov_b32_e32 v16, 0
-; HEURRC-NEXT:    s_nop 7
-; HEURRC-NEXT:    s_nop 2
+; HEURRC-NEXT:    s_nop 10
 ; HEURRC-NEXT:    global_store_dwordx4 v16, v[12:15], s[0:1] offset:48
 ; HEURRC-NEXT:    global_store_dwordx4 v16, v[8:11], s[0:1] offset:32
 ; HEURRC-NEXT:    global_store_dwordx4 v16, v[4:7], s[0:1] offset:16
@@ -5029,8 +4998,7 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__vgprcd_mac(<4 x i32> %arg0
 ; VGPRRC-NEXT:    s_nop 1
 ; VGPRRC-NEXT:    v_mfma_i32_32x32x32_i8 v[0:15], v[16:19], v[20:23], v[0:15]
 ; VGPRRC-NEXT:    v_mov_b32_e32 v16, 0
-; VGPRRC-NEXT:    s_nop 7
-; VGPRRC-NEXT:    s_nop 2
+; VGPRRC-NEXT:    s_nop 10
 ; VGPRRC-NEXT:    global_store_dwordx4 v16, v[12:15], s[0:1] offset:48
 ; VGPRRC-NEXT:    global_store_dwordx4 v16, v[8:11], s[0:1] offset:32
 ; VGPRRC-NEXT:    global_store_dwordx4 v16, v[4:7], s[0:1] offset:16
@@ -5142,8 +5110,7 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__vgprcd_mac_flags(<4 x i32>
 ; SDAG-NEXT:    s_nop 1
 ; SDAG-NEXT:    v_mfma_i32_32x32x32_i8 v[0:15], v[16:19], v[20:23], v[0:15] cbsz:3 abid:2 blgp:1
 ; SDAG-NEXT:    v_mov_b32_e32 v16, 0
-; SDAG-NEXT:    s_nop 7
-; SDAG-NEXT:    s_nop 2
+; SDAG-NEXT:    s_nop 10
 ; SDAG-NEXT:    global_store_dwordx4 v16, v[12:15], s[0:1] offset:48
 ; SDAG-NEXT:    global_store_dwordx4 v16, v[8:11], s[0:1] offset:32
 ; SDAG-NEXT:    global_store_dwordx4 v16, v[4:7], s[0:1] offset:16
@@ -5171,8 +5138,7 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__vgprcd_mac_flags(<4 x i32>
 ; GISEL-NEXT:    s_nop 1
 ; GISEL-NEXT:    v_mfma_i32_32x32x32_i8 v[0:15], v[16:19], v[20:23], v[0:15] cbsz:3 abid:2 blgp:1
 ; GISEL-NEXT:    v_mov_b32_e32 v16, 0
-; GISEL-NEXT:    s_nop 7
-; GISEL-NEXT:    s_nop 2
+; GISEL-NEXT:    s_nop 10
 ; GISEL-NEXT:    global_store_dwordx4 v16, v[0:3], s[0:1]
 ; GISEL-NEXT:    global_store_dwordx4 v16, v[4:7], s[0:1] offset:16
 ; GISEL-NEXT:    global_store_dwordx4 v16, v[8:11], s[0:1] offset:32
@@ -5205,8 +5171,7 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__vgprcd_mac_flags(<4 x i32>
 ; HEURRC-NEXT:    s_nop 1
 ; HEURRC-NEXT:    v_mfma_i32_32x32x32_i8 v[0:15], v[16:19], v[20:23], v[0:15] cbsz:3 abid:2 blgp:1
 ; HEURRC-NEXT:    v_mov_b32_e32 v16, 0
-; HEURRC-NEXT:    s_nop 7
-; HEURRC-NEXT:    s_nop 2
+; HEURRC-NEXT:    s_nop 10
 ; HEURRC-NEXT:    global_store_dwordx4 v16, v[12:15], s[0:1] offset:48
 ; HEURRC-NEXT:    global_store_dwordx4 v16, v[8:11], s[0:1] offset:32
 ; HEURRC-NEXT:    global_store_dwordx4 v16, v[4:7], s[0:1] offset:16
@@ -5239,8 +5204,7 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__vgprcd_mac_flags(<4 x i32>
 ; VGPRRC-NEXT:    s_nop 1
 ; VGPRRC-NEXT:    v_mfma_i32_32x32x32_i8 v[0:15], v[16:19], v[20:23], v[0:15] cbsz:3 abid:2 blgp:1
 ; VGPRRC-NEXT:    v_mov_b32_e32 v16, 0
-; VGPRRC-NEXT:    s_nop 7
-; VGPRRC-NEXT:    s_nop 2
+; VGPRRC-NEXT:    s_nop 10
 ; VGPRRC-NEXT:    global_store_dwordx4 v16, v[12:15], s[0:1] offset:48
 ; VGPRRC-NEXT:    global_store_dwordx4 v16, v[8:11], s[0:1] offset:32
 ; VGPRRC-NEXT:    global_store_dwordx4 v16, v[4:7], s[0:1] offset:16
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.i8.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.i8.ll
index 856185b17e5fd..d24f1f0b526c3 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.i8.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.i8.ll
@@ -50,8 +50,7 @@ define amdgpu_kernel void @test_mfma_i32_32x32x8i8(ptr addrspace(1) %arg) #0 {
 ; GFX908-NEXT:    v_mov_b32_e32 v1, 2
 ; GFX908-NEXT:    s_nop 1
 ; GFX908-NEXT:    v_mfma_i32_32x32x8i8 a[0:15], v0, v1, a[0:15] cbsz:1 abid:2 blgp:3
-; GFX908-NEXT:    s_nop 7
-; GFX908-NEXT:    s_nop 7
+; GFX908-NEXT:    s_nop 15
 ; GFX908-NEXT:    s_nop 1
 ; GFX908-NEXT:    v_accvgpr_read_b32 v15, a15
 ; GFX908-NEXT:    v_accvgpr_read_b32 v14, a14
@@ -103,8 +102,7 @@ define amdgpu_kernel void @test_mfma_i32_32x32x8i8(ptr addrspace(1) %arg) #0 {
 ; GFX90A-NEXT:    s_nop 1
 ; GFX90A-NEXT:    v_mfma_i32_32x32x8i8 a[0:15], v0, v1, a[0:15] cbsz:1 abid:2 blgp:3
 ; GFX90A-NEXT:    v_mov_b32_e32 v0, 0
-; GFX90A-NEXT:    s_nop 7
-; GFX90A-NEXT:    s_nop 7
+; GFX90A-NEXT:    s_nop 15
 ; GFX90A-NEXT:    s_nop 1
 ; GFX90A-NEXT:    global_store_dwordx4 v0, a[12:15], s[16:17] offset:48
 ; GFX90A-NEXT:    global_store_dwordx4 v0, a[8:11], s[16:17] offset:32
@@ -138,8 +136,7 @@ define amdgpu_kernel void @test_mfma_i32_16x16x16i8(ptr addrspace(1) %arg) #0 {
 ; GFX908-NEXT:    v_accvgpr_write_b32 a3, v5
 ; GFX908-NEXT:    s_nop 0
 ; GFX908-NEXT:    v_mfma_i32_16x16x16i8 a[0:3], v0, v1, a[0:3] cbsz:1 abid:2 blgp:3
-; GFX908-NEXT:    s_nop 7
-; GFX908-NEXT:    s_nop 1
+; GFX908-NEXT:    s_nop 9
 ; GFX908-NEXT:    v_accvgpr_read_b32 v0, a0
 ; GFX908-NEXT:    v_accvgpr_read_b32 v1, a1
 ; GFX908-NEXT:    v_accvgpr_read_b32 v2, a2
@@ -163,8 +160,7 @@ define amdgpu_kernel void @test_mfma_i32_16x16x16i8(ptr addrspace(1) %arg) #0 {
 ; GFX90A-NEXT:    v_accvgpr_write_b32 a3, s3
 ; GFX90A-NEXT:    s_nop 1
 ; GFX90A-NEXT:    v_mfma_i32_16x16x16i8 a[0:3], v0, v2, a[0:3] cbsz:1 abid:2 blgp:3
-; GFX90A-NEXT:    s_nop 7
-; GFX90A-NEXT:    s_nop 2
+; GFX90A-NEXT:    s_nop 10
 ; GFX90A-NEXT:    global_store_dwordx4 v1, a[0:3], s[6:7]
 ; GFX90A-NEXT:    s_endpgm
 bb:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.ll
index c1508c1675fe0..7e30af96bb8b9 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.ll
@@ -97,8 +97,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32(ptr addrspace(1) %arg) #0 {
 ; NOLIT-SRCC-NEXT:    v_mov_b32_e32 v0, 2.0
 ; NOLIT-SRCC-NEXT:    s_nop 1
 ; NOLIT-SRCC-NEXT:    v_mfma_f32_32x32x1f32 a[0:31], v3, v0, a[0:31] cbsz:1 abid:2 blgp:3
-; NOLIT-SRCC-NEXT:    s_nop 7
-; NOLIT-SRCC-NEXT:    s_nop 7
+; NOLIT-SRCC-NEXT:    s_nop 15
 ; NOLIT-SRCC-NEXT:    s_nop 1
 ; NOLIT-SRCC-NEXT:    v_accvgpr_read_b32 v3, a27
 ; NOLIT-SRCC-NEXT:    v_accvgpr_read_b32 v2, a26
@@ -233,8 +232,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32(ptr addrspace(1) %arg) #0 {
 ; LIT-SRCC-NEXT:    v_mov_b32_e32 v0, 2.0
 ; LIT-SRCC-NEXT:    s_nop 1
 ; LIT-SRCC-NEXT:    v_mfma_f32_32x32x1f32 a[0:31], v3, v0, a[0:31] cbsz:1 abid:2 blgp:3
-; LIT-SRCC-NEXT:    s_nop 7
-; LIT-SRCC-NEXT:    s_nop 7
+; LIT-SRCC-NEXT:    s_nop 15
 ; LIT-SRCC-NEXT:    s_nop 1
 ; LIT-SRCC-NEXT:    v_accvgpr_read_b32 v3, a27
 ; LIT-SRCC-NEXT:    v_accvgpr_read_b32 v2, a26
@@ -337,8 +335,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32(ptr addrspace(1) %arg) #0 {
 ; GFX90A-NEXT:    v_accvgpr_write_b32 a31, s15
 ; GFX90A-NEXT:    s_nop 1
 ; GFX90A-NEXT:    v_mfma_f32_32x32x1f32 a[0:31], v1, v2, a[0:31] cbsz:1 abid:2 blgp:3
-; GFX90A-NEXT:    s_nop 7
-; GFX90A-NEXT:    s_nop 7
+; GFX90A-NEXT:    s_nop 15
 ; GFX90A-NEXT:    s_nop 2
 ; GFX90A-NEXT:    global_store_dwordx4 v0, a[24:27], s[34:35] offset:96
 ; GFX90A-NEXT:    global_store_dwordx4 v0, a[28:31], s[34:35] offset:112
@@ -394,8 +391,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32(ptr addrspace(1) %arg) #0 {
 ; GFX942-NEXT:    v_accvgpr_write_b32 a31, s15
 ; GFX942-NEXT:    s_nop 1
 ; GFX942-NEXT:    v_mfma_f32_32x32x1_2b_f32 a[0:31], v1, v2, a[0:31] cbsz:1 abid:2 blgp:3
-; GFX942-NEXT:    s_nop 7
-; GFX942-NEXT:    s_nop 7
+; GFX942-NEXT:    s_nop 15
 ; GFX942-NEXT:    s_nop 1
 ; GFX942-NEXT:    global_store_dwordx4 v0, a[24:27], s[34:35] offset:96
 ; GFX942-NEXT:    global_store_dwordx4 v0, a[28:31], s[34:35] offset:112
@@ -451,8 +447,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32(ptr addrspace(1) %arg) #0 {
 ; GFX942-VGPR-NEXT:    v_mov_b32_e32 v31, s15
 ; GFX942-VGPR-NEXT:    s_nop 1
 ; GFX942-VGPR-NEXT:    v_mfma_f32_32x32x1_2b_f32 v[0:31], v33, v34, v[0:31] cbsz:1 abid:2 blgp:3
-; GFX942-VGPR-NEXT:    s_nop 7
-; GFX942-VGPR-NEXT:    s_nop 7
+; GFX942-VGPR-NEXT:    s_nop 15
 ; GFX942-VGPR-NEXT:    s_nop 1
 ; GFX942-VGPR-NEXT:    global_store_dwordx4 v32, v[24:27], s[34:35] offset:96
 ; GFX942-VGPR-NEXT:    global_store_dwordx4 v32, v[28:31], s[34:35] offset:112
@@ -514,8 +509,7 @@ define amdgpu_kernel void @test_mfma_f32_16x16x1f32(ptr addrspace(1) %arg) #0 {
 ; NOLIT-SRCC-NEXT:    v_mov_b32_e32 v1, 2.0
 ; NOLIT-SRCC-NEXT:    s_nop 1
 ; NOLIT-SRCC-NEXT:    v_mfma_f32_16x16x1f32 a[0:15], v0, v1, a[0:15] cbsz:1 abid:2 blgp:3
-; NOLIT-SRCC-NEXT:    s_nop 7
-; NOLIT-SRCC-NEXT:    s_nop 1
+; NOLIT-SRCC-NEXT:    s_nop 9
 ; NOLIT-SRCC-NEXT:    v_accvgpr_read_b32 v3, a15
 ; NOLIT-SRCC-NEXT:    v_accvgpr_read_b32 v2, a14
 ; NOLIT-SRCC-NEXT:    v_accvgpr_read_b32 v1, a13
@@ -582,8 +576,7 @@ define amdgpu_kernel void @test_mfma_f32_16x16x1f32(ptr addrspace(1) %arg) #0 {
 ; LIT-SRCC-NEXT:    v_mov_b32_e32 v1, 2.0
 ; LIT-SRCC-NEXT:    s_nop 1
 ; LIT-SRCC-NEXT:    v_mfma_f32_16x16x1f32 a[0:15], v0, v1, a[0:15] cbsz:1 abid:2 blgp:3
-; LIT-SRCC-NEXT:    s_nop 7
-; LIT-SRCC-NEXT:    s_nop 1
+; LIT-SRCC-NEXT:    s_nop 9
 ; LIT-SRCC-NEXT:    v_accvgpr_read_b32 v3, a15
 ; LIT-SRCC-NEXT:    v_accvgpr_read_b32 v2, a14
 ; LIT-SRCC-NEXT:    v_accvgpr_read_b32 v1, a13
@@ -634,8 +627,7 @@ define amdgpu_kernel void @test_mfma_f32_16x16x1f32(ptr addrspace(1) %arg) #0 {
 ; GFX90A-NEXT:    s_nop 1
 ; GFX90A-NEXT:    v_mfma_f32_16x16x1f32 a[0:15], v0, v1, a[0:15] cbsz:1 abid:2 blgp:3
 ; GFX90A-NEXT:    v_mov_b32_e32 v0, 0
-; GFX90A-NEXT:    s_nop 7
-; GFX90A-NEXT:    s_nop 1
+; GFX90A-NEXT:    s_nop 9
 ; GFX90A-NEXT:    global_store_dwordx4 v0, a[12:15], s[16:17] offset:48
 ; GFX90A-NEXT:    global_store_dwordx4 v0, a[8:11], s[16:17] offset:32
 ; GFX90A-NEXT:    global_store_dwordx4 v0, a[4:7], s[16:17] offset:16
@@ -669,8 +661,7 @@ define amdgpu_kernel void @test_mfma_f32_16x16x1f32(ptr addrspace(1) %arg) #0 {
 ; GFX942-NEXT:    s_nop 1
 ; GFX942-NEXT:    v_mfma_f32_16x16x1_4b_f32 a[0:15], v0, v1, a[0:15] cbsz:1 abid:2 blgp:3
 ; GFX942-NEXT:    v_mov_b32_e32 v0, 0
-; GFX942-NEXT:    s_nop 7
-; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    s_nop 8
 ; GFX942-NEXT:    global_store_dwordx4 v0, a[12:15], s[16:17] offset:48
 ; GFX942-NEXT:    global_store_dwordx4 v0, a[8:11], s[16:17] offset:32
 ; GFX942-NEXT:    global_store_dwordx4 v0, a[4:7], s[16:17] offset:16
@@ -696,8 +687,7 @@ define amdgpu_kernel void @test_mfma_f32_16x16x1f32(ptr addrspace(1) %arg) #0 {
 ; GFX942-VGPR-NEXT:    s_nop 1
 ; GFX942-VGPR-NEXT:    v_mfma_f32_16x16x1_4b_f32 v[0:15], v16, v17, v[0:15] cbsz:1 abid:2 blgp:3
 ; GFX942-VGPR-NEXT:    v_mov_b32_e32 v16, 0
-; GFX942-VGPR-NEXT:    s_nop 7
-; GFX942-VGPR-NEXT:    s_nop 0
+; GFX942-VGPR-NEXT:    s_nop 8
 ; GFX942-VGPR-NEXT:    global_store_dwordx4 v16, v[12:15], s[16:17] offset:48
 ; GFX942-VGPR-NEXT:    global_store_dwordx4 v16, v[8:11], s[16:17] offset:32
 ; GFX942-VGPR-NEXT:    global_store_dwordx4 v16, v[4:7], s[16:17] offset:16
@@ -872,8 +862,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x2f32(ptr addrspace(1) %arg) #0 {
 ; NOLIT-SRCC-NEXT:    v_mov_b32_e32 v1, 2.0
 ; NOLIT-SRCC-NEXT:    s_nop 1
 ; NOLIT-SRCC-NEXT:    v_mfma_f32_32x32x2f32 a[0:15], v0, v1, a[0:15] cbsz:1 abid:2 blgp:3
-; NOLIT-SRCC-NEXT:    s_nop 7
-; NOLIT-SRCC-NEXT:    s_nop 7
+; NOLIT-SRCC-NEXT:    s_nop 15
 ; NOLIT-SRCC-NEXT:    s_nop 1
 ; NOLIT-SRCC-NEXT:    v_accvgpr_read_b32 v3, a15
 ; NOLIT-SRCC-NEXT:    v_accvgpr_read_b32 v2, a14
@@ -940,8 +929,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x2f32(ptr addrspace(1) %arg) #0 {
 ; LIT-SRCC-NEXT:    v_mov_b32_e32 v1, 2.0
 ; LIT-SRCC-NEXT:    s_nop 1
 ; LIT-SRCC-NEXT:    v_mfma_f32_32x32x2f32 a[0:15], v0, v1, a[0:15] cbsz:1 abid:2 blgp:3
-; LIT-SRCC-NEXT:    s_nop 7
-; LIT-SRCC-NEXT:    s_nop 7
+; LIT-SRCC-NEXT:    s_nop 15
 ; LIT-SRCC-NEXT:    s_nop 1
 ; LIT-SRCC-NEXT:    v_accvgpr_read_b32 v3, a15
 ; LIT-SRCC-NEXT:    v_accvgpr_read_b32 v2, a14
@@ -992,8 +980,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x2f32(ptr addrspace(1) %arg) #0 {
 ; GFX90A-NEXT:    s_nop 1
 ; GFX90A-NEXT:    v_mfma_f32_32x32x2f32 a[0:15], v0, v1, a[0:15] cbsz:1 abid:2 blgp:3
 ; GFX90A-NEXT:    v_mov_b32_e32 v0, 0
-; GFX90A-NEXT:    s_nop 7
-; GFX90A-NEXT:    s_nop 7
+; GFX90A-NEXT:    s_nop 15
 ; GFX90A-NEXT:    s_nop 1
 ; GFX90A-NEXT:    global_store_dwordx4 v0, a[12:15], s[16:17] offset:48
 ; GFX90A-NEXT:    global_store_dwordx4 v0, a[8:11], s[16:17] offset:32
@@ -1028,8 +1015,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x2f32(ptr addrspace(1) %arg) #0 {
 ; GFX942-NEXT:    s_nop 1
 ; GFX942-NEXT:    v_mfma_f32_32x32x2_f32 a[0:15], v0, v1, a[0:15] cbsz:1 abid:2 blgp:3
 ; GFX942-NEXT:    v_mov_b32_e32 v0, 0
-; GFX942-NEXT:    s_nop 7
-; GFX942-NEXT:    s_nop 7
+; GFX942-NEXT:    s_nop 15
 ; GFX942-NEXT:    s_nop 0
 ; GFX942-NEXT:    global_store_dwordx4 v0, a[12:15], s[16:17] offset:48
 ; GFX942-NEXT:    global_store_dwordx4 v0, a[8:11], s[16:17] offset:32
@@ -1056,8 +1042,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x2f32(ptr addrspace(1) %arg) #0 {
 ; GFX942-VGPR-NEXT:    s_nop 1
 ; GFX942-VGPR-NEXT:    v_mfma_f32_32x32x2_f32 v[0:15], v16, v17, v[0:15] cbsz:1 abid:2 blgp:3
 ; GFX942-VGPR-NEXT:    v_mov_b32_e32 v16, 0
-; GFX942-VGPR-NEXT:    s_nop 7
-; GFX942-VGPR-NEXT:    s_nop 7
+; GFX942-VGPR-NEXT:    s_nop 15
 ; GFX942-VGPR-NEXT:    s_nop 0
 ; GFX942-VGPR-NEXT:    global_store_dwordx4 v16, v[12:15], s[16:17] offset:48
 ; GFX942-VGPR-NEXT:    global_store_dwordx4 v16, v[8:11], s[16:17] offset:32
@@ -1091,8 +1076,7 @@ define amdgpu_kernel void @test_mfma_f32_16x16x4f32(ptr addrspace(1) %arg) #0 {
 ; NOLIT-SRCC-NEXT:    v_accvgpr_write_b32 a3, v5
 ; NOLIT-SRCC-NEXT:    s_nop 0
 ; NOLIT-SRCC-NEXT:    v_mfma_f32_16x16x4f32 a[0:3], v0, v1, a[0:3] cbsz:1 abid:2 blgp:3
-; NOLIT-SRCC-NEXT:    s_nop 7
-; NOLIT-SRCC-NEXT:    s_nop 1
+; NOLIT-SRCC-NEXT:    s_nop 9
 ; NOLIT-SRCC-NEXT:    v_accvgpr_read_b32 v0, a0
 ; NOLIT-SRCC-NEXT:    v_accvgpr_read_b32 v1, a1
 ; NOLIT-SRCC-NEXT:    v_accvgpr_read_b32 v2, a2
@@ -1120,8 +1104,7 @@ define amdgpu_kernel void @test_mfma_f32_16x16x4f32(ptr addrspace(1) %arg) #0 {
 ; LIT-SRCC-NEXT:    v_accvgpr_write_b32 a3, v5
 ; LIT-SRCC-NEXT:    s_nop 0
 ; LIT-SRCC-NEXT:    v_mfma_f32_16x16x4f32 a[0:3], v0, v1, a[0:3] cbsz:1 abid:2 blgp:3
-; LIT-SRCC-NEXT:    s_nop 7
-; LIT-SRCC-NEXT:    s_nop 1
+; LIT-SRCC-NEXT:    s_nop 9
 ; LIT-SRCC-NEXT:    v_accvgpr_read_b32 v0, a0
 ; LIT-SRCC-NEXT:    v_accvgpr_read_b32 v1, a1
 ; LIT-SRCC-NEXT:    v_accvgpr_read_b32 v2, a2
@@ -1145,8 +1128,7 @@ define amdgpu_kernel void @test_mfma_f32_16x16x4f32(ptr addrspace(1) %arg) #0 {
 ; GFX90A-NEXT:    v_accvgpr_write_b32 a3, s3
 ; GFX90A-NEXT:    s_nop 1
 ; GFX90A-NEXT:    v_mfma_f32_16x16x4f32 a[0:3], v0, v2, a[0:3] cbsz:1 abid:2 blgp:3
-; GFX90A-NEXT:    s_nop 7
-; GFX90A-NEXT:    s_nop 2
+; GFX90A-NEXT:    s_nop 10
 ; GFX90A-NEXT:    global_store_dwordx4 v1, a[0:3], s[6:7]
 ; GFX90A-NEXT:    s_endpgm
 ;
@@ -1165,8 +1147,7 @@ define amdgpu_kernel void @test_mfma_f32_16x16x4f32(ptr addrspace(1) %arg) #0 {
 ; GFX942-NEXT:    v_accvgpr_write_b32 a3, s3
 ; GFX942-NEXT:    s_nop 1
 ; GFX942-NEXT:    v_mfma_f32_16x16x4_f32 a[0:3], v0, v2, a[0:3] cbsz:1 abid:2 blgp:3
-; GFX942-NEXT:    s_nop 7
-; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    s_nop 9
 ; GFX942-NEXT:    global_store_dwordx4 v1, a[0:3], s[6:7]
 ; GFX942-NEXT:    s_endpgm
 ;
@@ -1183,8 +1164,7 @@ define amdgpu_kernel void @test_mfma_f32_16x16x4f32(ptr addrspace(1) %arg) #0 {
 ; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX942-VGPR-NEXT:    s_nop 1
 ; GFX942-VGPR-NEXT:    v_mfma_f32_16x16x4_f32 v[0:3], v4, v6, v[0:3] cbsz:1 abid:2 blgp:3
-; GFX942-VGPR-NEXT:    s_nop 7
-; GFX942-VGPR-NEXT:    s_nop 1
+; GFX942-VGPR-NEXT:    s_nop 9
 ; GFX942-VGPR-NEXT:    global_store_dwordx4 v5, v[0:3], s[6:7]
 ; GFX942-VGPR-NEXT:    s_endpgm
 bb:
@@ -1275,8 +1255,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x4f16(ptr addrspace(1) %arg, ptr a
 ; NOLIT-SRCC-NEXT:    v_mov_b32_e32 v1, s3
 ; NOLIT-SRCC-NEXT:    s_nop 1
 ; NOLIT-SRCC-NEXT:    v_mfma_f32_32x32x4f16 a[0:31], v[2:3], v[0:1], a[0:31] cbsz:1 abid:2 blgp:3
-; NOLIT-SRCC-NEXT:    s_nop 7
-; NOLIT-SRCC-NEXT:    s_nop 7
+; NOLIT-SRCC-NEXT:    s_nop 15
 ; NOLIT-SRCC-NEXT:    s_nop 1
 ; NOLIT-SRCC-NEXT:    v_accvgpr_read_b32 v3, a27
 ; NOLIT-SRCC-NEXT:    v_accvgpr_read_b32 v2, a26
@@ -1415,8 +1394,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x4f16(ptr addrspace(1) %arg, ptr a
 ; LIT-SRCC-NEXT:    v_mov_b32_e32 v1, s3
 ; LIT-SRCC-NEXT:    s_nop 1
 ; LIT-SRCC-NEXT:    v_mfma_f32_32x32x4f16 a[0:31], v[2:3], v[0:1], a[0:31] cbsz:1 abid:2 blgp:3
-; LIT-SRCC-NEXT:    s_nop 7
-; LIT-SRCC-NEXT:    s_nop 7
+; LIT-SRCC-NEXT:    s_nop 15
 ; LIT-SRCC-NEXT:    s_nop 1
 ; LIT-SRCC-NEXT:    v_accvgpr_read_b32 v3, a27
 ; LIT-SRCC-NEXT:    v_accvgpr_read_b32 v2, a26
@@ -1523,8 +1501,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x4f16(ptr addrspace(1) %arg, ptr a
 ; GFX90A-NEXT:    v_mov_b32_e32 v5, s3
 ; GFX90A-NEXT:    s_nop 1
 ; GFX90A-NEXT:    v_mfma_f32_32x32x4f16 a[0:31], v[2:3], v[4:5], a[0:31] cbsz:1 abid:2 blgp:3
-; GFX90A-NEXT:    s_nop 7
-; GFX90A-NEXT:    s_nop 7
+; GFX90A-NEXT:    s_nop 15
 ; GFX90A-NEXT:    s_nop 2
 ; GFX90A-NEXT:    global_store_dwordx4 v0, a[24:27], s[36:37] offset:96
 ; GFX90A-NEXT:    global_store_dwordx4 v0, a[28:31], s[36:37] offset:112
@@ -1584,8 +1561,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x4f16(ptr addrspace(1) %arg, ptr a
 ; GFX942-NEXT:    v_mov_b32_e32 v5, s3
 ; GFX942-NEXT:    s_nop 1
 ; GFX942-NEXT:    v_mfma_f32_32x32x4_2b_f16 a[0:31], v[2:3], v[4:5], a[0:31] cbsz:1 abid:2 blgp:3
-; GFX942-NEXT:    s_nop 7
-; GFX942-NEXT:    s_nop 7
+; GFX942-NEXT:    s_nop 15
 ; GFX942-NEXT:    s_nop 2
 ; GFX942-NEXT:    global_store_dwordx4 v0, a[24:27], s[36:37] offset:96
 ; GFX942-NEXT:    global_store_dwordx4 v0, a[28:31], s[36:37] offset:112
@@ -1645,8 +1621,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x4f16(ptr addrspace(1) %arg, ptr a
 ; GFX942-VGPR-NEXT:    v_mov_b32_e32 v37, s3
 ; GFX942-VGPR-NEXT:    s_nop 1
 ; GFX942-VGPR-NEXT:    v_mfma_f32_32x32x4_2b_f16 v[0:31], v[34:35], v[36:37], v[0:31] cbsz:1 abid:2 blgp:3
-; GFX942-VGPR-NEXT:    s_nop 7
-; GFX942-VGPR-NEXT:    s_nop 7
+; GFX942-VGPR-NEXT:    s_nop 15
 ; GFX942-VGPR-NEXT:    s_nop 2
 ; GFX942-VGPR-NEXT:    global_store_dwordx4 v32, v[24:27], s[36:37] offset:96
 ; GFX942-VGPR-NEXT:    global_store_dwordx4 v32, v[28:31], s[36:37] offset:112
@@ -1714,8 +1689,7 @@ define amdgpu_kernel void @test_mfma_f32_16x16x4f16(ptr addrspace(1) %arg, ptr a
 ; NOLIT-SRCC-NEXT:    v_mov_b32_e32 v3, s23
 ; NOLIT-SRCC-NEXT:    s_nop 1
 ; NOLIT-SRCC-NEXT:    v_mfma_f32_16x16x4f16 a[0:15], v[0:1], v[2:3], a[0:15] cbsz:1 abid:2 blgp:3
-; NOLIT-SRCC-NEXT:    s_nop 7
-; NOLIT-SRCC-NEXT:    s_nop 1
+; NOLIT-SRCC-NEXT:    s_nop 9
 ; NOLIT-SRCC-NEXT:    v_accvgpr_read_b32 v3, a15
 ; NOLIT-SRCC-NEXT:    v_accvgpr_read_b32 v2, a14
 ; NOLIT-SRCC-NEXT:    v_accvgpr_read_b32 v1, a13
@@ -1785,8 +1759,7 @@ define amdgpu_kernel void @test_mfma_f32_16x16x4f16(ptr addrspace(1) %arg, ptr a
 ; LIT-SRCC-NEXT:    v_mov_b32_e32 v3, s23
 ; LIT-SRCC-NEXT:    s_nop 1
 ; LIT-SRCC-NEXT:    v_mfma_f32_16x16x4f16 a[0:15], v[0:1], v[2:3], a[0:15] cbsz:1 abid:2 blgp:3
-; LIT-SRCC-NEXT:    s_nop 7
-; LIT-SRCC-NEXT:    s_nop 1
+; LIT-SRCC-NEXT:    s_nop 9
 ; LIT-SRCC-NEXT:    v_accvgpr_read_b32 v3, a15
 ; LIT-SRCC-NEXT:    v_accvgpr_read_b32 v2, a14
 ; LIT-SRCC-NEXT:    v_accvgpr_read_b32 v1, a13
@@ -1840,8 +1813,7 @@ define amdgpu_kernel void @test_mfma_f32_16x16x4f16(ptr addrspace(1) %arg, ptr a
 ; GFX90A-NEXT:    s_nop 1
 ; GFX90A-NEXT:    v_mfma_f32_16x16x4f16 a[0:15], v[0:1], v[2:3], a[0:15] cbsz:1 abid:2 blgp:3
 ; GFX90A-NEXT:    v_mov_b32_e32 v0, 0
-; GFX90A-NEXT:    s_nop 7
-; GFX90A-NEXT:    s_nop 1
+; GFX90A-NEXT:    s_nop 9
 ; GFX90A-NEXT:    global_store_dwordx4 v0, a[12:15], s[16:17] offset:48
 ; GFX90A-NEXT:    global_store_dwordx4 v0, a[8:11], s[16:17] offset:32
 ; GFX90A-NEXT:    global_store_dwordx4 v0, a[4:7], s[16:17] offset:16
@@ -1878,8 +1850,7 @@ define amdgpu_kernel void @test_mfma_f32_16x16x4f16(ptr addrspace(1) %arg, ptr a
 ; GFX942-NEXT:    s_nop 1
 ; GFX942-NEXT:    v_mfma_f32_16x16x4_4b_f16 a[0:15], v[0:1], v[2:3], a[0:15] cbsz:1 abid:2 blgp:3
 ; GFX942-NEXT:    v_mov_b32_e32 v0, 0
-; GFX942-NEXT:    s_nop 7
-; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    s_nop 9
 ; GFX942-NEXT:    global_store_dwordx4 v0, a[12:15], s[16:17] offset:48
 ; GFX942-NEXT:    global_store_dwordx4 v0, a[8:11], s[16:17] offset:32
 ; GFX942-NEXT:    global_store_dwordx4 v0, a[4:7], s[16:17] offset:16
@@ -1908,8 +1879,7 @@ define amdgpu_kernel void @test_mfma_f32_16x16x4f16(ptr addrspace(1) %arg, ptr a
 ; GFX942-VGPR-NEXT:    s_nop 1
 ; GFX942-VGPR-NEXT:    v_mfma_f32_16x16x4_4b_f16 v[0:15], v[16:17], v[18:19], v[0:15] cbsz:1 abid:2 blgp:3
 ; GFX942-VGPR-NEXT:    v_mov_b32_e32 v16, 0
-; GFX942-VGPR-NEXT:    s_nop 7
-; GFX942-VGPR-NEXT:    s_nop 1
+; GFX942-VGPR-NEXT:    s_nop 9
 ; GFX942-VGPR-NEXT:    global_store_dwordx4 v16, v[12:15], s[16:17] offset:48
 ; GFX942-VGPR-NEXT:    global_store_dwordx4 v16, v[8:11], s[16:17] offset:32
 ; GFX942-VGPR-NEXT:    global_store_dwordx4 v16, v[4:7], s[16:17] offset:16
@@ -2108,8 +2078,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x8f16(ptr addrspace(1) %arg, ptr a
 ; NOLIT-SRCC-NEXT:    v_mov_b32_e32 v3, s23
 ; NOLIT-SRCC-NEXT:    s_nop 1
 ; NOLIT-SRCC-NEXT:    v_mfma_f32_32x32x8f16 a[0:15], v[0:1], v[2:3], a[0:15] cbsz:1 abid:2 blgp:3
-; NOLIT-SRCC-NEXT:    s_nop 7
-; NOLIT-SRCC-NEXT:    s_nop 7
+; NOLIT-SRCC-NEXT:    s_nop 15
 ; NOLIT-SRCC-NEXT:    s_nop 1
 ; NOLIT-SRCC-NEXT:    v_accvgpr_read_b32 v3, a15
 ; NOLIT-SRCC-NEXT:    v_accvgpr_read_b32 v2, a14
@@ -2179,8 +2148,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x8f16(ptr addrspace(1) %arg, ptr a
 ; LIT-SRCC-NEXT:    v_mov_b32_e32 v3, s23
 ; LIT-SRCC-NEXT:    s_nop 1
 ; LIT-SRCC-NEXT:    v_mfma_f32_32x32x8f16 a[0:15], v[0:1], v[2:3], a[0:15] cbsz:1 abid:2 blgp:3
-; LIT-SRCC-NEXT:    s_nop 7
-; LIT-SRCC-NEXT:    s_nop 7
+; LIT-SRCC-NEXT:    s_nop 15
 ; LIT-SRCC-NEXT:    s_nop 1
 ; LIT-SRCC-NEXT:    v_accvgpr_read_b32 v3, a15
 ; LIT-SRCC-NEXT:    v_accvgpr_read_b32 v2, a14
@@ -2234,8 +2202,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x8f16(ptr addrspace(1) %arg, ptr a
 ; GFX90A-NEXT:    s_nop 1
 ; GFX90A-NEXT:    v_mfma_f32_32x32x8f16 a[0:15], v[0:1], v[2:3], a[0:15] cbsz:1 abid:2 blgp:3
 ; GFX90A-NEXT:    v_mov_b32_e32 v0, 0
-; GFX90A-NEXT:    s_nop 7
-; GFX90A-NEXT:    s_nop 7
+; GFX90A-NEXT:    s_nop 15
 ; GFX90A-NEXT:    s_nop 1
 ; GFX90A-NEXT:    global_store_dwordx4 v0, a[12:15], s[16:17] offset:48
 ; GFX90A-NEXT:    global_store_dwordx4 v0, a[8:11], s[16:17] offset:32
@@ -2273,8 +2240,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x8f16(ptr addrspace(1) %arg, ptr a
 ; GFX942-NEXT:    s_nop 1
 ; GFX942-NEXT:    v_mfma_f32_32x32x8_f16 a[0:15], v[0:1], v[2:3], a[0:15] cbsz:1 abid:2 blgp:3
 ; GFX942-NEXT:    v_mov_b32_e32 v0, 0
-; GFX942-NEXT:    s_nop 7
-; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    s_nop 9
 ; GFX942-NEXT:    global_store_dwordx4 v0, a[12:15], s[16:17] offset:48
 ; GFX942-NEXT:    global_store_dwordx4 v0, a[8:11], s[16:17] offset:32
 ; GFX942-NEXT:    global_store_dwordx4 v0, a[4:7], s[16:17] offset:16
@@ -2303,8 +2269,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x8f16(ptr addrspace(1) %arg, ptr a
 ; GFX942-VGPR-NEXT:    s_nop 1
 ; GFX942-VGPR-NEXT:    v_mfma_f32_32x32x8_f16 v[0:15], v[16:17], v[18:19], v[0:15] cbsz:1 abid:2 blgp:3
 ; GFX942-VGPR-NEXT:    v_mov_b32_e32 v16, 0
-; GFX942-VGPR-NEXT:    s_nop 7
-; GFX942-VGPR-NEXT:    s_nop 1
+; GFX942-VGPR-NEXT:    s_nop 9
 ; GFX942-VGPR-NEXT:    global_store_dwordx4 v16, v[12:15], s[16:17] offset:48
 ; GFX942-VGPR-NEXT:    global_store_dwordx4 v16, v[8:11], s[16:17] offset:32
 ; GFX942-VGPR-NEXT:    global_store_dwordx4 v16, v[4:7], s[16:17] offset:16
@@ -2343,8 +2308,7 @@ define amdgpu_kernel void @test_mfma_f32_16x16x16f16(ptr addrspace(1) %arg, ptr
 ; NOLIT-SRCC-NEXT:    v_mov_b32_e32 v3, s7
 ; NOLIT-SRCC-NEXT:    s_nop 1
 ; NOLIT-SRCC-NEXT:    v_mfma_f32_16x16x16f16 a[0:3], v[0:1], v[2:3], a[0:3] cbsz:1 abid:2 blgp:3
-; NOLIT-SRCC-NEXT:    s_nop 7
-; NOLIT-SRCC-NEXT:    s_nop 1
+; NOLIT-SRCC-NEXT:    s_nop 9
 ; NOLIT-SRCC-NEXT:    v_accvgpr_read_b32 v0, a0
 ; NOLIT-SRCC-NEXT:    v_accvgpr_read_b32 v1, a1
 ; NOLIT-SRCC-NEXT:    v_accvgpr_read_b32 v2, a2
@@ -2375,8 +2339,7 @@ define amdgpu_kernel void @test_mfma_f32_16x16x16f16(ptr addrspace(1) %arg, ptr
 ; LIT-SRCC-NEXT:    v_mov_b32_e32 v3, s7
 ; LIT-SRCC-NEXT:    s_nop 1
 ; LIT-SRCC-NEXT:    v_mfma_f32_16x16x16f16 a[0:3], v[0:1], v[2:3], a[0:3] cbsz:1 abid:2 blgp:3
-; LIT-SRCC-NEXT:    s_nop 7
-; LIT-SRCC-NEXT:    s_nop 1
+; LIT-SRCC-NEXT:    s_nop 9
 ; LIT-SRCC-NEXT:    v_accvgpr_read_b32 v0, a0
 ; LIT-SRCC-NEXT:    v_accvgpr_read_b32 v1, a1
 ; LIT-SRCC-NEXT:    v_accvgpr_read_b32 v2, a2
@@ -2403,8 +2366,7 @@ define amdgpu_kernel void @test_mfma_f32_16x16x16f16(ptr addrspace(1) %arg, ptr
 ; GFX90A-NEXT:    v_accvgpr_write_b32 a3, s11
 ; GFX90A-NEXT:    s_nop 1
 ; GFX90A-NEXT:    v_mfma_f32_16x16x16f16 a[0:3], v[2:3], v[4:5], a[0:3] cbsz:1 abid:2 blgp:3
-; GFX90A-NEXT:    s_nop 7
-; GFX90A-NEXT:    s_nop 2
+; GFX90A-NEXT:    s_nop 10
 ; GFX90A-NEXT:    global_store_dwordx4 v0, a[0:3], s[0:1]
 ; GFX90A-NEXT:    s_endpgm
 ;
@@ -2536,8 +2498,7 @@ define amdgpu_kernel void @test_mfma_i32_32x32x4i8(ptr addrspace(1) %arg) #0 {
 ; NOLIT-SRCC-NEXT:    v_mov_b32_e32 v0, 2
 ; NOLIT-SRCC-NEXT:    s_nop 1
 ; NOLIT-SRCC-NEXT:    v_mfma_i32_32x32x4i8 a[0:31], v3, v0, a[0:31] cbsz:1 abid:2 blgp:3
-; NOLIT-SRCC-NEXT:    s_nop 7
-; NOLIT-SRCC-NEXT:    s_nop 7
+; NOLIT-SRCC-NEXT:    s_nop 15
 ; NOLIT-SRCC-NEXT:    s_nop 1
 ; NOLIT-SRCC-NEXT:    v_accvgpr_read_b32 v15, a27
 ; NOLIT-SRCC-NEXT:    v_accvgpr_read_b32 v14, a26
@@ -2658,8 +2619,7 @@ define amdgpu_kernel void @test_mfma_i32_32x32x4i8(ptr addrspace(1) %arg) #0 {
 ; LIT-SRCC-NEXT:    v_mov_b32_e32 v0, 2
 ; LIT-SRCC-NEXT:    s_nop 1
 ; LIT-SRCC-NEXT:    v_mfma_i32_32x32x4i8 a[0:31], v3, v0, a[0:31] cbsz:1 abid:2 blgp:3
-; LIT-SRCC-NEXT:    s_nop 7
-; LIT-SRCC-NEXT:    s_nop 7
+; LIT-SRCC-NEXT:    s_nop 15
 ; LIT-SRCC-NEXT:    s_nop 1
 ; LIT-SRCC-NEXT:    v_accvgpr_read_b32 v15, a27
 ; LIT-SRCC-NEXT:    v_accvgpr_read_b32 v14, a26
@@ -2748,8 +2708,7 @@ define amdgpu_kernel void @test_mfma_i32_32x32x4i8(ptr addrspace(1) %arg) #0 {
 ; GFX90A-NEXT:    v_accvgpr_write_b32 a31, s15
 ; GFX90A-NEXT:    s_nop 1
 ; GFX90A-NEXT:    v_mfma_i32_32x32x4i8 a[0:31], v1, v2, a[0:31] cbsz:1 abid:2 blgp:3
-; GFX90A-NEXT:    s_nop 7
-; GFX90A-NEXT:    s_nop 7
+; GFX90A-NEXT:    s_nop 15
 ; GFX90A-NEXT:    s_nop 2
 ; GFX90A-NEXT:    global_store_dwordx4 v0, a[24:27], s[34:35] offset:96
 ; GFX90A-NEXT:    global_store_dwordx4 v0, a[28:31], s[34:35] offset:112
@@ -2805,8 +2764,7 @@ define amdgpu_kernel void @test_mfma_i32_32x32x4i8(ptr addrspace(1) %arg) #0 {
 ; GFX942-NEXT:    v_accvgpr_write_b32 a31, s15
 ; GFX942-NEXT:    s_nop 1
 ; GFX942-NEXT:    v_mfma_i32_32x32x4_2b_i8 a[0:31], v1, v2, a[0:31] cbsz:1 abid:2 blgp:3
-; GFX942-NEXT:    s_nop 7
-; GFX942-NEXT:    s_nop 7
+; GFX942-NEXT:    s_nop 15
 ; GFX942-NEXT:    s_nop 2
 ; GFX942-NEXT:    global_store_dwordx4 v0, a[24:27], s[34:35] offset:96
 ; GFX942-NEXT:    global_store_dwordx4 v0, a[28:31], s[34:35] offset:112
@@ -2862,8 +2820,7 @@ define amdgpu_kernel void @test_mfma_i32_32x32x4i8(ptr addrspace(1) %arg) #0 {
 ; GFX942-VGPR-NEXT:    v_mov_b32_e32 v31, s15
 ; GFX942-VGPR-NEXT:    s_nop 1
 ; GFX942-VGPR-NEXT:    v_mfma_i32_32x32x4_2b_i8 v[0:31], v33, v34, v[0:31] cbsz:1 abid:2 blgp:3
-; GFX942-VGPR-NEXT:    s_nop 7
-; GFX942-VGPR-NEXT:    s_nop 7
+; GFX942-VGPR-NEXT:    s_nop 15
 ; GFX942-VGPR-NEXT:    s_nop 2
 ; GFX942-VGPR-NEXT:    global_store_dwordx4 v32, v[24:27], s[34:35] offset:96
 ; GFX942-VGPR-NEXT:    global_store_dwordx4 v32, v[28:31], s[34:35] offset:112
@@ -2925,8 +2882,7 @@ define amdgpu_kernel void @test_mfma_i32_16x16x4i8(ptr addrspace(1) %arg) #0 {
 ; NOLIT-SRCC-NEXT:    v_mov_b32_e32 v1, 2
 ; NOLIT-SRCC-NEXT:    s_nop 1
 ; NOLIT-SRCC-NEXT:    v_mfma_i32_16x16x4i8 a[0:15], v0, v1, a[0:15] cbsz:1 abid:2 blgp:3
-; NOLIT-SRCC-NEXT:    s_nop 7
-; NOLIT-SRCC-NEXT:    s_nop 1
+; NOLIT-SRCC-NEXT:    s_nop 9
 ; NOLIT-SRCC-NEXT:    v_accvgpr_read_b32 v15, a15
 ; NOLIT-SRCC-NEXT:    v_accvgpr_read_b32 v14, a14
 ; NOLIT-SRCC-NEXT:    v_accvgpr_read_b32 v13, a13
@@ -2993,8 +2949,7 @@ define amdgpu_kernel void @test_mfma_i32_16x16x4i8(ptr addrspace(1) %arg) #0 {
 ; LIT-SRCC-NEXT:    v_mov_b32_e32 v1, 2
 ; LIT-SRCC-NEXT:    s_nop 1
 ; LIT-SRCC-NEXT:    v_mfma_i32_16x16x4i8 a[0:15], v0, v1, a[0:15] cbsz:1 abid:2 blgp:3
-; LIT-SRCC-NEXT:    s_nop 7
-; LIT-SRCC-NEXT:    s_nop 1
+; LIT-SRCC-NEXT:    s_nop 9
 ; LIT-SRCC-NEXT:    v_accvgpr_read_b32 v15, a15
 ; LIT-SRCC-NEXT:    v_accvgpr_read_b32 v14, a14
 ; LIT-SRCC-NEXT:    v_accvgpr_read_b32 v13, a13
@@ -3045,8 +3000,7 @@ define amdgpu_kernel void @test_mfma_i32_16x16x4i8(ptr addrspace(1) %arg) #0 {
 ; GFX90A-NEXT:    s_nop 1
 ; GFX90A-NEXT:    v_mfma_i32_16x16x4i8 a[0:15], v0, v1, a[0:15] cbsz:1 abid:2 blgp:3
 ; GFX90A-NEXT:    v_mov_b32_e32 v0, 0
-; GFX90A-NEXT:    s_nop 7
-; GFX90A-NEXT:    s_nop 1
+; GFX90A-NEXT:    s_nop 9
 ; GFX90A-NEXT:    global_store_dwordx4 v0, a[12:15], s[16:17] offset:48
 ; GFX90A-NEXT:    global_store_dwordx4 v0, a[8:11], s[16:17] offset:32
 ; GFX90A-NEXT:    global_store_dwordx4 v0, a[4:7], s[16:17] offset:16
@@ -3080,8 +3034,7 @@ define amdgpu_kernel void @test_mfma_i32_16x16x4i8(ptr addrspace(1) %arg) #0 {
 ; GFX942-NEXT:    s_nop 1
 ; GFX942-NEXT:    v_mfma_i32_16x16x4_4b_i8 a[0:15], v0, v1, a[0:15] cbsz:1 abid:2 blgp:3
 ; GFX942-NEXT:    v_mov_b32_e32 v0, 0
-; GFX942-NEXT:    s_nop 7
-; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    s_nop 9
 ; GFX942-NEXT:    global_store_dwordx4 v0, a[12:15], s[16:17] offset:48
 ; GFX942-NEXT:    global_store_dwordx4 v0, a[8:11], s[16:17] offset:32
 ; GFX942-NEXT:    global_store_dwordx4 v0, a[4:7], s[16:17] offset:16
@@ -3107,8 +3060,7 @@ define amdgpu_kernel void @test_mfma_i32_16x16x4i8(ptr addrspace(1) %arg) #0 {
 ; GFX942-VGPR-NEXT:    s_nop 1
 ; GFX942-VGPR-NEXT:    v_mfma_i32_16x16x4_4b_i8 v[0:15], v16, v17, v[0:15] cbsz:1 abid:2 blgp:3
 ; GFX942-VGPR-NEXT:    v_mov_b32_e32 v16, 0
-; GFX942-VGPR-NEXT:    s_nop 7
-; GFX942-VGPR-NEXT:    s_nop 1
+; GFX942-VGPR-NEXT:    s_nop 9
 ; GFX942-VGPR-NEXT:    global_store_dwordx4 v16, v[12:15], s[16:17] offset:48
 ; GFX942-VGPR-NEXT:    global_store_dwordx4 v16, v[8:11], s[16:17] offset:32
 ; GFX942-VGPR-NEXT:    global_store_dwordx4 v16, v[4:7], s[16:17] offset:16
@@ -3145,8 +3097,7 @@ define amdgpu_kernel void @test_mfma_i32_16x16x4i8_splatimm_src2_64(ptr addrspac
 ; NOLIT-SRCC-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
 ; NOLIT-SRCC-NEXT:    v_mov_b32_e32 v16, 0
 ; NOLIT-SRCC-NEXT:    v_mfma_i32_16x16x4i8 a[0:15], v0, v1, a[0:15] cbsz:1 abid:2 blgp:3
-; NOLIT-SRCC-NEXT:    s_nop 7
-; NOLIT-SRCC-NEXT:    s_nop 1
+; NOLIT-SRCC-NEXT:    s_nop 9
 ; NOLIT-SRCC-NEXT:    v_accvgpr_read_b32 v15, a15
 ; NOLIT-SRCC-NEXT:    v_accvgpr_read_b32 v14, a14
 ; NOLIT-SRCC-NEXT:    v_accvgpr_read_b32 v13, a13
@@ -3177,8 +3128,7 @@ define amdgpu_kernel void @test_mfma_i32_16x16x4i8_splatimm_src2_64(ptr addrspac
 ; LIT-SRCC-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
 ; LIT-SRCC-NEXT:    v_mov_b32_e32 v16, 0
 ; LIT-SRCC-NEXT:    v_mfma_i32_16x16x4i8 a[0:15], v0, v1, 64 cbsz:1 abid:2 blgp:3
-; LIT-SRCC-NEXT:    s_nop 7
-; LIT-SRCC-NEXT:    s_nop 1
+; LIT-SRCC-NEXT:    s_nop 9
 ; LIT-SRCC-NEXT:    v_accvgpr_read_b32 v15, a15
 ; LIT-SRCC-NEXT:    v_accvgpr_read_b32 v14, a14
 ; LIT-SRCC-NEXT:    v_accvgpr_read_b32 v13, a13
@@ -3211,8 +3161,7 @@ define amdgpu_kernel void @test_mfma_i32_16x16x4i8_splatimm_src2_64(ptr addrspac
 ; GFX90A-NEXT:    v_mfma_i32_16x16x4i8 a[0:15], v0, v1, 64 cbsz:1 abid:2 blgp:3
 ; GFX90A-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT:    s_nop 7
-; GFX90A-NEXT:    s_nop 0
+; GFX90A-NEXT:    s_nop 8
 ; GFX90A-NEXT:    global_store_dwordx4 v0, a[12:15], s[0:1] offset:48
 ; GFX90A-NEXT:    global_store_dwordx4 v0, a[8:11], s[0:1] offset:32
 ; GFX90A-NEXT:    global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
@@ -3228,8 +3177,7 @@ define amdgpu_kernel void @test_mfma_i32_16x16x4i8_splatimm_src2_64(ptr addrspac
 ; GFX942-NEXT:    v_mfma_i32_16x16x4_4b_i8 a[0:15], v0, v1, 64 cbsz:1 abid:2 blgp:3
 ; GFX942-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX942-NEXT:    s_nop 7
-; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    s_nop 8
 ; GFX942-NEXT:    global_store_dwordx4 v0, a[12:15], s[0:1] offset:48
 ; GFX942-NEXT:    global_store_dwordx4 v0, a[8:11], s[0:1] offset:32
 ; GFX942-NEXT:    global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
@@ -3244,8 +3192,7 @@ define amdgpu_kernel void @test_mfma_i32_16x16x4i8_splatimm_src2_64(ptr addrspac
 ; GFX942-VGPR-NEXT:    v_mov_b32_e32 v16, 0
 ; GFX942-VGPR-NEXT:    v_mfma_i32_16x16x4_4b_i8 v[0:15], v0, v1, 64 cbsz:1 abid:2 blgp:3
 ; GFX942-VGPR-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX942-VGPR-NEXT:    s_nop 7
-; GFX942-VGPR-NEXT:    s_nop 1
+; GFX942-VGPR-NEXT:    s_nop 9
 ; GFX942-VGPR-NEXT:    global_store_dwordx4 v16, v[12:15], s[0:1] offset:48
 ; GFX942-VGPR-NEXT:    global_store_dwordx4 v16, v[8:11], s[0:1] offset:32
 ; GFX942-VGPR-NEXT:    global_store_dwordx4 v16, v[4:7], s[0:1] offset:16
@@ -3645,8 +3592,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32_forward_acc(ptr addrspace(1)
 ; NOLIT-SRCC-NEXT:    s_nop 0
 ; NOLIT-SRCC-NEXT:    v_mfma_f32_32x32x1f32 a[0:31], v3, v0, a[0:31]
 ; NOLIT-SRCC-NEXT:    v_mfma_f32_32x32x1f32 a[0:31], v3, v0, a[0:31]
-; NOLIT-SRCC-NEXT:    s_nop 7
-; NOLIT-SRCC-NEXT:    s_nop 7
+; NOLIT-SRCC-NEXT:    s_nop 15
 ; NOLIT-SRCC-NEXT:    s_nop 1
 ; NOLIT-SRCC-NEXT:    v_accvgpr_read_b32 v3, a27
 ; NOLIT-SRCC-NEXT:    v_accvgpr_read_b32 v2, a26
@@ -3782,8 +3728,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32_forward_acc(ptr addrspace(1)
 ; LIT-SRCC-NEXT:    s_nop 0
 ; LIT-SRCC-NEXT:    v_mfma_f32_32x32x1f32 a[0:31], v3, v0, a[0:31]
 ; LIT-SRCC-NEXT:    v_mfma_f32_32x32x1f32 a[0:31], v3, v0, a[0:31]
-; LIT-SRCC-NEXT:    s_nop 7
-; LIT-SRCC-NEXT:    s_nop 7
+; LIT-SRCC-NEXT:    s_nop 15
 ; LIT-SRCC-NEXT:    s_nop 1
 ; LIT-SRCC-NEXT:    v_accvgpr_read_b32 v3, a27
 ; LIT-SRCC-NEXT:    v_accvgpr_read_b32 v2, a26
@@ -3887,8 +3832,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32_forward_acc(ptr addrspace(1)
 ; GFX90A-NEXT:    v_mfma_f32_32x32x1f32 a[0:31], v0, v1, a[0:31]
 ; GFX90A-NEXT:    v_mfma_f32_32x32x1f32 a[0:31], v0, v1, a[0:31]
 ; GFX90A-NEXT:    v_mov_b32_e32 v0, 0
-; GFX90A-NEXT:    s_nop 7
-; GFX90A-NEXT:    s_nop 7
+; GFX90A-NEXT:    s_nop 15
 ; GFX90A-NEXT:    s_nop 1
 ; GFX90A-NEXT:    global_store_dwordx4 v0, a[24:27], s[34:35] offset:96
 ; GFX90A-NEXT:    global_store_dwordx4 v0, a[28:31], s[34:35] offset:112
@@ -3945,8 +3889,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32_forward_acc(ptr addrspace(1)
 ; GFX942-NEXT:    v_mfma_f32_32x32x1_2b_f32 a[0:31], v0, v1, a[0:31]
 ; GFX942-NEXT:    v_mfma_f32_32x32x1_2b_f32 a[0:31], v0, v1, a[0:31]
 ; GFX942-NEXT:    v_mov_b32_e32 v0, 0
-; GFX942-NEXT:    s_nop 7
-; GFX942-NEXT:    s_nop 7
+; GFX942-NEXT:    s_nop 15
 ; GFX942-NEXT:    s_nop 0
 ; GFX942-NEXT:    global_store_dwordx4 v0, a[24:27], s[34:35] offset:96
 ; GFX942-NEXT:    global_store_dwordx4 v0, a[28:31], s[34:35] offset:112
@@ -4003,8 +3946,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32_forward_acc(ptr addrspace(1)
 ; GFX942-VGPR-NEXT:    v_mfma_f32_32x32x1_2b_f32 v[0:31], v32, v33, v[0:31]
 ; GFX942-VGPR-NEXT:    v_mfma_f32_32x32x1_2b_f32 v[0:31], v32, v33, v[0:31]
 ; GFX942-VGPR-NEXT:    v_mov_b32_e32 v32, 0
-; GFX942-VGPR-NEXT:    s_nop 7
-; GFX942-VGPR-NEXT:    s_nop 7
+; GFX942-VGPR-NEXT:    s_nop 15
 ; GFX942-VGPR-NEXT:    s_nop 0
 ; GFX942-VGPR-NEXT:    global_store_dwordx4 v32, v[24:27], s[34:35] offset:96
 ; GFX942-VGPR-NEXT:    global_store_dwordx4 v32, v[28:31], s[34:35] offset:112
@@ -4068,8 +4010,7 @@ define amdgpu_kernel void @test_mfma_f32_16x16x1f32_forward_acc(ptr addrspace(1)
 ; NOLIT-SRCC-NEXT:    s_nop 1
 ; NOLIT-SRCC-NEXT:    v_mfma_f32_16x16x1f32 a[0:15], v0, v1, a[0:15]
 ; NOLIT-SRCC-NEXT:    v_mfma_f32_16x16x1f32 a[0:15], v0, v1, a[0:15]
-; NOLIT-SRCC-NEXT:    s_nop 7
-; NOLIT-SRCC-NEXT:    s_nop 1
+; NOLIT-SRCC-NEXT:    s_nop 9
 ; NOLIT-SRCC-NEXT:    v_accvgpr_read_b32 v3, a15
 ; NOLIT-SRCC-NEXT:    v_accvgpr_read_b32 v2, a14
 ; NOLIT-SRCC-NEXT:    v_accvgpr_read_b32 v1, a13
@@ -4136,8 +4077,7 @@ define amdgpu_kernel void @test_mfma_f32_16x16x1f32_forward_acc(ptr addrspace(1)
 ; LIT-SRCC-NEXT:    s_nop 1
 ; LIT-SRCC-NEXT:    v_mfma_f32_16x16x1f32 a[0:15], v0, v1, a[0:15]
 ; LIT-SRCC-NEXT:    v_mfma_f32_16x16x1f32 a[0:15], v0, v1, a[0:15]
-; LIT-SRCC-NEXT:    s_nop 7
-; LIT-SRCC-NEXT:    s_nop 1
+; LIT-SRCC-NEXT:    s_nop 9
 ; LIT-SRCC-NEXT:    v_accvgpr_read_b32 v3, a15
 ; LIT-SRCC-NEXT:    v_accvgpr_read_b32 v2, a14
 ; LIT-SRCC-NEXT:    v_accvgpr_read_b32 v1, a13
@@ -4188,8 +4128,7 @@ define amdgpu_kernel void @test_mfma_f32_16x16x1f32_forward_acc(ptr addrspace(1)
 ; GFX90A-NEXT:    v_mfma_f32_16x16x1f32 a[0:15], v0, v1, a[0:15]
 ; GFX90A-NEXT:    v_mfma_f32_16x16x1f32 a[0:15], v0, v1, a[0:15]
 ; GFX90A-NEXT:    v_mov_b32_e32 v0, 0
-; GFX90A-NEXT:    s_nop 7
-; GFX90A-NEXT:    s_nop 1
+; GFX90A-NEXT:    s_nop 9
 ; GFX90A-NEXT:    global_store_dwordx4 v0, a[12:15], s[16:17] offset:48
 ; GFX90A-NEXT:    global_store_dwordx4 v0, a[8:11], s[16:17] offset:32
 ; GFX90A-NEXT:    global_store_dwordx4 v0, a[4:7], s[16:17] offset:16
@@ -4224,8 +4163,7 @@ define amdgpu_kernel void @test_mfma_f32_16x16x1f32_forward_acc(ptr addrspace(1)
 ; GFX942-NEXT:    v_mfma_f32_16x16x1_4b_f32 a[0:15], v0, v1, a[0:15]
 ; GFX942-NEXT:    v_mfma_f32_16x16x1_4b_f32 a[0:15], v0, v1, a[0:15]
 ; GFX942-NEXT:    v_mov_b32_e32 v0, 0
-; GFX942-NEXT:    s_nop 7
-; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    s_nop 8
 ; GFX942-NEXT:    global_store_dwordx4 v0, a[12:15], s[16:17] offset:48
 ; GFX942-NEXT:    global_store_dwordx4 v0, a[8:11], s[16:17] offset:32
 ; GFX942-NEXT:    global_store_dwordx4 v0, a[4:7], s[16:17] offset:16
@@ -4252,8 +4190,7 @@ define amdgpu_kernel void @test_mfma_f32_16x16x1f32_forward_acc(ptr addrspace(1)
 ; GFX942-VGPR-NEXT:    v_mfma_f32_16x16x1_4b_f32 v[0:15], v16, v17, v[0:15]
 ; GFX942-VGPR-NEXT:    v_mfma_f32_16x16x1_4b_f32 v[0:15], v16, v17, v[0:15]
 ; GFX942-VGPR-NEXT:    v_mov_b32_e32 v16, 0
-; GFX942-VGPR-NEXT:    s_nop 7
-; GFX942-VGPR-NEXT:    s_nop 0
+; GFX942-VGPR-NEXT:    s_nop 8
 ; GFX942-VGPR-NEXT:    global_store_dwordx4 v16, v[12:15], s[16:17] offset:48
 ; GFX942-VGPR-NEXT:    global_store_dwordx4 v16, v[8:11], s[16:17] offset:32
 ; GFX942-VGPR-NEXT:    global_store_dwordx4 v16, v[4:7], s[16:17] offset:16
@@ -4502,8 +4439,7 @@ define amdgpu_kernel void @test_mfma_f32_16x16x1f32_imm_splat(ptr addrspace(1) %
 ; NOLIT-SRCC-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
 ; NOLIT-SRCC-NEXT:    v_mov_b32_e32 v4, 0
 ; NOLIT-SRCC-NEXT:    v_mfma_f32_16x16x1f32 a[0:15], v0, v1, a[0:15]
-; NOLIT-SRCC-NEXT:    s_nop 7
-; NOLIT-SRCC-NEXT:    s_nop 1
+; NOLIT-SRCC-NEXT:    s_nop 9
 ; NOLIT-SRCC-NEXT:    v_accvgpr_read_b32 v3, a15
 ; NOLIT-SRCC-NEXT:    v_accvgpr_read_b32 v2, a14
 ; NOLIT-SRCC-NEXT:    v_accvgpr_read_b32 v1, a13
@@ -4541,8 +4477,7 @@ define amdgpu_kernel void @test_mfma_f32_16x16x1f32_imm_splat(ptr addrspace(1) %
 ; LIT-SRCC-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
 ; LIT-SRCC-NEXT:    v_mov_b32_e32 v8, 0
 ; LIT-SRCC-NEXT:    v_mfma_f32_16x16x1f32 a[0:15], v0, v1, 1.0
-; LIT-SRCC-NEXT:    s_nop 7
-; LIT-SRCC-NEXT:    s_nop 1
+; LIT-SRCC-NEXT:    s_nop 9
 ; LIT-SRCC-NEXT:    v_accvgpr_read_b32 v3, a15
 ; LIT-SRCC-NEXT:    v_accvgpr_read_b32 v2, a14
 ; LIT-SRCC-NEXT:    v_accvgpr_read_b32 v1, a13
@@ -4578,8 +4513,7 @@ define amdgpu_kernel void @test_mfma_f32_16x16x1f32_imm_splat(ptr addrspace(1) %
 ; GFX90A-NEXT:    v_mfma_f32_16x16x1f32 a[0:15], v0, v1, 1.0
 ; GFX90A-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT:    s_nop 7
-; GFX90A-NEXT:    s_nop 0
+; GFX90A-NEXT:    s_nop 8
 ; GFX90A-NEXT:    global_store_dwordx4 v0, a[12:15], s[0:1] offset:48
 ; GFX90A-NEXT:    global_store_dwordx4 v0, a[8:11], s[0:1] offset:32
 ; GFX90A-NEXT:    global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
@@ -4610,8 +4544,7 @@ define amdgpu_kernel void @test_mfma_f32_16x16x1f32_imm_splat(ptr addrspace(1) %
 ; GFX942-VGPR-NEXT:    v_mov_b32_e32 v16, 0
 ; GFX942-VGPR-NEXT:    v_mfma_f32_16x16x1_4b_f32 v[0:15], v0, v1, 1.0
 ; GFX942-VGPR-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX942-VGPR-NEXT:    s_nop 7
-; GFX942-VGPR-NEXT:    s_nop 0
+; GFX942-VGPR-NEXT:    s_nop 8
 ; GFX942-VGPR-NEXT:    global_store_dwordx4 v16, v[12:15], s[0:1] offset:48
 ; GFX942-VGPR-NEXT:    global_store_dwordx4 v16, v[8:11], s[0:1] offset:32
 ; GFX942-VGPR-NEXT:    global_store_dwordx4 v16, v[4:7], s[0:1] offset:16
@@ -4649,8 +4582,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x8f16_imm_splat(ptr addrspace(1) %
 ; NOLIT-SRCC-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
 ; NOLIT-SRCC-NEXT:    v_mov_b32_e32 v4, 0
 ; NOLIT-SRCC-NEXT:    v_mfma_f32_32x32x8f16 a[0:15], v[0:1], v[2:3], a[0:15]
-; NOLIT-SRCC-NEXT:    s_nop 7
-; NOLIT-SRCC-NEXT:    s_nop 7
+; NOLIT-SRCC-NEXT:    s_nop 15
 ; NOLIT-SRCC-NEXT:    s_nop 1
 ; NOLIT-SRCC-NEXT:    v_accvgpr_read_b32 v3, a15
 ; NOLIT-SRCC-NEXT:    v_accvgpr_read_b32 v2, a14
@@ -4691,8 +4623,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x8f16_imm_splat(ptr addrspace(1) %
 ; LIT-SRCC-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
 ; LIT-SRCC-NEXT:    v_mov_b32_e32 v13, 0
 ; LIT-SRCC-NEXT:    v_mfma_f32_32x32x8f16 a[0:15], v[0:1], v[2:3], 1.0
-; LIT-SRCC-NEXT:    s_nop 7
-; LIT-SRCC-NEXT:    s_nop 7
+; LIT-SRCC-NEXT:    s_nop 15
 ; LIT-SRCC-NEXT:    s_nop 1
 ; LIT-SRCC-NEXT:    v_accvgpr_read_b32 v3, a15
 ; LIT-SRCC-NEXT:    v_accvgpr_read_b32 v2, a14
@@ -4730,8 +4661,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x8f16_imm_splat(ptr addrspace(1) %
 ; GFX90A-NEXT:    v_mfma_f32_32x32x8f16 a[0:15], v[0:1], v[2:3], 1.0
 ; GFX90A-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT:    s_nop 7
-; GFX90A-NEXT:    s_nop 7
+; GFX90A-NEXT:    s_nop 15
 ; GFX90A-NEXT:    s_nop 0
 ; GFX90A-NEXT:    global_store_dwordx4 v0, a[12:15], s[0:1] offset:48
 ; GFX90A-NEXT:    global_store_dwordx4 v0, a[8:11], s[0:1] offset:32
@@ -4750,8 +4680,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x8f16_imm_splat(ptr addrspace(1) %
 ; GFX942-NEXT:    v_mfma_f32_32x32x8_f16 a[0:15], v[0:1], v[2:3], 1.0
 ; GFX942-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX942-NEXT:    s_nop 7
-; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    s_nop 8
 ; GFX942-NEXT:    global_store_dwordx4 v0, a[12:15], s[0:1] offset:48
 ; GFX942-NEXT:    global_store_dwordx4 v0, a[8:11], s[0:1] offset:32
 ; GFX942-NEXT:    global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
@@ -4768,8 +4697,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x8f16_imm_splat(ptr addrspace(1) %
 ; GFX942-VGPR-NEXT:    v_mov_b32_e32 v16, 0
 ; GFX942-VGPR-NEXT:    v_mfma_f32_32x32x8_f16 v[0:15], v[0:1], v[2:3], 1.0
 ; GFX942-VGPR-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX942-VGPR-NEXT:    s_nop 7
-; GFX942-VGPR-NEXT:    s_nop 1
+; GFX942-VGPR-NEXT:    s_nop 9
 ; GFX942-VGPR-NEXT:    global_store_dwordx4 v16, v[12:15], s[0:1] offset:48
 ; GFX942-VGPR-NEXT:    global_store_dwordx4 v16, v[8:11], s[0:1] offset:32
 ; GFX942-VGPR-NEXT:    global_store_dwordx4 v16, v[4:7], s[0:1] offset:16
@@ -4821,8 +4749,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32_imm_splat(ptr addrspace(1) %
 ; NOLIT-SRCC-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
 ; NOLIT-SRCC-NEXT:    v_mov_b32_e32 v4, 0
 ; NOLIT-SRCC-NEXT:    v_mfma_f32_32x32x1f32 a[0:31], v0, v1, a[0:31]
-; NOLIT-SRCC-NEXT:    s_nop 7
-; NOLIT-SRCC-NEXT:    s_nop 7
+; NOLIT-SRCC-NEXT:    s_nop 15
 ; NOLIT-SRCC-NEXT:    s_nop 1
 ; NOLIT-SRCC-NEXT:    v_accvgpr_read_b32 v3, a31
 ; NOLIT-SRCC-NEXT:    v_accvgpr_read_b32 v2, a30
@@ -4889,8 +4816,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32_imm_splat(ptr addrspace(1) %
 ; LIT-SRCC-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
 ; LIT-SRCC-NEXT:    v_mov_b32_e32 v14, 0
 ; LIT-SRCC-NEXT:    v_mfma_f32_32x32x1f32 a[0:31], v0, v1, 0
-; LIT-SRCC-NEXT:    s_nop 7
-; LIT-SRCC-NEXT:    s_nop 7
+; LIT-SRCC-NEXT:    s_nop 15
 ; LIT-SRCC-NEXT:    s_nop 1
 ; LIT-SRCC-NEXT:    v_accvgpr_read_b32 v3, a31
 ; LIT-SRCC-NEXT:    v_accvgpr_read_b32 v2, a30
@@ -4948,8 +4874,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32_imm_splat(ptr addrspace(1) %
 ; GFX90A-NEXT:    v_mfma_f32_32x32x1f32 a[0:31], v0, v1, 0
 ; GFX90A-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT:    s_nop 7
-; GFX90A-NEXT:    s_nop 7
+; GFX90A-NEXT:    s_nop 15
 ; GFX90A-NEXT:    s_nop 0
 ; GFX90A-NEXT:    global_store_dwordx4 v0, a[28:31], s[0:1] offset:112
 ; GFX90A-NEXT:    global_store_dwordx4 v0, a[24:27], s[0:1] offset:96
@@ -4970,8 +4895,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32_imm_splat(ptr addrspace(1) %
 ; GFX942-NEXT:    v_mfma_f32_32x32x1_2b_f32 a[0:31], v0, v1, 0
 ; GFX942-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX942-NEXT:    s_nop 7
-; GFX942-NEXT:    s_nop 7
+; GFX942-NEXT:    s_nop 15
 ; GFX942-NEXT:    global_store_dwordx4 v0, a[28:31], s[0:1] offset:112
 ; GFX942-NEXT:    global_store_dwordx4 v0, a[24:27], s[0:1] offset:96
 ; GFX942-NEXT:    global_store_dwordx4 v0, a[20:23], s[0:1] offset:80
@@ -4990,8 +4914,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32_imm_splat(ptr addrspace(1) %
 ; GFX942-VGPR-NEXT:    v_mov_b32_e32 v32, 0
 ; GFX942-VGPR-NEXT:    v_mfma_f32_32x32x1_2b_f32 v[0:31], v0, v1, 0
 ; GFX942-VGPR-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX942-VGPR-NEXT:    s_nop 7
-; GFX942-VGPR-NEXT:    s_nop 7
+; GFX942-VGPR-NEXT:    s_nop 15
 ; GFX942-VGPR-NEXT:    s_nop 0
 ; GFX942-VGPR-NEXT:    global_store_dwordx4 v32, v[28:31], s[0:1] offset:112
 ; GFX942-VGPR-NEXT:    global_store_dwordx4 v32, v[24:27], s[0:1] offset:96
@@ -5131,8 +5054,7 @@ define amdgpu_kernel void @test_mfma_f32_16x16x1f32_imm(ptr addrspace(1) %arg) #
 ; NOLIT-SRCC-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
 ; NOLIT-SRCC-NEXT:    v_mov_b32_e32 v4, 0
 ; NOLIT-SRCC-NEXT:    v_mfma_f32_16x16x1f32 a[0:15], v0, v1, a[0:15]
-; NOLIT-SRCC-NEXT:    s_nop 7
-; NOLIT-SRCC-NEXT:    s_nop 1
+; NOLIT-SRCC-NEXT:    s_nop 9
 ; NOLIT-SRCC-NEXT:    v_accvgpr_read_b32 v3, a15
 ; NOLIT-SRCC-NEXT:    v_accvgpr_read_b32 v2, a14
 ; NOLIT-SRCC-NEXT:    v_accvgpr_read_b32 v1, a13
@@ -5186,8 +5108,7 @@ define amdgpu_kernel void @test_mfma_f32_16x16x1f32_imm(ptr addrspace(1) %arg) #
 ; LIT-SRCC-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
 ; LIT-SRCC-NEXT:    v_mov_b32_e32 v4, 0
 ; LIT-SRCC-NEXT:    v_mfma_f32_16x16x1f32 a[0:15], v0, v1, a[0:15]
-; LIT-SRCC-NEXT:    s_nop 7
-; LIT-SRCC-NEXT:    s_nop 1
+; LIT-SRCC-NEXT:    s_nop 9
 ; LIT-SRCC-NEXT:    v_accvgpr_read_b32 v3, a15
 ; LIT-SRCC-NEXT:    v_accvgpr_read_b32 v2, a14
 ; LIT-SRCC-NEXT:    v_accvgpr_read_b32 v1, a13
@@ -5242,8 +5163,7 @@ define amdgpu_kernel void @test_mfma_f32_16x16x1f32_imm(ptr addrspace(1) %arg) #
 ; GFX90A-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX90A-NEXT:    v_mfma_f32_16x16x1f32 a[0:15], v1, v2, a[0:15]
 ; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT:    s_nop 7
-; GFX90A-NEXT:    s_nop 1
+; GFX90A-NEXT:    s_nop 9
 ; GFX90A-NEXT:    global_store_dwordx4 v0, a[12:15], s[0:1] offset:48
 ; GFX90A-NEXT:    global_store_dwordx4 v0, a[8:11], s[0:1] offset:32
 ; GFX90A-NEXT:    global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
@@ -5274,8 +5194,7 @@ define amdgpu_kernel void @test_mfma_f32_16x16x1f32_imm(ptr addrspace(1) %arg) #
 ; GFX942-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX942-NEXT:    v_mfma_f32_16x16x1_4b_f32 a[0:15], v1, v2, a[0:15]
 ; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX942-NEXT:    s_nop 7
-; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    s_nop 8
 ; GFX942-NEXT:    global_store_dwordx4 v0, a[12:15], s[0:1] offset:48
 ; GFX942-NEXT:    global_store_dwordx4 v0, a[8:11], s[0:1] offset:32
 ; GFX942-NEXT:    global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
@@ -5304,8 +5223,7 @@ define amdgpu_kernel void @test_mfma_f32_16x16x1f32_imm(ptr addrspace(1) %arg) #
 ; GFX942-VGPR-NEXT:    v_mov_b32_e32 v16, 0
 ; GFX942-VGPR-NEXT:    v_mfma_f32_16x16x1_4b_f32 v[0:15], v0, v15, v[0:15]
 ; GFX942-VGPR-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX942-VGPR-NEXT:    s_nop 7
-; GFX942-VGPR-NEXT:    s_nop 0
+; GFX942-VGPR-NEXT:    s_nop 8
 ; GFX942-VGPR-NEXT:    global_store_dwordx4 v16, v[12:15], s[0:1] offset:48
 ; GFX942-VGPR-NEXT:    global_store_dwordx4 v16, v[8:11], s[0:1] offset:32
 ; GFX942-VGPR-NEXT:    global_store_dwordx4 v16, v[4:7], s[0:1] offset:16
@@ -5357,8 +5275,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32_imm(ptr addrspace(1) %arg) #
 ; NOLIT-SRCC-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
 ; NOLIT-SRCC-NEXT:    v_mov_b32_e32 v4, 0
 ; NOLIT-SRCC-NEXT:    v_mfma_f32_32x32x1f32 a[0:31], v0, v1, a[0:31]
-; NOLIT-SRCC-NEXT:    s_nop 7
-; NOLIT-SRCC-NEXT:    s_nop 7
+; NOLIT-SRCC-NEXT:    s_nop 15
 ; NOLIT-SRCC-NEXT:    s_nop 1
 ; NOLIT-SRCC-NEXT:    v_accvgpr_read_b32 v3, a31
 ; NOLIT-SRCC-NEXT:    v_accvgpr_read_b32 v2, a30
@@ -5457,8 +5374,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32_imm(ptr addrspace(1) %arg) #
 ; LIT-SRCC-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
 ; LIT-SRCC-NEXT:    v_mov_b32_e32 v4, 0
 ; LIT-SRCC-NEXT:    v_mfma_f32_32x32x1f32 a[0:31], v0, v1, a[0:31]
-; LIT-SRCC-NEXT:    s_nop 7
-; LIT-SRCC-NEXT:    s_nop 7
+; LIT-SRCC-NEXT:    s_nop 15
 ; LIT-SRCC-NEXT:    s_nop 1
 ; LIT-SRCC-NEXT:    v_accvgpr_read_b32 v3, a31
 ; LIT-SRCC-NEXT:    v_accvgpr_read_b32 v2, a30
@@ -5558,8 +5474,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32_imm(ptr addrspace(1) %arg) #
 ; GFX90A-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX90A-NEXT:    v_mfma_f32_32x32x1f32 a[0:31], v1, v2, a[0:31]
 ; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT:    s_nop 7
-; GFX90A-NEXT:    s_nop 7
+; GFX90A-NEXT:    s_nop 15
 ; GFX90A-NEXT:    s_nop 1
 ; GFX90A-NEXT:    global_store_dwordx4 v0, a[28:31], s[0:1] offset:112
 ; GFX90A-NEXT:    global_store_dwordx4 v0, a[24:27], s[0:1] offset:96
@@ -5611,8 +5526,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32_imm(ptr addrspace(1) %arg) #
 ; GFX942-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX942-NEXT:    v_mfma_f32_32x32x1_2b_f32 a[0:31], v1, v2, a[0:31]
 ; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX942-NEXT:    s_nop 7
-; GFX942-NEXT:    s_nop 7
+; GFX942-NEXT:    s_nop 15
 ; GFX942-NEXT:    s_nop 0
 ; GFX942-NEXT:    global_store_dwordx4 v0, a[28:31], s[0:1] offset:112
 ; GFX942-NEXT:    global_store_dwordx4 v0, a[24:27], s[0:1] offset:96
@@ -5679,8 +5593,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32_imm(ptr addrspace(1) %arg) #
 ; GFX942-VGPR-NEXT:    s_nop 0
 ; GFX942-VGPR-NEXT:    v_mfma_f32_32x32x1_2b_f32 v[2:33], v0, v34, v[2:33]
 ; GFX942-VGPR-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX942-VGPR-NEXT:    s_nop 7
-; GFX942-VGPR-NEXT:    s_nop 7
+; GFX942-VGPR-NEXT:    s_nop 15
 ; GFX942-VGPR-NEXT:    s_nop 0
 ; GFX942-VGPR-NEXT:    global_store_dwordx4 v1, v[30:33], s[0:1] offset:112
 ; GFX942-VGPR-NEXT:    global_store_dwordx4 v1, v[26:29], s[0:1] offset:96
@@ -5965,8 +5878,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32_vecarg(ptr addrspace(1) %arg
 ; NOLIT-SRCC-NEXT:    v_mov_b32_e32 v1, 2.0
 ; NOLIT-SRCC-NEXT:    s_nop 1
 ; NOLIT-SRCC-NEXT:    v_mfma_f32_32x32x1f32 a[0:31], v0, v1, a[0:31] cbsz:1 abid:2 blgp:3
-; NOLIT-SRCC-NEXT:    s_nop 7
-; NOLIT-SRCC-NEXT:    s_nop 7
+; NOLIT-SRCC-NEXT:    s_nop 15
 ; NOLIT-SRCC-NEXT:    s_nop 1
 ; NOLIT-SRCC-NEXT:    v_accvgpr_read_b32 v3, a27
 ; NOLIT-SRCC-NEXT:    v_accvgpr_read_b32 v2, a26
@@ -6061,8 +5973,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32_vecarg(ptr addrspace(1) %arg
 ; LIT-SRCC-NEXT:    v_mov_b32_e32 v1, 2.0
 ; LIT-SRCC-NEXT:    s_nop 1
 ; LIT-SRCC-NEXT:    v_mfma_f32_32x32x1f32 a[0:31], v0, v1, a[0:31] cbsz:1 abid:2 blgp:3
-; LIT-SRCC-NEXT:    s_nop 7
-; LIT-SRCC-NEXT:    s_nop 7
+; LIT-SRCC-NEXT:    s_nop 15
 ; LIT-SRCC-NEXT:    s_nop 1
 ; LIT-SRCC-NEXT:    v_accvgpr_read_b32 v3, a27
 ; LIT-SRCC-NEXT:    v_accvgpr_read_b32 v2, a26
@@ -6125,8 +6036,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32_vecarg(ptr addrspace(1) %arg
 ; GFX90A-NEXT:    global_load_dwordx4 a[0:3], v0, s[0:1]
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    v_mfma_f32_32x32x1f32 a[0:31], v1, v2, a[0:31] cbsz:1 abid:2 blgp:3
-; GFX90A-NEXT:    s_nop 7
-; GFX90A-NEXT:    s_nop 7
+; GFX90A-NEXT:    s_nop 15
 ; GFX90A-NEXT:    s_nop 2
 ; GFX90A-NEXT:    global_store_dwordx4 v0, a[24:27], s[0:1] offset:96
 ; GFX90A-NEXT:    global_store_dwordx4 v0, a[28:31], s[0:1] offset:112
@@ -6156,8 +6066,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32_vecarg(ptr addrspace(1) %arg
 ; GFX942-NEXT:    global_load_dwordx4 a[0:3], v0, s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    v_mfma_f32_32x32x1_2b_f32 a[0:31], v1, v2, a[0:31] cbsz:1 abid:2 blgp:3
-; GFX942-NEXT:    s_nop 7
-; GFX942-NEXT:    s_nop 7
+; GFX942-NEXT:    s_nop 15
 ; GFX942-NEXT:    s_nop 1
 ; GFX942-NEXT:    global_store_dwordx4 v0, a[24:27], s[0:1] offset:96
 ; GFX942-NEXT:    global_store_dwordx4 v0, a[28:31], s[0:1] offset:112
@@ -6187,8 +6096,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32_vecarg(ptr addrspace(1) %arg
 ; GFX942-VGPR-NEXT:    global_load_dwordx4 v[0:3], v32, s[0:1]
 ; GFX942-VGPR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-VGPR-NEXT:    v_mfma_f32_32x32x1_2b_f32 v[0:31], v33, v34, v[0:31] cbsz:1 abid:2 blgp:3
-; GFX942-VGPR-NEXT:    s_nop 7
-; GFX942-VGPR-NEXT:    s_nop 7
+; GFX942-VGPR-NEXT:    s_nop 15
 ; GFX942-VGPR-NEXT:    s_nop 1
 ; GFX942-VGPR-NEXT:    global_store_dwordx4 v32, v[24:27], s[0:1] offset:96
 ; GFX942-VGPR-NEXT:    global_store_dwordx4 v32, v[28:31], s[0:1] offset:112
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.ll
index 17ae6dd23b199..aae14c8cc87b3 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.ll
@@ -23,8 +23,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp0(<8 x
 ; GCN-NEXT:    v_accvgpr_write_b32 a3, v19
 ; GCN-NEXT:    s_nop 1
 ; GCN-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[0,0,0]
-; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    s_nop 3
+; GCN-NEXT:    s_nop 11
 ; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
 ; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
 ; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
@@ -47,8 +46,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_1_1__cbsz1__blgp1(<8 x
 ; GCN-NEXT:    v_accvgpr_write_b32 a3, v19
 ; GCN-NEXT:    s_nop 1
 ; GCN-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel:[1,1,0] op_sel_hi:[0,0,0]
-; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    s_nop 3
+; GCN-NEXT:    s_nop 11
 ; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
 ; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
 ; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
@@ -71,8 +69,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_2_2__cbsz1__blgp1(<8 x
 ; GCN-NEXT:    v_accvgpr_write_b32 a3, v19
 ; GCN-NEXT:    s_nop 1
 ; GCN-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[1,1,0]
-; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    s_nop 3
+; GCN-NEXT:    s_nop 11
 ; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
 ; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
 ; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
@@ -95,8 +92,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_3_3__cbsz1__blgp1(<8 x
 ; GCN-NEXT:    v_accvgpr_write_b32 a3, v19
 ; GCN-NEXT:    s_nop 1
 ; GCN-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel:[1,1,0] op_sel_hi:[1,1,0]
-; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    s_nop 3
+; GCN-NEXT:    s_nop 11
 ; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
 ; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
 ; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
@@ -119,8 +115,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_3__cbsz1__blgp1(<8 x
 ; GCN-NEXT:    v_accvgpr_write_b32 a3, v19
 ; GCN-NEXT:    s_nop 1
 ; GCN-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel:[0,1,0] op_sel_hi:[0,1,0]
-; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    s_nop 3
+; GCN-NEXT:    s_nop 11
 ; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
 ; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
 ; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
@@ -143,8 +138,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_3_0__cbsz1__blgp1(<8 x
 ; GCN-NEXT:    v_accvgpr_write_b32 a3, v19
 ; GCN-NEXT:    s_nop 1
 ; GCN-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel:[1,0,0] op_sel_hi:[1,0,0]
-; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    s_nop 3
+; GCN-NEXT:    s_nop 11
 ; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
 ; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
 ; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
@@ -167,8 +161,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_2_3__cbsz1__blgp1(<8 x
 ; GCN-NEXT:    v_accvgpr_write_b32 a3, v19
 ; GCN-NEXT:    s_nop 1
 ; GCN-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel:[0,1,0] op_sel_hi:[1,1,0]
-; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    s_nop 3
+; GCN-NEXT:    s_nop 11
 ; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
 ; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
 ; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
@@ -191,8 +184,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_3_2__cbsz1__blgp1(<8 x
 ; GCN-NEXT:    v_accvgpr_write_b32 a3, v19
 ; GCN-NEXT:    s_nop 1
 ; GCN-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel:[1,0,0] op_sel_hi:[1,1,0]
-; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    s_nop 3
+; GCN-NEXT:    s_nop 11
 ; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
 ; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
 ; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
@@ -216,8 +208,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp0__cons
 ; GCN-NEXT:    v_accvgpr_write_b32 a3, v19
 ; GCN-NEXT:    s_nop 1
 ; GCN-NEXT:    v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3]
-; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    s_nop 3
+; GCN-NEXT:    s_nop 11
 ; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
 ; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
 ; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
@@ -241,8 +232,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp1(<8 x
 ; GCN-NEXT:    v_accvgpr_write_b32 a3, v19
 ; GCN-NEXT:    s_nop 1
 ; GCN-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[0,0,0] blgp:1
-; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    s_nop 3
+; GCN-NEXT:    s_nop 11
 ; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
 ; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
 ; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
@@ -266,8 +256,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp1__cons
 ; GCN-NEXT:    v_accvgpr_write_b32 a3, v19
 ; GCN-NEXT:    s_nop 1
 ; GCN-NEXT:    v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3] blgp:1
-; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    s_nop 3
+; GCN-NEXT:    s_nop 11
 ; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
 ; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
 ; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
@@ -291,8 +280,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp2(<8 x
 ; GCN-NEXT:    v_accvgpr_write_b32 a3, v17
 ; GCN-NEXT:    s_nop 1
 ; GCN-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:13], a[0:3], v18, v19 op_sel_hi:[0,0,0] blgp:2
-; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    s_nop 3
+; GCN-NEXT:    s_nop 11
 ; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
 ; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
 ; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
@@ -316,8 +304,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp2__cons
 ; GCN-NEXT:    v_accvgpr_write_b32 a3, v17
 ; GCN-NEXT:    s_nop 1
 ; GCN-NEXT:    v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:13], a[0:3] blgp:2
-; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    s_nop 3
+; GCN-NEXT:    s_nop 11
 ; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
 ; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
 ; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
@@ -341,8 +328,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp3(<8 x
 ; GCN-NEXT:    v_accvgpr_write_b32 a3, v17
 ; GCN-NEXT:    s_nop 1
 ; GCN-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:13], a[0:3], v18, v19 op_sel_hi:[0,0,0] blgp:3
-; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    s_nop 3
+; GCN-NEXT:    s_nop 11
 ; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
 ; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
 ; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
@@ -366,8 +352,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp3__cons
 ; GCN-NEXT:    v_accvgpr_write_b32 a3, v17
 ; GCN-NEXT:    s_nop 1
 ; GCN-NEXT:    v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:13], a[0:3] blgp:3
-; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    s_nop 3
+; GCN-NEXT:    s_nop 11
 ; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
 ; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
 ; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
@@ -391,8 +376,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp4(<8 x
 ; GCN-NEXT:    v_accvgpr_write_b32 a3, v15
 ; GCN-NEXT:    s_nop 1
 ; GCN-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:11], a[0:3], v16, v17 op_sel_hi:[0,0,0] blgp:4
-; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    s_nop 3
+; GCN-NEXT:    s_nop 11
 ; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
 ; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
 ; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
@@ -416,8 +400,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp4__cons
 ; GCN-NEXT:    v_accvgpr_write_b32 a3, v15
 ; GCN-NEXT:    s_nop 1
 ; GCN-NEXT:    v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:11], a[0:3] blgp:4
-; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    s_nop 3
+; GCN-NEXT:    s_nop 11
 ; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
 ; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
 ; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
@@ -441,8 +424,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp0(<8 x
 ; GCN-NEXT:    v_accvgpr_write_b32 a3, v19
 ; GCN-NEXT:    s_nop 1
 ; GCN-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[0,0,0] cbsz:1
-; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    s_nop 3
+; GCN-NEXT:    s_nop 11
 ; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
 ; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
 ; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
@@ -466,8 +448,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp0__cons
 ; GCN-NEXT:    v_accvgpr_write_b32 a3, v19
 ; GCN-NEXT:    s_nop 1
 ; GCN-NEXT:    v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3] cbsz:1
-; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    s_nop 3
+; GCN-NEXT:    s_nop 11
 ; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
 ; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
 ; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
@@ -491,8 +472,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp1(<8 x
 ; GCN-NEXT:    v_accvgpr_write_b32 a3, v19
 ; GCN-NEXT:    s_nop 1
 ; GCN-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[0,0,0] cbsz:1 blgp:1
-; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    s_nop 3
+; GCN-NEXT:    s_nop 11
 ; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
 ; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
 ; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
@@ -517,8 +497,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp1__cons
 ; GCN-NEXT:    v_accvgpr_write_b32 a3, v19
 ; GCN-NEXT:    s_nop 1
 ; GCN-NEXT:    v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3] cbsz:1 blgp:1
-; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    s_nop 3
+; GCN-NEXT:    s_nop 11
 ; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
 ; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
 ; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
@@ -542,8 +521,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp2(<8 x
 ; GCN-NEXT:    v_accvgpr_write_b32 a3, v17
 ; GCN-NEXT:    s_nop 1
 ; GCN-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:13], a[0:3], v18, v19 op_sel_hi:[0,0,0] cbsz:1 blgp:2
-; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    s_nop 3
+; GCN-NEXT:    s_nop 11
 ; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
 ; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
 ; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
@@ -566,8 +544,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp2__cons
 ; GCN-NEXT:    v_accvgpr_write_b32 a3, v17
 ; GCN-NEXT:    s_nop 1
 ; GCN-NEXT:    v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:13], a[0:3] cbsz:1 blgp:2
-; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    s_nop 3
+; GCN-NEXT:    s_nop 11
 ; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
 ; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
 ; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
@@ -591,8 +568,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp3(<8 x
 ; GCN-NEXT:    v_accvgpr_write_b32 a3, v17
 ; GCN-NEXT:    s_nop 1
 ; GCN-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:13], a[0:3], v18, v19 op_sel_hi:[0,0,0] cbsz:1 blgp:3
-; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    s_nop 3
+; GCN-NEXT:    s_nop 11
 ; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
 ; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
 ; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
@@ -616,8 +592,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp3__cons
 ; GCN-NEXT:    v_accvgpr_write_b32 a3, v17
 ; GCN-NEXT:    s_nop 1
 ; GCN-NEXT:    v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:13], a[0:3] cbsz:1 blgp:3
-; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    s_nop 3
+; GCN-NEXT:    s_nop 11
 ; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
 ; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
 ; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
@@ -641,8 +616,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp4(<8 x
 ; GCN-NEXT:    v_accvgpr_write_b32 a3, v15
 ; GCN-NEXT:    s_nop 1
 ; GCN-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:11], a[0:3], v16, v17 op_sel_hi:[0,0,0] cbsz:1 blgp:4
-; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    s_nop 3
+; GCN-NEXT:    s_nop 11
 ; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
 ; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
 ; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
@@ -666,8 +640,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp4__cons
 ; GCN-NEXT:    v_accvgpr_write_b32 a3, v15
 ; GCN-NEXT:    s_nop 1
 ; GCN-NEXT:    v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:11], a[0:3] cbsz:1 blgp:4
-; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    s_nop 3
+; GCN-NEXT:    s_nop 11
 ; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
 ; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
 ; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
@@ -691,8 +664,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp0(<6 x
 ; GCN-NEXT:    v_accvgpr_write_b32 a3, v17
 ; GCN-NEXT:    s_nop 1
 ; GCN-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:13], a[0:3], v18, v19 op_sel_hi:[0,0,0] cbsz:2
-; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    s_nop 3
+; GCN-NEXT:    s_nop 11
 ; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
 ; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
 ; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
@@ -716,8 +688,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp0__cons
 ; GCN-NEXT:    v_accvgpr_write_b32 a3, v17
 ; GCN-NEXT:    s_nop 1
 ; GCN-NEXT:    v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:13], a[0:3] cbsz:2
-; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    s_nop 3
+; GCN-NEXT:    s_nop 11
 ; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
 ; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
 ; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
@@ -741,8 +712,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp1(<6 x
 ; GCN-NEXT:    v_accvgpr_write_b32 a3, v17
 ; GCN-NEXT:    s_nop 1
 ; GCN-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:13], a[0:3], v18, v19 op_sel_hi:[0,0,0] cbsz:2 blgp:1
-; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    s_nop 3
+; GCN-NEXT:    s_nop 11
 ; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
 ; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
 ; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
@@ -766,8 +736,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp1__cons
 ; GCN-NEXT:    v_accvgpr_write_b32 a3, v17
 ; GCN-NEXT:    s_nop 1
 ; GCN-NEXT:    v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:13], a[0:3] cbsz:2 blgp:1
-; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    s_nop 3
+; GCN-NEXT:    s_nop 11
 ; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
 ; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
 ; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
@@ -888,8 +857,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp0(<6 x
 ; GCN-NEXT:    v_accvgpr_write_b32 a3, v17
 ; GCN-NEXT:    s_nop 1
 ; GCN-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:13], a[0:3], v18, v19 op_sel_hi:[0,0,0] cbsz:3
-; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    s_nop 3
+; GCN-NEXT:    s_nop 11
 ; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
 ; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
 ; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
@@ -913,8 +881,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp0__cons
 ; GCN-NEXT:    v_accvgpr_write_b32 a3, v17
 ; GCN-NEXT:    s_nop 1
 ; GCN-NEXT:    v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:13], a[0:3] cbsz:3
-; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    s_nop 3
+; GCN-NEXT:    s_nop 11
 ; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
 ; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
 ; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
@@ -938,8 +905,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp1(<6 x
 ; GCN-NEXT:    v_accvgpr_write_b32 a3, v17
 ; GCN-NEXT:    s_nop 1
 ; GCN-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:13], a[0:3], v18, v19 op_sel_hi:[0,0,0] cbsz:3 blgp:1
-; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    s_nop 3
+; GCN-NEXT:    s_nop 11
 ; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
 ; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
 ; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
@@ -963,8 +929,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp1__cons
 ; GCN-NEXT:    v_accvgpr_write_b32 a3, v17
 ; GCN-NEXT:    s_nop 1
 ; GCN-NEXT:    v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:13], a[0:3] cbsz:3 blgp:1
-; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    s_nop 3
+; GCN-NEXT:    s_nop 11
 ; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
 ; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
 ; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
@@ -1180,8 +1145,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp0(<4 x
 ; GCN-NEXT:    v_accvgpr_write_b32 a3, v15
 ; GCN-NEXT:    s_nop 1
 ; GCN-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:3], v[4:11], a[0:3], v16, v17 op_sel_hi:[0,0,0] cbsz:4
-; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    s_nop 3
+; GCN-NEXT:    s_nop 11
 ; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
 ; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
 ; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
@@ -1205,8 +1169,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp0__cons
 ; GCN-NEXT:    v_accvgpr_write_b32 a3, v15
 ; GCN-NEXT:    s_nop 1
 ; GCN-NEXT:    v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:3], v[4:11], a[0:3] cbsz:4
-; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    s_nop 3
+; GCN-NEXT:    s_nop 11
 ; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
 ; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
 ; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
@@ -1230,8 +1193,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp1(<4 x
 ; GCN-NEXT:    v_accvgpr_write_b32 a3, v15
 ; GCN-NEXT:    s_nop 1
 ; GCN-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:3], v[4:11], a[0:3], v16, v17 op_sel_hi:[0,0,0] cbsz:4 blgp:1
-; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    s_nop 3
+; GCN-NEXT:    s_nop 11
 ; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
 ; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
 ; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
@@ -1255,8 +1217,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp1__cons
 ; GCN-NEXT:    v_accvgpr_write_b32 a3, v15
 ; GCN-NEXT:    s_nop 1
 ; GCN-NEXT:    v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:3], v[4:11], a[0:3] cbsz:4 blgp:1
-; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    s_nop 3
+; GCN-NEXT:    s_nop 11
 ; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
 ; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
 ; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
@@ -1429,8 +1390,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__sgpr_scaleA__sgpr_
 ; GCN-NEXT:    v_mov_b32_e32 v17, s1
 ; GCN-NEXT:    s_nop 1
 ; GCN-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v16, v17 op_sel_hi:[0,0,0]
-; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    s_nop 3
+; GCN-NEXT:    s_nop 11
 ; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
 ; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
 ; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
@@ -1451,8 +1411,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__sgpr_scaleA__vgpr_
 ; GCN-NEXT:    v_mov_b32_e32 v16, s0
 ; GCN-NEXT:    s_nop 1
 ; GCN-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v16, v20 op_sel_hi:[0,0,0]
-; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    s_nop 3
+; GCN-NEXT:    s_nop 11
 ; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
 ; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
 ; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
@@ -1473,8 +1432,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__vgpr_scaleA__sgpr_
 ; GCN-NEXT:    v_mov_b32_e32 v16, s0
 ; GCN-NEXT:    s_nop 1
 ; GCN-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v16 op_sel_hi:[0,0,0]
-; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    s_nop 3
+; GCN-NEXT:    s_nop 11
 ; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
 ; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
 ; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
@@ -1512,8 +1470,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0_sgprs(<8 x i32> inr
 ; SDAG-NEXT:    v_accvgpr_write_b32 a3, v1
 ; SDAG-NEXT:    s_nop 1
 ; SDAG-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[14:21], v[6:13], a[0:3], v2, v3 op_sel_hi:[0,0,0]
-; SDAG-NEXT:    s_nop 7
-; SDAG-NEXT:    s_nop 3
+; SDAG-NEXT:    s_nop 11
 ; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
 ; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
 ; SDAG-NEXT:    v_accvgpr_read_b32 v2, a2
@@ -1543,8 +1500,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0_sgprs(<8 x i32> inr
 ; GISEL-NEXT:    v_accvgpr_write_b32 a3, v1
 ; GISEL-NEXT:    s_nop 1
 ; GISEL-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[4:11], v[12:19], a[0:3], v2, v3 op_sel_hi:[0,0,0]
-; GISEL-NEXT:    s_nop 7
-; GISEL-NEXT:    s_nop 3
+; GISEL-NEXT:    s_nop 11
 ; GISEL-NEXT:    v_accvgpr_read_b32 v0, a0
 ; GISEL-NEXT:    v_accvgpr_read_b32 v1, a1
 ; GISEL-NEXT:    v_accvgpr_read_b32 v2, a2
@@ -1573,8 +1529,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0_sgpr_vgpr_vgpr__sgp
 ; SDAG-NEXT:    v_mov_b32_e32 v8, s20
 ; SDAG-NEXT:    s_nop 1
 ; SDAG-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[14:21], v[0:7], a[0:3], v8, v12 op_sel_hi:[0,0,0]
-; SDAG-NEXT:    s_nop 7
-; SDAG-NEXT:    s_nop 3
+; SDAG-NEXT:    s_nop 11
 ; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
 ; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
 ; SDAG-NEXT:    v_accvgpr_read_b32 v2, a2
@@ -1599,8 +1554,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0_sgpr_vgpr_vgpr__sgp
 ; GISEL-NEXT:    v_mov_b32_e32 v8, s20
 ; GISEL-NEXT:    s_nop 1
 ; GISEL-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[14:21], v[0:7], a[0:3], v8, v12 op_sel_hi:[0,0,0]
-; GISEL-NEXT:    s_nop 7
-; GISEL-NEXT:    s_nop 3
+; GISEL-NEXT:    s_nop 11
 ; GISEL-NEXT:    v_accvgpr_read_b32 v0, a0
 ; GISEL-NEXT:    v_accvgpr_read_b32 v1, a1
 ; GISEL-NEXT:    v_accvgpr_read_b32 v2, a2
@@ -1629,8 +1583,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0_sgpr_vgpr_vgpr__vgp
 ; SDAG-NEXT:    v_mov_b32_e32 v8, s20
 ; SDAG-NEXT:    s_nop 1
 ; SDAG-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[14:21], v[0:7], a[0:3], v12, v8 op_sel_hi:[0,0,0]
-; SDAG-NEXT:    s_nop 7
-; SDAG-NEXT:    s_nop 3
+; SDAG-NEXT:    s_nop 11
 ; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
 ; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
 ; SDAG-NEXT:    v_accvgpr_read_b32 v2, a2
@@ -1655,8 +1608,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0_sgpr_vgpr_vgpr__vgp
 ; GISEL-NEXT:    v_mov_b32_e32 v8, s20
 ; GISEL-NEXT:    s_nop 1
 ; GISEL-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[14:21], v[0:7], a[0:3], v12, v8 op_sel_hi:[0,0,0]
-; GISEL-NEXT:    s_nop 7
-; GISEL-NEXT:    s_nop 3
+; GISEL-NEXT:    s_nop 11
 ; GISEL-NEXT:    v_accvgpr_read_b32 v0, a0
 ; GISEL-NEXT:    v_accvgpr_read_b32 v1, a1
 ; GISEL-NEXT:    v_accvgpr_read_b32 v2, a2
@@ -1685,8 +1637,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0_vgpr_sgpr_vgpr__vgp
 ; SDAG-NEXT:    v_mov_b32_e32 v8, s20
 ; SDAG-NEXT:    s_nop 1
 ; SDAG-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[14:21], a[0:3], v12, v8 op_sel_hi:[0,0,0]
-; SDAG-NEXT:    s_nop 7
-; SDAG-NEXT:    s_nop 3
+; SDAG-NEXT:    s_nop 11
 ; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
 ; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
 ; SDAG-NEXT:    v_accvgpr_read_b32 v2, a2
@@ -1711,8 +1662,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0_vgpr_sgpr_vgpr__vgp
 ; GISEL-NEXT:    v_mov_b32_e32 v8, s20
 ; GISEL-NEXT:    s_nop 1
 ; GISEL-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[14:21], a[0:3], v12, v8 op_sel_hi:[0,0,0]
-; GISEL-NEXT:    s_nop 7
-; GISEL-NEXT:    s_nop 3
+; GISEL-NEXT:    s_nop 11
 ; GISEL-NEXT:    v_accvgpr_read_b32 v0, a0
 ; GISEL-NEXT:    v_accvgpr_read_b32 v1, a1
 ; GISEL-NEXT:    v_accvgpr_read_b32 v2, a2
@@ -1733,8 +1683,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0_vgpr_vgpr_sgpr__vgp
 ; GCN-NEXT:    v_mov_b32_e32 v17, s16
 ; GCN-NEXT:    s_nop 1
 ; GCN-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v16, v17 op_sel_hi:[0,0,0]
-; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    s_nop 3
+; GCN-NEXT:    s_nop 11
 ; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
 ; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
 ; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
@@ -1763,8 +1712,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0_sgpr_vgpr_sgpr__vgp
 ; SDAG-NEXT:    v_mov_b32_e32 v9, s24
 ; SDAG-NEXT:    s_nop 1
 ; SDAG-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[10:17], v[0:7], a[0:3], v8, v9 op_sel_hi:[0,0,0]
-; SDAG-NEXT:    s_nop 7
-; SDAG-NEXT:    s_nop 3
+; SDAG-NEXT:    s_nop 11
 ; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
 ; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
 ; SDAG-NEXT:    v_accvgpr_read_b32 v2, a2
@@ -1789,8 +1737,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0_sgpr_vgpr_sgpr__vgp
 ; GISEL-NEXT:    v_mov_b32_e32 v9, s24
 ; GISEL-NEXT:    s_nop 1
 ; GISEL-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[10:17], v[0:7], a[0:3], v8, v9 op_sel_hi:[0,0,0]
-; GISEL-NEXT:    s_nop 7
-; GISEL-NEXT:    s_nop 3
+; GISEL-NEXT:    s_nop 11
 ; GISEL-NEXT:    v_accvgpr_read_b32 v0, a0
 ; GISEL-NEXT:    v_accvgpr_read_b32 v1, a1
 ; GISEL-NEXT:    v_accvgpr_read_b32 v2, a2
@@ -1812,8 +1759,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__scaleA_inlineimm__
 ; SDAG-NEXT:    v_accvgpr_write_b32 a3, v19
 ; SDAG-NEXT:    s_nop 1
 ; SDAG-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v21, v20 op_sel_hi:[1,1,0]
-; SDAG-NEXT:    s_nop 7
-; SDAG-NEXT:    s_nop 3
+; SDAG-NEXT:    s_nop 11
 ; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
 ; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
 ; SDAG-NEXT:    v_accvgpr_read_b32 v2, a2
@@ -1831,8 +1777,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__scaleA_inlineimm__
 ; GISEL-NEXT:    v_mov_b32_e32 v17, -2
 ; GISEL-NEXT:    s_nop 1
 ; GISEL-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v16, v17 op_sel_hi:[1,1,0]
-; GISEL-NEXT:    s_nop 7
-; GISEL-NEXT:    s_nop 3
+; GISEL-NEXT:    s_nop 11
 ; GISEL-NEXT:    v_accvgpr_read_b32 v0, a0
 ; GISEL-NEXT:    v_accvgpr_read_b32 v1, a1
 ; GISEL-NEXT:    v_accvgpr_read_b32 v2, a2
@@ -1854,8 +1799,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__scaleA_kimm__scale
 ; SDAG-NEXT:    v_accvgpr_write_b32 a3, v19
 ; SDAG-NEXT:    s_nop 1
 ; SDAG-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v21, v20 op_sel_hi:[1,1,0]
-; SDAG-NEXT:    s_nop 7
-; SDAG-NEXT:    s_nop 3
+; SDAG-NEXT:    s_nop 11
 ; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
 ; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
 ; SDAG-NEXT:    v_accvgpr_read_b32 v2, a2
@@ -1873,8 +1817,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__scaleA_kimm__scale
 ; GISEL-NEXT:    v_mov_b32_e32 v17, -2
 ; GISEL-NEXT:    s_nop 1
 ; GISEL-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v16, v17 op_sel_hi:[1,1,0]
-; GISEL-NEXT:    s_nop 7
-; GISEL-NEXT:    s_nop 3
+; GISEL-NEXT:    s_nop 11
 ; GISEL-NEXT:    v_accvgpr_read_b32 v0, a0
 ; GISEL-NEXT:    v_accvgpr_read_b32 v1, a1
 ; GISEL-NEXT:    v_accvgpr_read_b32 v2, a2
@@ -1896,8 +1839,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__scaleA_kimm__scale
 ; SDAG-NEXT:    v_accvgpr_write_b32 a3, v19
 ; SDAG-NEXT:    s_nop 1
 ; SDAG-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v21, v20 op_sel_hi:[1,1,0]
-; SDAG-NEXT:    s_nop 7
-; SDAG-NEXT:    s_nop 3
+; SDAG-NEXT:    s_nop 11
 ; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
 ; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
 ; SDAG-NEXT:    v_accvgpr_read_b32 v2, a2
@@ -1915,8 +1857,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__scaleA_kimm__scale
 ; GISEL-NEXT:    v_mov_b32_e32 v17, 0x4d
 ; GISEL-NEXT:    s_nop 1
 ; GISEL-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v16, v17 op_sel_hi:[1,1,0]
-; GISEL-NEXT:    s_nop 7
-; GISEL-NEXT:    s_nop 3
+; GISEL-NEXT:    s_nop 11
 ; GISEL-NEXT:    v_accvgpr_read_b32 v0, a0
 ; GISEL-NEXT:    v_accvgpr_read_b32 v1, a1
 ; GISEL-NEXT:    v_accvgpr_read_b32 v2, a2
@@ -1958,8 +1899,7 @@ define amdgpu_kernel void @test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd(<8 x i32
 ; SDAG-NEXT:    v_mov_b32_e32 v22, s13
 ; SDAG-NEXT:    s_nop 1
 ; SDAG-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[0:7], v[8:15], v[16:19], v21, v22 op_sel:[1,1,0] op_sel_hi:[1,0,0] blgp:2
-; SDAG-NEXT:    s_nop 7
-; SDAG-NEXT:    s_nop 3
+; SDAG-NEXT:    s_nop 11
 ; SDAG-NEXT:    global_store_dwordx4 v20, v[0:3], s[14:15]
 ; SDAG-NEXT:    s_endpgm
 ;
@@ -1983,8 +1923,7 @@ define amdgpu_kernel void @test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd(<8 x i32
 ; GISEL-NEXT:    s_nop 1
 ; GISEL-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[0:7], v[8:15], v[16:19], v20, v21 op_sel:[1,1,0] op_sel_hi:[1,0,0] blgp:2
 ; GISEL-NEXT:    v_mov_b32_e32 v4, 0
-; GISEL-NEXT:    s_nop 7
-; GISEL-NEXT:    s_nop 2
+; GISEL-NEXT:    s_nop 10
 ; GISEL-NEXT:    global_store_dwordx4 v4, v[0:3], s[30:31]
 ; GISEL-NEXT:    s_endpgm
   %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, i32 2, i32 3, i32 %scale0, i32 1, i32 %scale1)
@@ -2022,8 +1961,7 @@ define amdgpu_kernel void @test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd___scaleA
 ; SDAG-NEXT:    v_mov_b64_e32 v[16:17], s[0:1]
 ; SDAG-NEXT:    s_nop 1
 ; SDAG-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[0:7], v[8:15], v[16:19], v22, v21 op_sel:[1,1,0] op_sel_hi:[1,0,0]
-; SDAG-NEXT:    s_nop 7
-; SDAG-NEXT:    s_nop 3
+; SDAG-NEXT:    s_nop 11
 ; SDAG-NEXT:    global_store_dwordx4 v20, v[0:3], s[6:7]
 ; SDAG-NEXT:    s_endpgm
 ;
@@ -2048,8 +1986,7 @@ define amdgpu_kernel void @test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd___scaleA
 ; GISEL-NEXT:    s_nop 1
 ; GISEL-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[0:7], v[8:15], v[16:19], v20, v21 op_sel:[1,1,0] op_sel_hi:[1,0,0]
 ; GISEL-NEXT:    v_mov_b32_e32 v4, 0
-; GISEL-NEXT:    s_nop 7
-; GISEL-NEXT:    s_nop 2
+; GISEL-NEXT:    s_nop 10
 ; GISEL-NEXT:    global_store_dwordx4 v4, v[0:3], s[6:7]
 ; GISEL-NEXT:    s_endpgm
   %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, i32 0, i32 3, i32 65, i32 1, i32 -2)
@@ -2087,8 +2024,7 @@ define amdgpu_kernel void @test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd___scaleA
 ; SDAG-NEXT:    v_mov_b64_e32 v[16:17], s[0:1]
 ; SDAG-NEXT:    s_nop 1
 ; SDAG-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[0:7], v[8:15], v[16:19], v22, v21 op_sel:[1,1,0] op_sel_hi:[1,0,0]
-; SDAG-NEXT:    s_nop 7
-; SDAG-NEXT:    s_nop 3
+; SDAG-NEXT:    s_nop 11
 ; SDAG-NEXT:    global_store_dwordx4 v20, v[0:3], s[6:7]
 ; SDAG-NEXT:    s_endpgm
 ;
@@ -2113,8 +2049,7 @@ define amdgpu_kernel void @test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd___scaleA
 ; GISEL-NEXT:    s_nop 1
 ; GISEL-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[0:7], v[8:15], v[16:19], v20, v21 op_sel:[1,1,0] op_sel_hi:[1,0,0]
 ; GISEL-NEXT:    v_mov_b32_e32 v4, 0
-; GISEL-NEXT:    s_nop 7
-; GISEL-NEXT:    s_nop 2
+; GISEL-NEXT:    s_nop 10
 ; GISEL-NEXT:    global_store_dwordx4 v4, v[0:3], s[6:7]
 ; GISEL-NEXT:    s_endpgm
   %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, i32 0, i32 3, i32 65, i32 1, i32 1065353216)
@@ -2152,8 +2087,7 @@ define amdgpu_kernel void @test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd___scaleA
 ; SDAG-NEXT:    v_mov_b64_e32 v[16:17], s[0:1]
 ; SDAG-NEXT:    s_nop 1
 ; SDAG-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[0:7], v[8:15], v[16:19], v22, v21 op_sel:[1,1,0] op_sel_hi:[1,0,0]
-; SDAG-NEXT:    s_nop 7
-; SDAG-NEXT:    s_nop 3
+; SDAG-NEXT:    s_nop 11
 ; SDAG-NEXT:    global_store_dwordx4 v20, v[0:3], s[6:7]
 ; SDAG-NEXT:    s_endpgm
 ;
@@ -2178,8 +2112,7 @@ define amdgpu_kernel void @test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd___scaleA
 ; GISEL-NEXT:    s_nop 1
 ; GISEL-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[0:7], v[8:15], v[16:19], v20, v21 op_sel:[1,1,0] op_sel_hi:[1,0,0]
 ; GISEL-NEXT:    v_mov_b32_e32 v4, 0
-; GISEL-NEXT:    s_nop 7
-; GISEL-NEXT:    s_nop 2
+; GISEL-NEXT:    s_nop 10
 ; GISEL-NEXT:    global_store_dwordx4 v4, v[0:3], s[6:7]
 ; GISEL-NEXT:    s_endpgm
   %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, i32 0, i32 3, i32 1065353216, i32 1, i32 -2)
@@ -2217,8 +2150,7 @@ define amdgpu_kernel void @test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd___scaleA
 ; SDAG-NEXT:    v_mov_b64_e32 v[16:17], s[0:1]
 ; SDAG-NEXT:    s_nop 1
 ; SDAG-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[0:7], v[8:15], v[16:19], v22, v21 op_sel:[1,1,0] op_sel_hi:[1,0,0]
-; SDAG-NEXT:    s_nop 7
-; SDAG-NEXT:    s_nop 3
+; SDAG-NEXT:    s_nop 11
 ; SDAG-NEXT:    global_store_dwordx4 v20, v[0:3], s[6:7]
 ; SDAG-NEXT:    s_endpgm
 ;
@@ -2243,8 +2175,7 @@ define amdgpu_kernel void @test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd___scaleA
 ; GISEL-NEXT:    s_nop 1
 ; GISEL-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[0:7], v[8:15], v[16:19], v20, v21 op_sel:[1,1,0] op_sel_hi:[1,0,0]
 ; GISEL-NEXT:    v_mov_b32_e32 v4, 0
-; GISEL-NEXT:    s_nop 7
-; GISEL-NEXT:    s_nop 2
+; GISEL-NEXT:    s_nop 10
 ; GISEL-NEXT:    global_store_dwordx4 v4, v[0:3], s[6:7]
 ; GISEL-NEXT:    s_endpgm
   %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, i32 0, i32 3, i32 1065353216, i32 1, i32 1042479491)
@@ -2263,8 +2194,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___constant_scale_0_0_a(
 ; GCN-NEXT:    v_accvgpr_write_b32 a3, v19
 ; GCN-NEXT:    s_nop 1
 ; GCN-NEXT:    v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3]
-; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    s_nop 3
+; GCN-NEXT:    s_nop 11
 ; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
 ; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
 ; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
@@ -2285,8 +2215,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___constant_scale_0_0_b(
 ; GCN-NEXT:    v_accvgpr_write_b32 a3, v19
 ; GCN-NEXT:    s_nop 1
 ; GCN-NEXT:    v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3]
-; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    s_nop 3
+; GCN-NEXT:    s_nop 11
 ; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
 ; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
 ; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
@@ -2308,8 +2237,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___constant_scale_0_1(<8
 ; SDAG-NEXT:    v_accvgpr_write_b32 a3, v19
 ; SDAG-NEXT:    s_nop 1
 ; SDAG-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v21, v20 op_sel_hi:[0,0,0]
-; SDAG-NEXT:    s_nop 7
-; SDAG-NEXT:    s_nop 3
+; SDAG-NEXT:    s_nop 11
 ; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
 ; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
 ; SDAG-NEXT:    v_accvgpr_read_b32 v2, a2
@@ -2327,8 +2255,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___constant_scale_0_1(<8
 ; GISEL-NEXT:    v_mov_b32_e32 v17, 1
 ; GISEL-NEXT:    s_nop 1
 ; GISEL-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v16, v17 op_sel_hi:[0,0,0]
-; GISEL-NEXT:    s_nop 7
-; GISEL-NEXT:    s_nop 3
+; GISEL-NEXT:    s_nop 11
 ; GISEL-NEXT:    v_accvgpr_read_b32 v0, a0
 ; GISEL-NEXT:    v_accvgpr_read_b32 v1, a1
 ; GISEL-NEXT:    v_accvgpr_read_b32 v2, a2
@@ -2350,8 +2277,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___constant_scale_1_0_a(
 ; SDAG-NEXT:    v_accvgpr_write_b32 a3, v19
 ; SDAG-NEXT:    s_nop 1
 ; SDAG-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v21, v20 op_sel_hi:[0,0,0]
-; SDAG-NEXT:    s_nop 7
-; SDAG-NEXT:    s_nop 3
+; SDAG-NEXT:    s_nop 11
 ; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
 ; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
 ; SDAG-NEXT:    v_accvgpr_read_b32 v2, a2
@@ -2369,8 +2295,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___constant_scale_1_0_a(
 ; GISEL-NEXT:    v_mov_b32_e32 v17, 0
 ; GISEL-NEXT:    s_nop 1
 ; GISEL-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v16, v17 op_sel_hi:[0,0,0]
-; GISEL-NEXT:    s_nop 7
-; GISEL-NEXT:    s_nop 3
+; GISEL-NEXT:    s_nop 11
 ; GISEL-NEXT:    v_accvgpr_read_b32 v0, a0
 ; GISEL-NEXT:    v_accvgpr_read_b32 v1, a1
 ; GISEL-NEXT:    v_accvgpr_read_b32 v2, a2
@@ -2394,8 +2319,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp8__v8i32_fp6(
 ; GCN-NEXT:    v_accvgpr_write_b32 a3, v19
 ; GCN-NEXT:    s_nop 1
 ; GCN-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[0,0,0] blgp:2
-; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    s_nop 3
+; GCN-NEXT:    s_nop 11
 ; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
 ; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
 ; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
@@ -2418,8 +2342,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp6__v8i32_fp8(
 ; GCN-NEXT:    v_accvgpr_write_b32 a3, v19
 ; GCN-NEXT:    s_nop 1
 ; GCN-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[0,0,0] cbsz:2
-; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    s_nop 3
+; GCN-NEXT:    s_nop 11
 ; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
 ; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
 ; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
@@ -2488,8 +2411,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp8__v8i32_fp4(
 ; GCN-NEXT:    v_accvgpr_write_b32 a3, v19
 ; GCN-NEXT:    s_nop 1
 ; GCN-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[0,0,0] blgp:4
-; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    s_nop 3
+; GCN-NEXT:    s_nop 11
 ; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
 ; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
 ; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
@@ -2512,8 +2434,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp4__v8i32_fp8(
 ; GCN-NEXT:    v_accvgpr_write_b32 a3, v19
 ; GCN-NEXT:    s_nop 1
 ; GCN-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[0,0,0] cbsz:4
-; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    s_nop 3
+; GCN-NEXT:    s_nop 11
 ; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
 ; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
 ; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
@@ -2536,8 +2457,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp8__v6i32_fp4(
 ; GCN-NEXT:    v_accvgpr_write_b32 a3, v17
 ; GCN-NEXT:    s_nop 1
 ; GCN-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:13], a[0:3], v18, v19 op_sel_hi:[0,0,0] blgp:4
-; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    s_nop 3
+; GCN-NEXT:    s_nop 11
 ; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
 ; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
 ; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
@@ -2560,8 +2480,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___v6i32_fp4__v8i32_fp8(
 ; GCN-NEXT:    v_accvgpr_write_b32 a3, v17
 ; GCN-NEXT:    s_nop 1
 ; GCN-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:13], a[0:3], v18, v19 op_sel_hi:[0,0,0] cbsz:4
-; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    s_nop 3
+; GCN-NEXT:    s_nop 11
 ; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
 ; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
 ; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.ll
index 839f0324227ca..f0205a3a788ed 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.ll
@@ -38,8 +38,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp0(<8 x
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; SDAG-NEXT:    s_nop 0
 ; SDAG-NEXT:    v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v32, v31 op_sel_hi:[0,0,0]
-; SDAG-NEXT:    s_nop 7
-; SDAG-NEXT:    s_nop 7
+; SDAG-NEXT:    s_nop 15
 ; SDAG-NEXT:    s_nop 3
 ; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
 ; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
@@ -83,8 +82,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp0(<8 x
 ; GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GISEL-NEXT:    s_nop 0
 ; GISEL-NEXT:    v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v31, v32 op_sel_hi:[0,0,0]
-; GISEL-NEXT:    s_nop 7
-; GISEL-NEXT:    s_nop 7
+; GISEL-NEXT:    s_nop 15
 ; GISEL-NEXT:    s_nop 3
 ; GISEL-NEXT:    v_accvgpr_read_b32 v0, a0
 ; GISEL-NEXT:    v_accvgpr_read_b32 v1, a1
@@ -135,8 +133,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_1_1__cbsz1__blgp1(<8 x
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; SDAG-NEXT:    s_nop 0
 ; SDAG-NEXT:    v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v32, v31 op_sel:[1,1,0] op_sel_hi:[0,0,0]
-; SDAG-NEXT:    s_nop 7
-; SDAG-NEXT:    s_nop 7
+; SDAG-NEXT:    s_nop 15
 ; SDAG-NEXT:    s_nop 3
 ; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
 ; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
@@ -180,8 +177,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_1_1__cbsz1__blgp1(<8 x
 ; GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GISEL-NEXT:    s_nop 0
 ; GISEL-NEXT:    v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v31, v32 op_sel:[1,1,0] op_sel_hi:[0,0,0]
-; GISEL-NEXT:    s_nop 7
-; GISEL-NEXT:    s_nop 7
+; GISEL-NEXT:    s_nop 15
 ; GISEL-NEXT:    s_nop 3
 ; GISEL-NEXT:    v_accvgpr_read_b32 v0, a0
 ; GISEL-NEXT:    v_accvgpr_read_b32 v1, a1
@@ -232,8 +228,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_2_2__cbsz1__blgp1(<8 x
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; SDAG-NEXT:    s_nop 0
 ; SDAG-NEXT:    v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v32, v31 op_sel_hi:[1,1,0]
-; SDAG-NEXT:    s_nop 7
-; SDAG-NEXT:    s_nop 7
+; SDAG-NEXT:    s_nop 15
 ; SDAG-NEXT:    s_nop 3
 ; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
 ; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
@@ -277,8 +272,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_2_2__cbsz1__blgp1(<8 x
 ; GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GISEL-NEXT:    s_nop 0
 ; GISEL-NEXT:    v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v31, v32 op_sel_hi:[1,1,0]
-; GISEL-NEXT:    s_nop 7
-; GISEL-NEXT:    s_nop 7
+; GISEL-NEXT:    s_nop 15
 ; GISEL-NEXT:    s_nop 3
 ; GISEL-NEXT:    v_accvgpr_read_b32 v0, a0
 ; GISEL-NEXT:    v_accvgpr_read_b32 v1, a1
@@ -329,8 +323,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_3_3__cbsz1__blgp1(<8 x
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; SDAG-NEXT:    s_nop 0
 ; SDAG-NEXT:    v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v32, v31 op_sel:[1,1,0] op_sel_hi:[1,1,0]
-; SDAG-NEXT:    s_nop 7
-; SDAG-NEXT:    s_nop 7
+; SDAG-NEXT:    s_nop 15
 ; SDAG-NEXT:    s_nop 3
 ; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
 ; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
@@ -374,8 +367,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_3_3__cbsz1__blgp1(<8 x
 ; GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GISEL-NEXT:    s_nop 0
 ; GISEL-NEXT:    v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v31, v32 op_sel:[1,1,0] op_sel_hi:[1,1,0]
-; GISEL-NEXT:    s_nop 7
-; GISEL-NEXT:    s_nop 7
+; GISEL-NEXT:    s_nop 15
 ; GISEL-NEXT:    s_nop 3
 ; GISEL-NEXT:    v_accvgpr_read_b32 v0, a0
 ; GISEL-NEXT:    v_accvgpr_read_b32 v1, a1
@@ -426,8 +418,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_3__cbsz1__blgp1(<8 x
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; SDAG-NEXT:    s_nop 0
 ; SDAG-NEXT:    v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v32, v31 op_sel:[0,1,0] op_sel_hi:[0,1,0]
-; SDAG-NEXT:    s_nop 7
-; SDAG-NEXT:    s_nop 7
+; SDAG-NEXT:    s_nop 15
 ; SDAG-NEXT:    s_nop 3
 ; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
 ; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
@@ -471,8 +462,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_3__cbsz1__blgp1(<8 x
 ; GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GISEL-NEXT:    s_nop 0
 ; GISEL-NEXT:    v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v31, v32 op_sel:[0,1,0] op_sel_hi:[0,1,0]
-; GISEL-NEXT:    s_nop 7
-; GISEL-NEXT:    s_nop 7
+; GISEL-NEXT:    s_nop 15
 ; GISEL-NEXT:    s_nop 3
 ; GISEL-NEXT:    v_accvgpr_read_b32 v0, a0
 ; GISEL-NEXT:    v_accvgpr_read_b32 v1, a1
@@ -523,8 +513,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_3_0__cbsz1__blgp1(<8 x
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; SDAG-NEXT:    s_nop 0
 ; SDAG-NEXT:    v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v32, v31 op_sel:[1,0,0] op_sel_hi:[1,0,0]
-; SDAG-NEXT:    s_nop 7
-; SDAG-NEXT:    s_nop 7
+; SDAG-NEXT:    s_nop 15
 ; SDAG-NEXT:    s_nop 3
 ; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
 ; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
@@ -568,8 +557,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_3_0__cbsz1__blgp1(<8 x
 ; GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GISEL-NEXT:    s_nop 0
 ; GISEL-NEXT:    v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v31, v32 op_sel:[1,0,0] op_sel_hi:[1,0,0]
-; GISEL-NEXT:    s_nop 7
-; GISEL-NEXT:    s_nop 7
+; GISEL-NEXT:    s_nop 15
 ; GISEL-NEXT:    s_nop 3
 ; GISEL-NEXT:    v_accvgpr_read_b32 v0, a0
 ; GISEL-NEXT:    v_accvgpr_read_b32 v1, a1
@@ -620,8 +608,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_2_3__cbsz1__blgp1(<8 x
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; SDAG-NEXT:    s_nop 0
 ; SDAG-NEXT:    v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v32, v31 op_sel:[0,1,0] op_sel_hi:[1,1,0]
-; SDAG-NEXT:    s_nop 7
-; SDAG-NEXT:    s_nop 7
+; SDAG-NEXT:    s_nop 15
 ; SDAG-NEXT:    s_nop 3
 ; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
 ; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
@@ -665,8 +652,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_2_3__cbsz1__blgp1(<8 x
 ; GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GISEL-NEXT:    s_nop 0
 ; GISEL-NEXT:    v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v31, v32 op_sel:[0,1,0] op_sel_hi:[1,1,0]
-; GISEL-NEXT:    s_nop 7
-; GISEL-NEXT:    s_nop 7
+; GISEL-NEXT:    s_nop 15
 ; GISEL-NEXT:    s_nop 3
 ; GISEL-NEXT:    v_accvgpr_read_b32 v0, a0
 ; GISEL-NEXT:    v_accvgpr_read_b32 v1, a1
@@ -717,8 +703,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_3_2__cbsz1__blgp1(<8 x
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; SDAG-NEXT:    s_nop 0
 ; SDAG-NEXT:    v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v32, v31 op_sel:[1,0,0] op_sel_hi:[1,1,0]
-; SDAG-NEXT:    s_nop 7
-; SDAG-NEXT:    s_nop 7
+; SDAG-NEXT:    s_nop 15
 ; SDAG-NEXT:    s_nop 3
 ; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
 ; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
@@ -762,8 +747,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_3_2__cbsz1__blgp1(<8 x
 ; GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GISEL-NEXT:    s_nop 0
 ; GISEL-NEXT:    v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v31, v32 op_sel:[1,0,0] op_sel_hi:[1,1,0]
-; GISEL-NEXT:    s_nop 7
-; GISEL-NEXT:    s_nop 7
+; GISEL-NEXT:    s_nop 15
 ; GISEL-NEXT:    s_nop 3
 ; GISEL-NEXT:    v_accvgpr_read_b32 v0, a0
 ; GISEL-NEXT:    v_accvgpr_read_b32 v1, a1
@@ -813,8 +797,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp0__cons
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    s_nop 0
 ; GCN-NEXT:    v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15]
-; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    s_nop 7
+; GCN-NEXT:    s_nop 15
 ; GCN-NEXT:    s_nop 3
 ; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
 ; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
@@ -866,8 +849,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp1(<8 x
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; SDAG-NEXT:    s_nop 0
 ; SDAG-NEXT:    v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v32, v31 op_sel_hi:[0,0,0] blgp:1
-; SDAG-NEXT:    s_nop 7
-; SDAG-NEXT:    s_nop 7
+; SDAG-NEXT:    s_nop 15
 ; SDAG-NEXT:    s_nop 3
 ; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
 ; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
@@ -911,8 +893,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp1(<8 x
 ; GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GISEL-NEXT:    s_nop 0
 ; GISEL-NEXT:    v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v31, v32 op_sel_hi:[0,0,0] blgp:1
-; GISEL-NEXT:    s_nop 7
-; GISEL-NEXT:    s_nop 7
+; GISEL-NEXT:    s_nop 15
 ; GISEL-NEXT:    s_nop 3
 ; GISEL-NEXT:    v_accvgpr_read_b32 v0, a0
 ; GISEL-NEXT:    v_accvgpr_read_b32 v1, a1
@@ -961,8 +942,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp1__cons
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    s_nop 0
 ; GCN-NEXT:    v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15] blgp:1
-; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    s_nop 7
+; GCN-NEXT:    s_nop 15
 ; GCN-NEXT:    s_nop 3
 ; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
 ; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
@@ -1013,8 +993,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp2(<8 x
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    s_nop 0
 ; GCN-NEXT:    v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:13], a[0:15], v30, v31 op_sel_hi:[0,0,0] blgp:2
-; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    s_nop 7
+; GCN-NEXT:    s_nop 15
 ; GCN-NEXT:    s_nop 3
 ; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
 ; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
@@ -1062,8 +1041,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp2__cons
 ; GCN-NEXT:    v_accvgpr_write_b32 a15, v29
 ; GCN-NEXT:    s_nop 1
 ; GCN-NEXT:    v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:13], a[0:15] blgp:2
-; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    s_nop 7
+; GCN-NEXT:    s_nop 15
 ; GCN-NEXT:    s_nop 3
 ; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
 ; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
@@ -1114,8 +1092,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp3(<8 x
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    s_nop 0
 ; GCN-NEXT:    v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:13], a[0:15], v30, v31 op_sel_hi:[0,0,0] blgp:3
-; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    s_nop 7
+; GCN-NEXT:    s_nop 15
 ; GCN-NEXT:    s_nop 3
 ; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
 ; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
@@ -1163,8 +1140,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp3__cons
 ; GCN-NEXT:    v_accvgpr_write_b32 a15, v29
 ; GCN-NEXT:    s_nop 1
 ; GCN-NEXT:    v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:13], a[0:15] blgp:3
-; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    s_nop 7
+; GCN-NEXT:    s_nop 15
 ; GCN-NEXT:    s_nop 3
 ; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
 ; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
@@ -1213,8 +1189,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp4(<8 x
 ; GCN-NEXT:    v_accvgpr_write_b32 a15, v27
 ; GCN-NEXT:    s_nop 1
 ; GCN-NEXT:    v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:11], a[0:15], v28, v29 op_sel_hi:[0,0,0] blgp:4
-; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    s_nop 7
+; GCN-NEXT:    s_nop 15
 ; GCN-NEXT:    s_nop 3
 ; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
 ; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
@@ -1262,8 +1237,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp4__cons
 ; GCN-NEXT:    v_accvgpr_write_b32 a15, v27
 ; GCN-NEXT:    s_nop 1
 ; GCN-NEXT:    v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:11], a[0:15] blgp:4
-; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    s_nop 7
+; GCN-NEXT:    s_nop 15
 ; GCN-NEXT:    s_nop 3
 ; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
 ; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
@@ -1315,8 +1289,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp0(<8 x
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; SDAG-NEXT:    s_nop 0
 ; SDAG-NEXT:    v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v32, v31 op_sel_hi:[0,0,0] cbsz:1
-; SDAG-NEXT:    s_nop 7
-; SDAG-NEXT:    s_nop 7
+; SDAG-NEXT:    s_nop 15
 ; SDAG-NEXT:    s_nop 3
 ; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
 ; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
@@ -1360,8 +1333,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp0(<8 x
 ; GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GISEL-NEXT:    s_nop 0
 ; GISEL-NEXT:    v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v31, v32 op_sel_hi:[0,0,0] cbsz:1
-; GISEL-NEXT:    s_nop 7
-; GISEL-NEXT:    s_nop 7
+; GISEL-NEXT:    s_nop 15
 ; GISEL-NEXT:    s_nop 3
 ; GISEL-NEXT:    v_accvgpr_read_b32 v0, a0
 ; GISEL-NEXT:    v_accvgpr_read_b32 v1, a1
@@ -1410,8 +1382,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp0__cons
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    s_nop 0
 ; GCN-NEXT:    v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15] cbsz:1
-; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    s_nop 7
+; GCN-NEXT:    s_nop 15
 ; GCN-NEXT:    s_nop 3
 ; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
 ; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
@@ -1463,8 +1434,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp1(<8 x
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; SDAG-NEXT:    s_nop 0
 ; SDAG-NEXT:    v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v32, v31 op_sel_hi:[0,0,0] cbsz:1 blgp:1
-; SDAG-NEXT:    s_nop 7
-; SDAG-NEXT:    s_nop 7
+; SDAG-NEXT:    s_nop 15
 ; SDAG-NEXT:    s_nop 3
 ; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
 ; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
@@ -1508,8 +1478,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp1(<8 x
 ; GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GISEL-NEXT:    s_nop 0
 ; GISEL-NEXT:    v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v31, v32 op_sel_hi:[0,0,0] cbsz:1 blgp:1
-; GISEL-NEXT:    s_nop 7
-; GISEL-NEXT:    s_nop 7
+; GISEL-NEXT:    s_nop 15
 ; GISEL-NEXT:    s_nop 3
 ; GISEL-NEXT:    v_accvgpr_read_b32 v0, a0
 ; GISEL-NEXT:    v_accvgpr_read_b32 v1, a1
@@ -1559,8 +1528,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp1__cons
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    s_nop 0
 ; GCN-NEXT:    v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15] cbsz:1 blgp:1
-; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    s_nop 7
+; GCN-NEXT:    s_nop 15
 ; GCN-NEXT:    s_nop 3
 ; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
 ; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
@@ -1611,8 +1579,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp2(<8 x
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    s_nop 0
 ; GCN-NEXT:    v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:13], a[0:15], v30, v31 op_sel_hi:[0,0,0] cbsz:1 blgp:2
-; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    s_nop 7
+; GCN-NEXT:    s_nop 15
 ; GCN-NEXT:    s_nop 3
 ; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
 ; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
@@ -1660,8 +1627,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp2__cons
 ; GCN-NEXT:    v_accvgpr_write_b32 a15, v29
 ; GCN-NEXT:    s_nop 1
 ; GCN-NEXT:    v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:13], a[0:15] cbsz:1 blgp:2
-; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    s_nop 7
+; GCN-NEXT:    s_nop 15
 ; GCN-NEXT:    s_nop 3
 ; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
 ; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
@@ -1712,8 +1678,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp3(<8 x
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    s_nop 0
 ; GCN-NEXT:    v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:13], a[0:15], v30, v31 op_sel_hi:[0,0,0] cbsz:1 blgp:3
-; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    s_nop 7
+; GCN-NEXT:    s_nop 15
 ; GCN-NEXT:    s_nop 3
 ; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
 ; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
@@ -1761,8 +1726,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp3__cons
 ; GCN-NEXT:    v_accvgpr_write_b32 a15, v29
 ; GCN-NEXT:    s_nop 1
 ; GCN-NEXT:    v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:13], a[0:15] cbsz:1 blgp:3
-; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    s_nop 7
+; GCN-NEXT:    s_nop 15
 ; GCN-NEXT:    s_nop 3
 ; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
 ; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
@@ -1811,8 +1775,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp4(<8 x
 ; GCN-NEXT:    v_accvgpr_write_b32 a15, v27
 ; GCN-NEXT:    s_nop 1
 ; GCN-NEXT:    v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:11], a[0:15], v28, v29 op_sel_hi:[0,0,0] cbsz:1 blgp:4
-; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    s_nop 7
+; GCN-NEXT:    s_nop 15
 ; GCN-NEXT:    s_nop 3
 ; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
 ; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
@@ -1860,8 +1823,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp4__cons
 ; GCN-NEXT:    v_accvgpr_write_b32 a15, v27
 ; GCN-NEXT:    s_nop 1
 ; GCN-NEXT:    v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:11], a[0:15] cbsz:1 blgp:4
-; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    s_nop 7
+; GCN-NEXT:    s_nop 15
 ; GCN-NEXT:    s_nop 3
 ; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
 ; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
@@ -1912,8 +1874,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp0(<6 x
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    s_nop 0
 ; GCN-NEXT:    v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:13], a[0:15], v30, v31 op_sel_hi:[0,0,0] cbsz:2
-; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    s_nop 7
+; GCN-NEXT:    s_nop 15
 ; GCN-NEXT:    s_nop 3
 ; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
 ; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
@@ -1961,8 +1922,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp0__cons
 ; GCN-NEXT:    v_accvgpr_write_b32 a15, v29
 ; GCN-NEXT:    s_nop 1
 ; GCN-NEXT:    v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:13], a[0:15] cbsz:2
-; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    s_nop 7
+; GCN-NEXT:    s_nop 15
 ; GCN-NEXT:    s_nop 3
 ; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
 ; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
@@ -2013,8 +1973,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp1(<6 x
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    s_nop 0
 ; GCN-NEXT:    v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:13], a[0:15], v30, v31 op_sel_hi:[0,0,0] cbsz:2 blgp:1
-; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    s_nop 7
+; GCN-NEXT:    s_nop 15
 ; GCN-NEXT:    s_nop 3
 ; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
 ; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
@@ -2062,8 +2021,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp1__cons
 ; GCN-NEXT:    v_accvgpr_write_b32 a15, v29
 ; GCN-NEXT:    s_nop 1
 ; GCN-NEXT:    v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:13], a[0:15] cbsz:2 blgp:1
-; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    s_nop 7
+; GCN-NEXT:    s_nop 15
 ; GCN-NEXT:    s_nop 3
 ; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
 ; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
@@ -2112,8 +2070,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp2(<6 x
 ; GCN-NEXT:    v_accvgpr_write_b32 a15, v27
 ; GCN-NEXT:    s_nop 1
 ; GCN-NEXT:    v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:11], a[0:15], v28, v29 op_sel_hi:[0,0,0] cbsz:2 blgp:2
-; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    s_nop 3
+; GCN-NEXT:    s_nop 11
 ; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
 ; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
 ; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
@@ -2160,8 +2117,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp2__cons
 ; GCN-NEXT:    v_accvgpr_write_b32 a15, v27
 ; GCN-NEXT:    s_nop 1
 ; GCN-NEXT:    v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:11], a[0:15] cbsz:2 blgp:2
-; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    s_nop 3
+; GCN-NEXT:    s_nop 11
 ; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
 ; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
 ; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
@@ -2209,8 +2165,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp3(<6 x
 ; GCN-NEXT:    v_accvgpr_write_b32 a15, v27
 ; GCN-NEXT:    s_nop 1
 ; GCN-NEXT:    v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:11], a[0:15], v28, v29 op_sel_hi:[0,0,0] cbsz:2 blgp:3
-; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    s_nop 3
+; GCN-NEXT:    s_nop 11
 ; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
 ; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
 ; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
@@ -2257,8 +2212,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp3__cons
 ; GCN-NEXT:    v_accvgpr_write_b32 a15, v27
 ; GCN-NEXT:    s_nop 1
 ; GCN-NEXT:    v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:11], a[0:15] cbsz:2 blgp:3
-; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    s_nop 3
+; GCN-NEXT:    s_nop 11
 ; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
 ; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
 ; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
@@ -2309,8 +2263,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp0(<6 x
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    s_nop 0
 ; GCN-NEXT:    v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:13], a[0:15], v30, v31 op_sel_hi:[0,0,0] cbsz:3
-; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    s_nop 7
+; GCN-NEXT:    s_nop 15
 ; GCN-NEXT:    s_nop 3
 ; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
 ; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
@@ -2358,8 +2311,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp0__cons
 ; GCN-NEXT:    v_accvgpr_write_b32 a15, v29
 ; GCN-NEXT:    s_nop 1
 ; GCN-NEXT:    v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:13], a[0:15] cbsz:3
-; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    s_nop 7
+; GCN-NEXT:    s_nop 15
 ; GCN-NEXT:    s_nop 3
 ; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
 ; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
@@ -2410,8 +2362,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp1(<6 x
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    s_nop 0
 ; GCN-NEXT:    v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:13], a[0:15], v30, v31 op_sel_hi:[0,0,0] cbsz:3 blgp:1
-; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    s_nop 7
+; GCN-NEXT:    s_nop 15
 ; GCN-NEXT:    s_nop 3
 ; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
 ; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
@@ -2459,8 +2410,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp1__cons
 ; GCN-NEXT:    v_accvgpr_write_b32 a15, v29
 ; GCN-NEXT:    s_nop 1
 ; GCN-NEXT:    v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:13], a[0:15] cbsz:3 blgp:1
-; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    s_nop 7
+; GCN-NEXT:    s_nop 15
 ; GCN-NEXT:    s_nop 3
 ; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
 ; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
@@ -2509,8 +2459,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp2(<6 x
 ; GCN-NEXT:    v_accvgpr_write_b32 a15, v27
 ; GCN-NEXT:    s_nop 1
 ; GCN-NEXT:    v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:11], a[0:15], v28, v29 op_sel_hi:[0,0,0] cbsz:3 blgp:2
-; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    s_nop 3
+; GCN-NEXT:    s_nop 11
 ; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
 ; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
 ; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
@@ -2557,8 +2506,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp2__cons
 ; GCN-NEXT:    v_accvgpr_write_b32 a15, v27
 ; GCN-NEXT:    s_nop 1
 ; GCN-NEXT:    v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:11], a[0:15] cbsz:3 blgp:2
-; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    s_nop 3
+; GCN-NEXT:    s_nop 11
 ; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
 ; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
 ; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
@@ -2606,8 +2554,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp4(<6 x
 ; GCN-NEXT:    v_accvgpr_write_b32 a15, v25
 ; GCN-NEXT:    s_nop 1
 ; GCN-NEXT:    v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:9], a[0:15], v26, v27 op_sel_hi:[0,0,0] cbsz:3 blgp:4
-; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    s_nop 3
+; GCN-NEXT:    s_nop 11
 ; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
 ; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
 ; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
@@ -2654,8 +2601,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp4__cons
 ; GCN-NEXT:    v_accvgpr_write_b32 a15, v25
 ; GCN-NEXT:    s_nop 1
 ; GCN-NEXT:    v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:9], a[0:15] cbsz:3 blgp:4
-; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    s_nop 3
+; GCN-NEXT:    s_nop 11
 ; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
 ; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
 ; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
@@ -2703,8 +2649,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp3(<6 x
 ; GCN-NEXT:    v_accvgpr_write_b32 a15, v27
 ; GCN-NEXT:    s_nop 1
 ; GCN-NEXT:    v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:11], a[0:15], v28, v29 op_sel_hi:[0,0,0] cbsz:3 blgp:3
-; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    s_nop 3
+; GCN-NEXT:    s_nop 11
 ; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
 ; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
 ; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
@@ -2751,8 +2696,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp3__cons
 ; GCN-NEXT:    v_accvgpr_write_b32 a15, v27
 ; GCN-NEXT:    s_nop 1
 ; GCN-NEXT:    v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:11], a[0:15] cbsz:3 blgp:3
-; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    s_nop 3
+; GCN-NEXT:    s_nop 11
 ; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
 ; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
 ; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
@@ -2800,8 +2744,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp4(<6 x
 ; GCN-NEXT:    v_accvgpr_write_b32 a15, v25
 ; GCN-NEXT:    s_nop 1
 ; GCN-NEXT:    v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:9], a[0:15], v26, v27 op_sel_hi:[0,0,0] cbsz:2 blgp:4
-; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    s_nop 3
+; GCN-NEXT:    s_nop 11
 ; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
 ; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
 ; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
@@ -2848,8 +2791,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp4__cons
 ; GCN-NEXT:    v_accvgpr_write_b32 a15, v25
 ; GCN-NEXT:    s_nop 1
 ; GCN-NEXT:    v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:9], a[0:15] cbsz:2 blgp:4
-; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    s_nop 3
+; GCN-NEXT:    s_nop 11
 ; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
 ; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
 ; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
@@ -2897,8 +2839,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp0(<4 x
 ; GCN-NEXT:    v_accvgpr_write_b32 a15, v27
 ; GCN-NEXT:    s_nop 1
 ; GCN-NEXT:    v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:3], v[4:11], a[0:15], v28, v29 op_sel_hi:[0,0,0] cbsz:4
-; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    s_nop 7
+; GCN-NEXT:    s_nop 15
 ; GCN-NEXT:    s_nop 3
 ; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
 ; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
@@ -2946,8 +2887,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp0__cons
 ; GCN-NEXT:    v_accvgpr_write_b32 a15, v27
 ; GCN-NEXT:    s_nop 1
 ; GCN-NEXT:    v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:3], v[4:11], a[0:15] cbsz:4
-; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    s_nop 7
+; GCN-NEXT:    s_nop 15
 ; GCN-NEXT:    s_nop 3
 ; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
 ; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
@@ -2996,8 +2936,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp1(<4 x
 ; GCN-NEXT:    v_accvgpr_write_b32 a15, v27
 ; GCN-NEXT:    s_nop 1
 ; GCN-NEXT:    v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:3], v[4:11], a[0:15], v28, v29 op_sel_hi:[0,0,0] cbsz:4 blgp:1
-; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    s_nop 7
+; GCN-NEXT:    s_nop 15
 ; GCN-NEXT:    s_nop 3
 ; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
 ; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
@@ -3045,8 +2984,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp1__cons
 ; GCN-NEXT:    v_accvgpr_write_b32 a15, v27
 ; GCN-NEXT:    s_nop 1
 ; GCN-NEXT:    v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:3], v[4:11], a[0:15] cbsz:4 blgp:1
-; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    s_nop 7
+; GCN-NEXT:    s_nop 15
 ; GCN-NEXT:    s_nop 3
 ; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
 ; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
@@ -3095,8 +3033,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp2(<4 x
 ; GCN-NEXT:    v_accvgpr_write_b32 a15, v25
 ; GCN-NEXT:    s_nop 1
 ; GCN-NEXT:    v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:3], v[4:9], a[0:15], v26, v27 op_sel_hi:[0,0,0] cbsz:4 blgp:2
-; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    s_nop 3
+; GCN-NEXT:    s_nop 11
 ; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
 ; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
 ; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
@@ -3143,8 +3080,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp2__cons
 ; GCN-NEXT:    v_accvgpr_write_b32 a15, v25
 ; GCN-NEXT:    s_nop 1
 ; GCN-NEXT:    v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:3], v[4:9], a[0:15] cbsz:4 blgp:2
-; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    s_nop 3
+; GCN-NEXT:    s_nop 11
 ; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
 ; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
 ; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
@@ -3192,8 +3128,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp3(<4 x
 ; GCN-NEXT:    v_accvgpr_write_b32 a15, v25
 ; GCN-NEXT:    s_nop 1
 ; GCN-NEXT:    v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:3], v[4:9], a[0:15], v26, v27 op_sel_hi:[0,0,0] cbsz:4 blgp:3
-; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    s_nop 3
+; GCN-NEXT:    s_nop 11
 ; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
 ; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
 ; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
@@ -3240,8 +3175,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp3__cons
 ; GCN-NEXT:    v_accvgpr_write_b32 a15, v25
 ; GCN-NEXT:    s_nop 1
 ; GCN-NEXT:    v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:3], v[4:9], a[0:15] cbsz:4 blgp:3
-; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    s_nop 3
+; GCN-NEXT:    s_nop 11
 ; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
 ; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
 ; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
@@ -3289,8 +3223,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp4(<4 x
 ; GCN-NEXT:    v_accvgpr_write_b32 a15, v23
 ; GCN-NEXT:    s_nop 1
 ; GCN-NEXT:    v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:3], v[4:7], a[0:15], v24, v25 op_sel_hi:[0,0,0] cbsz:4 blgp:4
-; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    s_nop 3
+; GCN-NEXT:    s_nop 11
 ; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
 ; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
 ; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
@@ -3337,8 +3270,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp4__cons
 ; GCN-NEXT:    v_accvgpr_write_b32 a15, v23
 ; GCN-NEXT:    s_nop 1
 ; GCN-NEXT:    v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:3], v[4:7], a[0:15] cbsz:4 blgp:4
-; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    s_nop 3
+; GCN-NEXT:    s_nop 11
 ; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
 ; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
 ; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
@@ -3392,8 +3324,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__sgpr_scaleA__sgpr_
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    s_nop 0
 ; GCN-NEXT:    v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v16, v17 op_sel_hi:[0,0,0]
-; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    s_nop 7
+; GCN-NEXT:    s_nop 15
 ; GCN-NEXT:    s_nop 3
 ; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
 ; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
@@ -3441,8 +3372,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__sgpr_scaleA__vgpr_
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    s_nop 0
 ; GCN-NEXT:    v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v16, v31 op_sel_hi:[0,0,0]
-; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    s_nop 7
+; GCN-NEXT:    s_nop 15
 ; GCN-NEXT:    s_nop 3
 ; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
 ; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
@@ -3490,8 +3420,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__vgpr_scaleA__sgpr_
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    s_nop 0
 ; GCN-NEXT:    v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v31, v16 op_sel_hi:[0,0,0]
-; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    s_nop 7
+; GCN-NEXT:    s_nop 15
 ; GCN-NEXT:    s_nop 3
 ; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
 ; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
@@ -3554,8 +3483,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0_sgprs(<8 x i32> inr
 ; SDAG-NEXT:    v_accvgpr_write_b32 a15, v13
 ; SDAG-NEXT:    s_nop 1
 ; SDAG-NEXT:    v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[26:33], v[18:25], a[0:15], v14, v15 op_sel_hi:[0,0,0]
-; SDAG-NEXT:    s_nop 7
-; SDAG-NEXT:    s_nop 7
+; SDAG-NEXT:    s_nop 15
 ; SDAG-NEXT:    s_nop 3
 ; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
 ; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
@@ -3610,8 +3538,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0_sgprs(<8 x i32> inr
 ; GISEL-NEXT:    v_accvgpr_write_b32 a15, v13
 ; GISEL-NEXT:    s_nop 1
 ; GISEL-NEXT:    v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[16:23], v[24:31], a[0:15], v14, v15 op_sel_hi:[0,0,0]
-; GISEL-NEXT:    s_nop 7
-; GISEL-NEXT:    s_nop 7
+; GISEL-NEXT:    s_nop 15
 ; GISEL-NEXT:    s_nop 3
 ; GISEL-NEXT:    v_accvgpr_read_b32 v0, a0
 ; GISEL-NEXT:    v_accvgpr_read_b32 v1, a1
@@ -3665,8 +3592,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0_sgpr_vgpr_vgpr__sgp
 ; SDAG-NEXT:    v_mov_b32_e32 v8, s20
 ; SDAG-NEXT:    s_nop 1
 ; SDAG-NEXT:    v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[26:33], v[0:7], a[0:15], v8, v24 op_sel_hi:[0,0,0]
-; SDAG-NEXT:    s_nop 7
-; SDAG-NEXT:    s_nop 7
+; SDAG-NEXT:    s_nop 15
 ; SDAG-NEXT:    s_nop 3
 ; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
 ; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
@@ -3716,8 +3642,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0_sgpr_vgpr_vgpr__sgp
 ; GISEL-NEXT:    v_mov_b32_e32 v8, s20
 ; GISEL-NEXT:    s_nop 1
 ; GISEL-NEXT:    v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[26:33], v[0:7], a[0:15], v8, v24 op_sel_hi:[0,0,0]
-; GISEL-NEXT:    s_nop 7
-; GISEL-NEXT:    s_nop 7
+; GISEL-NEXT:    s_nop 15
 ; GISEL-NEXT:    s_nop 3
 ; GISEL-NEXT:    v_accvgpr_read_b32 v0, a0
 ; GISEL-NEXT:    v_accvgpr_read_b32 v1, a1
@@ -3771,8 +3696,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0_sgpr_vgpr_vgpr__vgp
 ; SDAG-NEXT:    v_mov_b32_e32 v8, s20
 ; SDAG-NEXT:    s_nop 1
 ; SDAG-NEXT:    v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[26:33], v[0:7], a[0:15], v24, v8 op_sel_hi:[0,0,0]
-; SDAG-NEXT:    s_nop 7
-; SDAG-NEXT:    s_nop 7
+; SDAG-NEXT:    s_nop 15
 ; SDAG-NEXT:    s_nop 3
 ; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
 ; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
@@ -3822,8 +3746,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0_sgpr_vgpr_vgpr__vgp
 ; GISEL-NEXT:    v_mov_b32_e32 v8, s20
 ; GISEL-NEXT:    s_nop 1
 ; GISEL-NEXT:    v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[26:33], v[0:7], a[0:15], v24, v8 op_sel_hi:[0,0,0]
-; GISEL-NEXT:    s_nop 7
-; GISEL-NEXT:    s_nop 7
+; GISEL-NEXT:    s_nop 15
 ; GISEL-NEXT:    s_nop 3
 ; GISEL-NEXT:    v_accvgpr_read_b32 v0, a0
 ; GISEL-NEXT:    v_accvgpr_read_b32 v1, a1
@@ -3877,8 +3800,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0_vgpr_sgpr_vgpr__vgp
 ; SDAG-NEXT:    v_mov_b32_e32 v8, s20
 ; SDAG-NEXT:    s_nop 1
 ; SDAG-NEXT:    v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[26:33], a[0:15], v24, v8 op_sel_hi:[0,0,0]
-; SDAG-NEXT:    s_nop 7
-; SDAG-NEXT:    s_nop 7
+; SDAG-NEXT:    s_nop 15
 ; SDAG-NEXT:    s_nop 3
 ; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
 ; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
@@ -3928,8 +3850,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0_vgpr_sgpr_vgpr__vgp
 ; GISEL-NEXT:    v_mov_b32_e32 v8, s20
 ; GISEL-NEXT:    s_nop 1
 ; GISEL-NEXT:    v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[26:33], a[0:15], v24, v8 op_sel_hi:[0,0,0]
-; GISEL-NEXT:    s_nop 7
-; GISEL-NEXT:    s_nop 7
+; GISEL-NEXT:    s_nop 15
 ; GISEL-NEXT:    s_nop 3
 ; GISEL-NEXT:    v_accvgpr_read_b32 v0, a0
 ; GISEL-NEXT:    v_accvgpr_read_b32 v1, a1
@@ -3975,8 +3896,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0_vgpr_vgpr_sgpr__vgp
 ; GCN-NEXT:    v_mov_b32_e32 v17, s28
 ; GCN-NEXT:    s_nop 1
 ; GCN-NEXT:    v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v16, v17 op_sel_hi:[0,0,0]
-; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    s_nop 7
+; GCN-NEXT:    s_nop 15
 ; GCN-NEXT:    s_nop 3
 ; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
 ; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
@@ -4039,8 +3959,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0_sgpr_vgpr_sgpr__vgp
 ; SDAG-NEXT:    v_accvgpr_write_b32 a15, v13
 ; SDAG-NEXT:    s_nop 1
 ; SDAG-NEXT:    v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[26:33], v[0:7], a[0:15], v14, v15 op_sel_hi:[0,0,0]
-; SDAG-NEXT:    s_nop 7
-; SDAG-NEXT:    s_nop 7
+; SDAG-NEXT:    s_nop 15
 ; SDAG-NEXT:    s_nop 3
 ; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
 ; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
@@ -4099,8 +4018,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0_sgpr_vgpr_sgpr__vgp
 ; GISEL-NEXT:    v_accvgpr_write_b32 a15, v13
 ; GISEL-NEXT:    s_nop 1
 ; GISEL-NEXT:    v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[16:23], v[0:7], a[0:15], v14, v15 op_sel_hi:[0,0,0]
-; GISEL-NEXT:    s_nop 7
-; GISEL-NEXT:    s_nop 7
+; GISEL-NEXT:    s_nop 15
 ; GISEL-NEXT:    s_nop 3
 ; GISEL-NEXT:    v_accvgpr_read_b32 v0, a0
 ; GISEL-NEXT:    v_accvgpr_read_b32 v1, a1
@@ -4148,8 +4066,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__scaleA_inlineimm__
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; SDAG-NEXT:    s_nop 0
 ; SDAG-NEXT:    v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v32, v31 op_sel_hi:[1,1,0]
-; SDAG-NEXT:    s_nop 7
-; SDAG-NEXT:    s_nop 7
+; SDAG-NEXT:    s_nop 15
 ; SDAG-NEXT:    s_nop 3
 ; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
 ; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
@@ -4193,8 +4110,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__scaleA_inlineimm__
 ; GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GISEL-NEXT:    s_nop 0
 ; GISEL-NEXT:    v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v31, v32 op_sel_hi:[1,1,0]
-; GISEL-NEXT:    s_nop 7
-; GISEL-NEXT:    s_nop 7
+; GISEL-NEXT:    s_nop 15
 ; GISEL-NEXT:    s_nop 3
 ; GISEL-NEXT:    v_accvgpr_read_b32 v0, a0
 ; GISEL-NEXT:    v_accvgpr_read_b32 v1, a1
@@ -4242,8 +4158,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__scaleA_kimm__scale
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; SDAG-NEXT:    s_nop 0
 ; SDAG-NEXT:    v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v32, v31 op_sel_hi:[1,1,0]
-; SDAG-NEXT:    s_nop 7
-; SDAG-NEXT:    s_nop 7
+; SDAG-NEXT:    s_nop 15
 ; SDAG-NEXT:    s_nop 3
 ; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
 ; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
@@ -4287,8 +4202,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__scaleA_kimm__scale
 ; GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GISEL-NEXT:    s_nop 0
 ; GISEL-NEXT:    v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v31, v32 op_sel_hi:[1,1,0]
-; GISEL-NEXT:    s_nop 7
-; GISEL-NEXT:    s_nop 7
+; GISEL-NEXT:    s_nop 15
 ; GISEL-NEXT:    s_nop 3
 ; GISEL-NEXT:    v_accvgpr_read_b32 v0, a0
 ; GISEL-NEXT:    v_accvgpr_read_b32 v1, a1
@@ -4336,8 +4250,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__scaleA_kimm__scale
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; SDAG-NEXT:    s_nop 0
 ; SDAG-NEXT:    v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v32, v31 op_sel_hi:[1,1,0]
-; SDAG-NEXT:    s_nop 7
-; SDAG-NEXT:    s_nop 7
+; SDAG-NEXT:    s_nop 15
 ; SDAG-NEXT:    s_nop 3
 ; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
 ; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
@@ -4381,8 +4294,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__scaleA_kimm__scale
 ; GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GISEL-NEXT:    s_nop 0
 ; GISEL-NEXT:    v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v31, v32 op_sel_hi:[1,1,0]
-; GISEL-NEXT:    s_nop 7
-; GISEL-NEXT:    s_nop 7
+; GISEL-NEXT:    s_nop 15
 ; GISEL-NEXT:    s_nop 3
 ; GISEL-NEXT:    v_accvgpr_read_b32 v0, a0
 ; GISEL-NEXT:    v_accvgpr_read_b32 v1, a1
@@ -4430,8 +4342,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__scaleA_FP_literal_
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; SDAG-NEXT:    s_nop 0
 ; SDAG-NEXT:    v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v32, v31 op_sel_hi:[1,1,0]
-; SDAG-NEXT:    s_nop 7
-; SDAG-NEXT:    s_nop 7
+; SDAG-NEXT:    s_nop 15
 ; SDAG-NEXT:    s_nop 3
 ; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
 ; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
@@ -4475,8 +4386,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__scaleA_FP_literal_
 ; GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GISEL-NEXT:    s_nop 0
 ; GISEL-NEXT:    v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v31, v32 op_sel_hi:[1,1,0]
-; GISEL-NEXT:    s_nop 7
-; GISEL-NEXT:    s_nop 7
+; GISEL-NEXT:    s_nop 15
 ; GISEL-NEXT:    s_nop 3
 ; GISEL-NEXT:    v_accvgpr_read_b32 v0, a0
 ; GISEL-NEXT:    v_accvgpr_read_b32 v1, a1
@@ -4524,8 +4434,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__scaleA_FP_literal_
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; SDAG-NEXT:    s_nop 0
 ; SDAG-NEXT:    v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v32, v31 op_sel_hi:[1,1,0]
-; SDAG-NEXT:    s_nop 7
-; SDAG-NEXT:    s_nop 7
+; SDAG-NEXT:    s_nop 15
 ; SDAG-NEXT:    s_nop 3
 ; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
 ; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
@@ -4569,8 +4478,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__scaleA_FP_literal_
 ; GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GISEL-NEXT:    s_nop 0
 ; GISEL-NEXT:    v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v31, v32 op_sel_hi:[1,1,0]
-; GISEL-NEXT:    s_nop 7
-; GISEL-NEXT:    s_nop 7
+; GISEL-NEXT:    s_nop 15
 ; GISEL-NEXT:    s_nop 3
 ; GISEL-NEXT:    v_accvgpr_read_b32 v0, a0
 ; GISEL-NEXT:    v_accvgpr_read_b32 v1, a1
@@ -4618,8 +4526,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__scaleA_kimm__scale
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; SDAG-NEXT:    s_nop 0
 ; SDAG-NEXT:    v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v32, v31 op_sel_hi:[1,1,0]
-; SDAG-NEXT:    s_nop 7
-; SDAG-NEXT:    s_nop 7
+; SDAG-NEXT:    s_nop 15
 ; SDAG-NEXT:    s_nop 3
 ; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
 ; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
@@ -4663,8 +4570,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__scaleA_kimm__scale
 ; GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GISEL-NEXT:    s_nop 0
 ; GISEL-NEXT:    v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v31, v32 op_sel_hi:[1,1,0]
-; GISEL-NEXT:    s_nop 7
-; GISEL-NEXT:    s_nop 7
+; GISEL-NEXT:    s_nop 15
 ; GISEL-NEXT:    s_nop 3
 ; GISEL-NEXT:    v_accvgpr_read_b32 v0, a0
 ; GISEL-NEXT:    v_accvgpr_read_b32 v1, a1
@@ -4723,8 +4629,7 @@ define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4__vgprcd(<8 x i32>
 ; SDAG-NEXT:    s_nop 1
 ; SDAG-NEXT:    v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:23], v[24:31], v[0:15], v32, v33 op_sel:[1,1,0] op_sel_hi:[1,0,0] blgp:2
 ; SDAG-NEXT:    v_mov_b32_e32 v16, 0
-; SDAG-NEXT:    s_nop 7
-; SDAG-NEXT:    s_nop 7
+; SDAG-NEXT:    s_nop 15
 ; SDAG-NEXT:    s_nop 2
 ; SDAG-NEXT:    global_store_dwordx4 v16, v[12:15], s[2:3] offset:48
 ; SDAG-NEXT:    global_store_dwordx4 v16, v[8:11], s[2:3] offset:32
@@ -4759,8 +4664,7 @@ define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4__vgprcd(<8 x i32>
 ; GISEL-NEXT:    s_nop 1
 ; GISEL-NEXT:    v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:23], v[24:31], v[0:15], v32, v33 op_sel:[1,1,0] op_sel_hi:[1,0,0] blgp:2
 ; GISEL-NEXT:    v_mov_b32_e32 v16, 0
-; GISEL-NEXT:    s_nop 7
-; GISEL-NEXT:    s_nop 7
+; GISEL-NEXT:    s_nop 15
 ; GISEL-NEXT:    s_nop 2
 ; GISEL-NEXT:    global_store_dwordx4 v16, v[0:3], s[2:3]
 ; GISEL-NEXT:    global_store_dwordx4 v16, v[4:7], s[2:3] offset:16
@@ -4808,8 +4712,7 @@ define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4__vgprcd___scaleA_
 ; SDAG-NEXT:    s_nop 1
 ; SDAG-NEXT:    v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:23], v[24:31], v[0:15], v33, v32 op_sel:[1,1,0] op_sel_hi:[1,0,0] blgp:2
 ; SDAG-NEXT:    v_mov_b32_e32 v16, 0
-; SDAG-NEXT:    s_nop 7
-; SDAG-NEXT:    s_nop 7
+; SDAG-NEXT:    s_nop 15
 ; SDAG-NEXT:    s_nop 2
 ; SDAG-NEXT:    global_store_dwordx4 v16, v[12:15], s[0:1] offset:48
 ; SDAG-NEXT:    global_store_dwordx4 v16, v[8:11], s[0:1] offset:32
@@ -4844,8 +4747,7 @@ define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4__vgprcd___scaleA_
 ; GISEL-NEXT:    s_nop 1
 ; GISEL-NEXT:    v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:23], v[24:31], v[0:15], v32, v33 op_sel:[1,1,0] op_sel_hi:[1,0,0] blgp:2
 ; GISEL-NEXT:    v_mov_b32_e32 v16, 0
-; GISEL-NEXT:    s_nop 7
-; GISEL-NEXT:    s_nop 7
+; GISEL-NEXT:    s_nop 15
 ; GISEL-NEXT:    s_nop 2
 ; GISEL-NEXT:    global_store_dwordx4 v16, v[0:3], s[0:1]
 ; GISEL-NEXT:    global_store_dwordx4 v16, v[4:7], s[0:1] offset:16
@@ -5191,8 +5093,7 @@ define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__vgprcd_nonma
 ; SDAG-NEXT:    v_mov_b64_e32 v[16:17], s[8:9]
 ; SDAG-NEXT:    s_nop 1
 ; SDAG-NEXT:    v_mfma_f32_32x32x64_f8f6f4 v[0:15], v[32:39], v[40:47], v[16:31] blgp:2
-; SDAG-NEXT:    s_nop 7
-; SDAG-NEXT:    s_nop 6
+; SDAG-NEXT:    s_nop 14
 ; SDAG-NEXT:    v_mov_b32_e32 v16, s20
 ; SDAG-NEXT:    v_mov_b32_e32 v17, s21
 ; SDAG-NEXT:    v_mov_b32_e32 v18, s22
@@ -5440,8 +5341,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___constant_scale_0_0_a(
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    s_nop 0
 ; GCN-NEXT:    v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15]
-; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    s_nop 7
+; GCN-NEXT:    s_nop 15
 ; GCN-NEXT:    s_nop 3
 ; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
 ; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
@@ -5487,8 +5387,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___constant_scale_0_0_b(
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    s_nop 0
 ; GCN-NEXT:    v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15]
-; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    s_nop 7
+; GCN-NEXT:    s_nop 15
 ; GCN-NEXT:    s_nop 3
 ; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
 ; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
@@ -5536,8 +5435,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___constant_scale_0_1(<8
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; SDAG-NEXT:    s_nop 0
 ; SDAG-NEXT:    v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v32, v31 op_sel_hi:[0,0,0]
-; SDAG-NEXT:    s_nop 7
-; SDAG-NEXT:    s_nop 7
+; SDAG-NEXT:    s_nop 15
 ; SDAG-NEXT:    s_nop 3
 ; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
 ; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
@@ -5581,8 +5479,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___constant_scale_0_1(<8
 ; GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GISEL-NEXT:    s_nop 0
 ; GISEL-NEXT:    v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v31, v32 op_sel_hi:[0,0,0]
-; GISEL-NEXT:    s_nop 7
-; GISEL-NEXT:    s_nop 7
+; GISEL-NEXT:    s_nop 15
 ; GISEL-NEXT:    s_nop 3
 ; GISEL-NEXT:    v_accvgpr_read_b32 v0, a0
 ; GISEL-NEXT:    v_accvgpr_read_b32 v1, a1
@@ -5630,8 +5527,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___constant_scale_1_0_a(
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; SDAG-NEXT:    s_nop 0
 ; SDAG-NEXT:    v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v32, v31 op_sel_hi:[0,0,0]
-; SDAG-NEXT:    s_nop 7
-; SDAG-NEXT:    s_nop 7
+; SDAG-NEXT:    s_nop 15
 ; SDAG-NEXT:    s_nop 3
 ; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
 ; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
@@ -5675,8 +5571,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___constant_scale_1_0_a(
 ; GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GISEL-NEXT:    s_nop 0
 ; GISEL-NEXT:    v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v31, v32 op_sel_hi:[0,0,0]
-; GISEL-NEXT:    s_nop 7
-; GISEL-NEXT:    s_nop 7
+; GISEL-NEXT:    s_nop 15
 ; GISEL-NEXT:    s_nop 3
 ; GISEL-NEXT:    v_accvgpr_read_b32 v0, a0
 ; GISEL-NEXT:    v_accvgpr_read_b32 v1, a1
@@ -5728,8 +5623,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp8__v8i32_fp6(
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; SDAG-NEXT:    s_nop 0
 ; SDAG-NEXT:    v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v32, v31 op_sel_hi:[0,0,0] blgp:2
-; SDAG-NEXT:    s_nop 7
-; SDAG-NEXT:    s_nop 7
+; SDAG-NEXT:    s_nop 15
 ; SDAG-NEXT:    s_nop 3
 ; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
 ; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
@@ -5773,8 +5667,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp8__v8i32_fp6(
 ; GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GISEL-NEXT:    s_nop 0
 ; GISEL-NEXT:    v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v31, v32 op_sel_hi:[0,0,0] blgp:2
-; GISEL-NEXT:    s_nop 7
-; GISEL-NEXT:    s_nop 7
+; GISEL-NEXT:    s_nop 15
 ; GISEL-NEXT:    s_nop 3
 ; GISEL-NEXT:    v_accvgpr_read_b32 v0, a0
 ; GISEL-NEXT:    v_accvgpr_read_b32 v1, a1
@@ -5825,8 +5718,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp6__v8i32_fp8(
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; SDAG-NEXT:    s_nop 0
 ; SDAG-NEXT:    v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v32, v31 op_sel_hi:[0,0,0] cbsz:2
-; SDAG-NEXT:    s_nop 7
-; SDAG-NEXT:    s_nop 7
+; SDAG-NEXT:    s_nop 15
 ; SDAG-NEXT:    s_nop 3
 ; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
 ; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
@@ -5870,8 +5762,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp6__v8i32_fp8(
 ; GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GISEL-NEXT:    s_nop 0
 ; GISEL-NEXT:    v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v31, v32 op_sel_hi:[0,0,0] cbsz:2
-; GISEL-NEXT:    s_nop 7
-; GISEL-NEXT:    s_nop 7
+; GISEL-NEXT:    s_nop 15
 ; GISEL-NEXT:    s_nop 3
 ; GISEL-NEXT:    v_accvgpr_read_b32 v0, a0
 ; GISEL-NEXT:    v_accvgpr_read_b32 v1, a1
@@ -5922,8 +5813,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp6__v8i32_fp6(
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; SDAG-NEXT:    s_nop 0
 ; SDAG-NEXT:    v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v32, v31 op_sel_hi:[0,0,0] cbsz:2 blgp:2
-; SDAG-NEXT:    s_nop 7
-; SDAG-NEXT:    s_nop 3
+; SDAG-NEXT:    s_nop 11
 ; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
 ; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
 ; SDAG-NEXT:    v_accvgpr_read_b32 v2, a2
@@ -5966,8 +5856,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp6__v8i32_fp6(
 ; GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GISEL-NEXT:    s_nop 0
 ; GISEL-NEXT:    v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v31, v32 op_sel_hi:[0,0,0] cbsz:2 blgp:2
-; GISEL-NEXT:    s_nop 7
-; GISEL-NEXT:    s_nop 3
+; GISEL-NEXT:    s_nop 11
 ; GISEL-NEXT:    v_accvgpr_read_b32 v0, a0
 ; GISEL-NEXT:    v_accvgpr_read_b32 v1, a1
 ; GISEL-NEXT:    v_accvgpr_read_b32 v2, a2
@@ -6015,8 +5904,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp6__v8i32_fp6_
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    s_nop 0
 ; GCN-NEXT:    v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15] cbsz:2 blgp:2
-; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    s_nop 3
+; GCN-NEXT:    s_nop 11
 ; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
 ; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
 ; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
@@ -6066,8 +5954,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp8__v8i32_fp4(
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; SDAG-NEXT:    s_nop 0
 ; SDAG-NEXT:    v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v32, v31 op_sel_hi:[0,0,0] blgp:4
-; SDAG-NEXT:    s_nop 7
-; SDAG-NEXT:    s_nop 7
+; SDAG-NEXT:    s_nop 15
 ; SDAG-NEXT:    s_nop 3
 ; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
 ; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
@@ -6111,8 +5998,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp8__v8i32_fp4(
 ; GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GISEL-NEXT:    s_nop 0
 ; GISEL-NEXT:    v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v31, v32 op_sel_hi:[0,0,0] blgp:4
-; GISEL-NEXT:    s_nop 7
-; GISEL-NEXT:    s_nop 7
+; GISEL-NEXT:    s_nop 15
 ; GISEL-NEXT:    s_nop 3
 ; GISEL-NEXT:    v_accvgpr_read_b32 v0, a0
 ; GISEL-NEXT:    v_accvgpr_read_b32 v1, a1
@@ -6163,8 +6049,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp4__v8i32_fp8(
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; SDAG-NEXT:    s_nop 0
 ; SDAG-NEXT:    v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v32, v31 op_sel_hi:[0,0,0] cbsz:4
-; SDAG-NEXT:    s_nop 7
-; SDAG-NEXT:    s_nop 7
+; SDAG-NEXT:    s_nop 15
 ; SDAG-NEXT:    s_nop 3
 ; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
 ; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
@@ -6208,8 +6093,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp4__v8i32_fp8(
 ; GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GISEL-NEXT:    s_nop 0
 ; GISEL-NEXT:    v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v31, v32 op_sel_hi:[0,0,0] cbsz:4
-; GISEL-NEXT:    s_nop 7
-; GISEL-NEXT:    s_nop 7
+; GISEL-NEXT:    s_nop 15
 ; GISEL-NEXT:    s_nop 3
 ; GISEL-NEXT:    v_accvgpr_read_b32 v0, a0
 ; GISEL-NEXT:    v_accvgpr_read_b32 v1, a1
@@ -6259,8 +6143,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp8__v6i32_fp4(
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    s_nop 0
 ; GCN-NEXT:    v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:13], a[0:15], v30, v31 op_sel_hi:[0,0,0] blgp:4
-; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    s_nop 7
+; GCN-NEXT:    s_nop 15
 ; GCN-NEXT:    s_nop 3
 ; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
 ; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
@@ -6310,8 +6193,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___v6i32_fp4__v8i32_fp8(
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    s_nop 0
 ; GCN-NEXT:    v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:13], a[0:15], v30, v31 op_sel_hi:[0,0,0] cbsz:4
-; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    s_nop 7
+; GCN-NEXT:    s_nop 15
 ; GCN-NEXT:    s_nop 3
 ; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
 ; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
@@ -6362,8 +6244,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp4__v8i32_fp4(
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; SDAG-NEXT:    s_nop 0
 ; SDAG-NEXT:    v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v32, v31 op_sel_hi:[0,0,0] cbsz:4 blgp:4
-; SDAG-NEXT:    s_nop 7
-; SDAG-NEXT:    s_nop 3
+; SDAG-NEXT:    s_nop 11
 ; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
 ; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
 ; SDAG-NEXT:    v_accvgpr_read_b32 v2, a2
@@ -6406,8 +6287,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp4__v8i32_fp4(
 ; GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GISEL-NEXT:    s_nop 0
 ; GISEL-NEXT:    v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v31, v32 op_sel_hi:[0,0,0] cbsz:4 blgp:4
-; GISEL-NEXT:    s_nop 7
-; GISEL-NEXT:    s_nop 3
+; GISEL-NEXT:    s_nop 11
 ; GISEL-NEXT:    v_accvgpr_read_b32 v0, a0
 ; GISEL-NEXT:    v_accvgpr_read_b32 v1, a1
 ; GISEL-NEXT:    v_accvgpr_read_b32 v2, a2
@@ -6455,8 +6335,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp4__v8i32_fp4_
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    s_nop 0
 ; GCN-NEXT:    v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15] cbsz:4 blgp:4
-; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    s_nop 3
+; GCN-NEXT:    s_nop 11
 ; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
 ; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
 ; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.xf32.gfx942.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.xf32.gfx942.ll
index 198cac5834d1f..5475fa2ae5c6e 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.xf32.gfx942.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.xf32.gfx942.ll
@@ -133,8 +133,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x4xf32(ptr addrspace(1) %arg) #0 {
 ; GFX942-SDAG-NEXT:    s_nop 1
 ; GFX942-SDAG-NEXT:    v_mfma_f32_32x32x4_xf32 a[0:15], v[2:3], v[0:1], a[0:15] cbsz:1 abid:2 blgp:3
 ; GFX942-SDAG-NEXT:    v_mov_b32_e32 v0, 0
-; GFX942-SDAG-NEXT:    s_nop 7
-; GFX942-SDAG-NEXT:    s_nop 1
+; GFX942-SDAG-NEXT:    s_nop 9
 ; GFX942-SDAG-NEXT:    global_store_dwordx4 v0, a[12:15], s[16:17] offset:48
 ; GFX942-SDAG-NEXT:    global_store_dwordx4 v0, a[8:11], s[16:17] offset:32
 ; GFX942-SDAG-NEXT:    global_store_dwordx4 v0, a[4:7], s[16:17] offset:16
@@ -172,8 +171,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x4xf32(ptr addrspace(1) %arg) #0 {
 ; GFX942-GISEL-NEXT:    s_nop 1
 ; GFX942-GISEL-NEXT:    v_mfma_f32_32x32x4_xf32 a[0:15], v[0:1], v[2:3], a[0:15] cbsz:1 abid:2 blgp:3
 ; GFX942-GISEL-NEXT:    v_mov_b32_e32 v0, 0
-; GFX942-GISEL-NEXT:    s_nop 7
-; GFX942-GISEL-NEXT:    s_nop 1
+; GFX942-GISEL-NEXT:    s_nop 9
 ; GFX942-GISEL-NEXT:    global_store_dwordx4 v0, a[0:3], s[16:17]
 ; GFX942-GISEL-NEXT:    global_store_dwordx4 v0, a[4:7], s[16:17] offset:16
 ; GFX942-GISEL-NEXT:    global_store_dwordx4 v0, a[8:11], s[16:17] offset:32
@@ -208,8 +206,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x4xf32_vgprcd(ptr addrspace(1) %ar
 ; GFX942-SDAG-NEXT:    s_nop 1
 ; GFX942-SDAG-NEXT:    v_mfma_f32_32x32x4_xf32 v[0:15], v[16:17], v[18:19], v[0:15] cbsz:1 abid:2 blgp:3
 ; GFX942-SDAG-NEXT:    v_mov_b32_e32 v16, 0
-; GFX942-SDAG-NEXT:    s_nop 7
-; GFX942-SDAG-NEXT:    s_nop 1
+; GFX942-SDAG-NEXT:    s_nop 9
 ; GFX942-SDAG-NEXT:    global_store_dwordx4 v16, v[12:15], s[16:17] offset:48
 ; GFX942-SDAG-NEXT:    global_store_dwordx4 v16, v[8:11], s[16:17] offset:32
 ; GFX942-SDAG-NEXT:    global_store_dwordx4 v16, v[4:7], s[16:17] offset:16
@@ -239,8 +236,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x4xf32_vgprcd(ptr addrspace(1) %ar
 ; GFX942-GISEL-NEXT:    s_nop 1
 ; GFX942-GISEL-NEXT:    v_mfma_f32_32x32x4_xf32 v[0:15], v[16:17], v[18:19], v[0:15] cbsz:1 abid:2 blgp:3
 ; GFX942-GISEL-NEXT:    v_mov_b32_e32 v16, 0
-; GFX942-GISEL-NEXT:    s_nop 7
-; GFX942-GISEL-NEXT:    s_nop 1
+; GFX942-GISEL-NEXT:    s_nop 9
 ; GFX942-GISEL-NEXT:    global_store_dwordx4 v16, v[0:3], s[16:17]
 ; GFX942-GISEL-NEXT:    global_store_dwordx4 v16, v[4:7], s[16:17] offset:16
 ; GFX942-GISEL-NEXT:    global_store_dwordx4 v16, v[8:11], s[16:17] offset:32
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.iterative.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.iterative.ll
index af26e7adae713..bc72687e260e7 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.iterative.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.iterative.ll
@@ -28,8 +28,7 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_interleave(ptr
 ; GCN-MINREG-NEXT:    v_add_u32_e32 v3, 0x6000, v4
 ; GCN-MINREG-NEXT:    ; sched_group_barrier mask(0x00000100) size(8) SyncID(0)
 ; GCN-MINREG-NEXT:    ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
-; GCN-MINREG-NEXT:    s_nop 7
-; GCN-MINREG-NEXT:    s_nop 7
+; GCN-MINREG-NEXT:    s_nop 15
 ; GCN-MINREG-NEXT:    ds_write_b128 v5, a[28:31] offset:112
 ; GCN-MINREG-NEXT:    ds_write_b128 v5, a[24:27] offset:96
 ; GCN-MINREG-NEXT:    ds_write_b128 v5, a[20:23] offset:80
@@ -51,8 +50,7 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_interleave(ptr
 ; GCN-MINREG-NEXT:    ; sched_group_barrier mask(0x00000200) size(8) SyncID(0)
 ; GCN-MINREG-NEXT:    ; sched_group_barrier mask(0x00000100) size(8) SyncID(0)
 ; GCN-MINREG-NEXT:    ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
-; GCN-MINREG-NEXT:    s_nop 7
-; GCN-MINREG-NEXT:    s_nop 7
+; GCN-MINREG-NEXT:    s_nop 15
 ; GCN-MINREG-NEXT:    s_nop 2
 ; GCN-MINREG-NEXT:    ds_write_b128 v0, a[24:27] offset:8288
 ; GCN-MINREG-NEXT:    ds_write_b128 v0, a[28:31] offset:8304
@@ -75,8 +73,7 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_interleave(ptr
 ; GCN-MINREG-NEXT:    ; sched_group_barrier mask(0x00000200) size(8) SyncID(0)
 ; GCN-MINREG-NEXT:    ; sched_group_barrier mask(0x00000100) size(8) SyncID(0)
 ; GCN-MINREG-NEXT:    ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
-; GCN-MINREG-NEXT:    s_nop 7
-; GCN-MINREG-NEXT:    s_nop 7
+; GCN-MINREG-NEXT:    s_nop 15
 ; GCN-MINREG-NEXT:    s_nop 2
 ; GCN-MINREG-NEXT:    ds_write_b128 v0, a[24:27] offset:16480
 ; GCN-MINREG-NEXT:    ds_write_b128 v0, a[28:31] offset:16496
@@ -99,8 +96,7 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_interleave(ptr
 ; GCN-MINREG-NEXT:    ; sched_group_barrier mask(0x00000200) size(8) SyncID(0)
 ; GCN-MINREG-NEXT:    ; sched_group_barrier mask(0x00000100) size(8) SyncID(0)
 ; GCN-MINREG-NEXT:    ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
-; GCN-MINREG-NEXT:    s_nop 7
-; GCN-MINREG-NEXT:    s_nop 7
+; GCN-MINREG-NEXT:    s_nop 15
 ; GCN-MINREG-NEXT:    s_nop 2
 ; GCN-MINREG-NEXT:    ds_write_b128 v0, a[24:27] offset:24672
 ; GCN-MINREG-NEXT:    ds_write_b128 v0, a[28:31] offset:24688
@@ -123,8 +119,7 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_interleave(ptr
 ; GCN-MINREG-NEXT:    ; sched_group_barrier mask(0x00000200) size(8) SyncID(0)
 ; GCN-MINREG-NEXT:    ; sched_group_barrier mask(0x00000100) size(8) SyncID(0)
 ; GCN-MINREG-NEXT:    ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
-; GCN-MINREG-NEXT:    s_nop 7
-; GCN-MINREG-NEXT:    s_nop 7
+; GCN-MINREG-NEXT:    s_nop 15
 ; GCN-MINREG-NEXT:    s_nop 2
 ; GCN-MINREG-NEXT:    ds_write_b128 v0, a[24:27] offset:32864
 ; GCN-MINREG-NEXT:    ds_write_b128 v0, a[28:31] offset:32880
@@ -159,8 +154,7 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_interleave(ptr
 ; GCN-MAXOCC-NEXT:    v_add_u32_e32 v1, s1, v1
 ; GCN-MAXOCC-NEXT:    ; sched_group_barrier mask(0x00000100) size(8) SyncID(0)
 ; GCN-MAXOCC-NEXT:    ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
-; GCN-MAXOCC-NEXT:    s_nop 7
-; GCN-MAXOCC-NEXT:    s_nop 7
+; GCN-MAXOCC-NEXT:    s_nop 15
 ; GCN-MAXOCC-NEXT:    s_nop 1
 ; GCN-MAXOCC-NEXT:    ds_write_b128 v1, a[28:31] offset:112
 ; GCN-MAXOCC-NEXT:    ds_write_b128 v1, a[24:27] offset:96
@@ -184,8 +178,7 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_interleave(ptr
 ; GCN-MAXOCC-NEXT:    ; sched_group_barrier mask(0x00000200) size(8) SyncID(0)
 ; GCN-MAXOCC-NEXT:    ; sched_group_barrier mask(0x00000100) size(8) SyncID(0)
 ; GCN-MAXOCC-NEXT:    ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
-; GCN-MAXOCC-NEXT:    s_nop 7
-; GCN-MAXOCC-NEXT:    s_nop 7
+; GCN-MAXOCC-NEXT:    s_nop 15
 ; GCN-MAXOCC-NEXT:    s_nop 1
 ; GCN-MAXOCC-NEXT:    ds_write_b128 v1, a[24:27] offset:8288
 ; GCN-MAXOCC-NEXT:    ds_write_b128 v1, a[28:31] offset:8304
@@ -208,8 +201,7 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_interleave(ptr
 ; GCN-MAXOCC-NEXT:    ; sched_group_barrier mask(0x00000200) size(8) SyncID(0)
 ; GCN-MAXOCC-NEXT:    ; sched_group_barrier mask(0x00000100) size(8) SyncID(0)
 ; GCN-MAXOCC-NEXT:    ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
-; GCN-MAXOCC-NEXT:    s_nop 7
-; GCN-MAXOCC-NEXT:    s_nop 7
+; GCN-MAXOCC-NEXT:    s_nop 15
 ; GCN-MAXOCC-NEXT:    s_nop 2
 ; GCN-MAXOCC-NEXT:    ds_write_b128 v1, a[24:27] offset:16480
 ; GCN-MAXOCC-NEXT:    ds_write_b128 v1, a[28:31] offset:16496
@@ -233,8 +225,7 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_interleave(ptr
 ; GCN-MAXOCC-NEXT:    ; sched_group_barrier mask(0x00000200) size(8) SyncID(0)
 ; GCN-MAXOCC-NEXT:    ; sched_group_barrier mask(0x00000100) size(8) SyncID(0)
 ; GCN-MAXOCC-NEXT:    ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
-; GCN-MAXOCC-NEXT:    s_nop 7
-; GCN-MAXOCC-NEXT:    s_nop 7
+; GCN-MAXOCC-NEXT:    s_nop 15
 ; GCN-MAXOCC-NEXT:    s_nop 1
 ; GCN-MAXOCC-NEXT:    ds_write_b128 v1, a[24:27] offset:24672
 ; GCN-MAXOCC-NEXT:    ds_write_b128 v1, a[28:31] offset:24688
@@ -257,8 +248,7 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_interleave(ptr
 ; GCN-MAXOCC-NEXT:    ; sched_group_barrier mask(0x00000200) size(8) SyncID(0)
 ; GCN-MAXOCC-NEXT:    ; sched_group_barrier mask(0x00000100) size(8) SyncID(0)
 ; GCN-MAXOCC-NEXT:    ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
-; GCN-MAXOCC-NEXT:    s_nop 7
-; GCN-MAXOCC-NEXT:    s_nop 7
+; GCN-MAXOCC-NEXT:    s_nop 15
 ; GCN-MAXOCC-NEXT:    s_nop 2
 ; GCN-MAXOCC-NEXT:    ds_write_b128 v1, a[24:27] offset:32864
 ; GCN-MAXOCC-NEXT:    ds_write_b128 v1, a[28:31] offset:32880
@@ -293,8 +283,7 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_interleave(ptr
 ; GCN-ILP-NEXT:    v_add_u32_e32 v0, s1, v0
 ; GCN-ILP-NEXT:    ; sched_group_barrier mask(0x00000100) size(8) SyncID(0)
 ; GCN-ILP-NEXT:    ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
-; GCN-ILP-NEXT:    s_nop 7
-; GCN-ILP-NEXT:    s_nop 7
+; GCN-ILP-NEXT:    s_nop 15
 ; GCN-ILP-NEXT:    s_nop 1
 ; GCN-ILP-NEXT:    ds_write_b128 v0, a[28:31] offset:112
 ; GCN-ILP-NEXT:    ds_write_b128 v0, a[24:27] offset:96
@@ -315,8 +304,7 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_interleave(ptr
 ; GCN-ILP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-ILP-NEXT:    v_mfma_f32_32x32x1f32 a[0:31], v1, v2, a[0:31]
 ; GCN-ILP-NEXT:    v_mov_b32_e32 v0, s1
-; GCN-ILP-NEXT:    s_nop 7
-; GCN-ILP-NEXT:    s_nop 7
+; GCN-ILP-NEXT:    s_nop 15
 ; GCN-ILP-NEXT:    s_nop 1
 ; GCN-ILP-NEXT:    ds_write_b128 v0, a[24:27] offset:8288
 ; GCN-ILP-NEXT:    ds_write_b128 v0, a[28:31] offset:8304
@@ -336,8 +324,7 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_interleave(ptr
 ; GCN-ILP-NEXT:    ds_read_b128 a[28:31], v3 offset:24688
 ; GCN-ILP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-ILP-NEXT:    v_mfma_f32_32x32x1f32 a[0:31], v1, v2, a[0:31]
-; GCN-ILP-NEXT:    s_nop 7
-; GCN-ILP-NEXT:    s_nop 7
+; GCN-ILP-NEXT:    s_nop 15
 ; GCN-ILP-NEXT:    s_nop 2
 ; GCN-ILP-NEXT:    ds_write_b128 v0, a[4:7] offset:16400
 ; GCN-ILP-NEXT:    ds_read_b128 a[4:7], v3 offset:49168
@@ -358,8 +345,7 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_interleave(ptr
 ; GCN-ILP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-ILP-NEXT:    v_mfma_f32_32x32x1f32 a[0:31], v1, v2, a[0:31]
 ; GCN-ILP-NEXT:    v_add_u32_e32 v3, 0x6000, v3
-; GCN-ILP-NEXT:    s_nop 7
-; GCN-ILP-NEXT:    s_nop 7
+; GCN-ILP-NEXT:    s_nop 15
 ; GCN-ILP-NEXT:    s_nop 1
 ; GCN-ILP-NEXT:    ds_write_b128 v0, a[4:7] offset:24592
 ; GCN-ILP-NEXT:    ds_read_b128 a[4:7], v3 offset:57360
@@ -383,8 +369,7 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_interleave(ptr
 ; GCN-ILP-NEXT:    ; sched_group_barrier mask(0x00000200) size(8) SyncID(0)
 ; GCN-ILP-NEXT:    ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
 ; GCN-ILP-NEXT:    ; sched_group_barrier mask(0x00000100) size(8) SyncID(0)
-; GCN-ILP-NEXT:    s_nop 7
-; GCN-ILP-NEXT:    s_nop 7
+; GCN-ILP-NEXT:    s_nop 15
 ; GCN-ILP-NEXT:    s_nop 2
 ; GCN-ILP-NEXT:    ds_write_b128 v0, a[24:27] offset:32864
 ; GCN-ILP-NEXT:    ds_write_b128 v0, a[28:31] offset:32880
@@ -488,8 +473,7 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_interleave_spl
 ; GCN-MINREG-NEXT:    v_add_u32_e32 v2, s1, v2
 ; GCN-MINREG-NEXT:    ; sched_group_barrier mask(0x00000100) size(8) SyncID(0)
 ; GCN-MINREG-NEXT:    ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
-; GCN-MINREG-NEXT:    s_nop 7
-; GCN-MINREG-NEXT:    s_nop 7
+; GCN-MINREG-NEXT:    s_nop 15
 ; GCN-MINREG-NEXT:    s_nop 1
 ; GCN-MINREG-NEXT:    ds_write_b128 v2, a[28:31] offset:112
 ; GCN-MINREG-NEXT:    ds_write_b128 v2, a[24:27] offset:96
@@ -513,8 +497,7 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_interleave_spl
 ; GCN-MINREG-NEXT:    ; sched_group_barrier mask(0x00000200) size(8) SyncID(0)
 ; GCN-MINREG-NEXT:    ; sched_group_barrier mask(0x00000100) size(8) SyncID(0)
 ; GCN-MINREG-NEXT:    ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
-; GCN-MINREG-NEXT:    s_nop 7
-; GCN-MINREG-NEXT:    s_nop 7
+; GCN-MINREG-NEXT:    s_nop 15
 ; GCN-MINREG-NEXT:    s_nop 1
 ; GCN-MINREG-NEXT:    ds_write_b128 v2, a[24:27] offset:8288
 ; GCN-MINREG-NEXT:    ds_write_b128 v2, a[28:31] offset:8304
@@ -539,8 +522,7 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_interleave_spl
 ; GCN-MINREG-NEXT:    v_add_u32_e32 v4, 0x6000, v3
 ; GCN-MINREG-NEXT:    ; sched_group_barrier mask(0x00000100) size(8) SyncID(0)
 ; GCN-MINREG-NEXT:    ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
-; GCN-MINREG-NEXT:    s_nop 7
-; GCN-MINREG-NEXT:    s_nop 7
+; GCN-MINREG-NEXT:    s_nop 15
 ; GCN-MINREG-NEXT:    s_nop 1
 ; GCN-MINREG-NEXT:    ds_write_b128 v2, a[28:31] offset:16496
 ; GCN-MINREG-NEXT:    ds_write_b128 v2, a[24:27] offset:16480
@@ -563,8 +545,7 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_interleave_spl
 ; GCN-MINREG-NEXT:    ; sched_group_barrier mask(0x00000200) size(8) SyncID(0)
 ; GCN-MINREG-NEXT:    ; sched_group_barrier mask(0x00000100) size(8) SyncID(0)
 ; GCN-MINREG-NEXT:    ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
-; GCN-MINREG-NEXT:    s_nop 7
-; GCN-MINREG-NEXT:    s_nop 7
+; GCN-MINREG-NEXT:    s_nop 15
 ; GCN-MINREG-NEXT:    s_nop 2
 ; GCN-MINREG-NEXT:    ds_write_b128 v2, a[28:31] offset:24688
 ; GCN-MINREG-NEXT:    ds_write_b128 v2, a[24:27] offset:24672
@@ -587,8 +568,7 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_interleave_spl
 ; GCN-MINREG-NEXT:    ; sched_group_barrier mask(0x00000200) size(8) SyncID(0)
 ; GCN-MINREG-NEXT:    ; sched_group_barrier mask(0x00000100) size(8) SyncID(0)
 ; GCN-MINREG-NEXT:    ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
-; GCN-MINREG-NEXT:    s_nop 7
-; GCN-MINREG-NEXT:    s_nop 7
+; GCN-MINREG-NEXT:    s_nop 15
 ; GCN-MINREG-NEXT:    s_nop 2
 ; GCN-MINREG-NEXT:    ds_write_b128 v2, a[28:31] offset:32880
 ; GCN-MINREG-NEXT:    ds_write_b128 v2, a[24:27] offset:32864
@@ -623,8 +603,7 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_interleave_spl
 ; GCN-MAXOCC-NEXT:    v_add_u32_e32 v3, s1, v3
 ; GCN-MAXOCC-NEXT:    ; sched_group_barrier mask(0x00000100) size(8) SyncID(0)
 ; GCN-MAXOCC-NEXT:    ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
-; GCN-MAXOCC-NEXT:    s_nop 7
-; GCN-MAXOCC-NEXT:    s_nop 7
+; GCN-MAXOCC-NEXT:    s_nop 15
 ; GCN-MAXOCC-NEXT:    s_nop 1
 ; GCN-MAXOCC-NEXT:    ds_write_b128 v3, a[28:31] offset:112
 ; GCN-MAXOCC-NEXT:    ds_write_b128 v3, a[24:27] offset:96
@@ -648,8 +627,7 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_interleave_spl
 ; GCN-MAXOCC-NEXT:    ; sched_group_barrier mask(0x00000200) size(8) SyncID(0)
 ; GCN-MAXOCC-NEXT:    ; sched_group_barrier mask(0x00000100) size(8) SyncID(0)
 ; GCN-MAXOCC-NEXT:    ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
-; GCN-MAXOCC-NEXT:    s_nop 7
-; GCN-MAXOCC-NEXT:    s_nop 7
+; GCN-MAXOCC-NEXT:    s_nop 15
 ; GCN-MAXOCC-NEXT:    s_nop 1
 ; GCN-MAXOCC-NEXT:    ds_write_b128 v3, a[24:27] offset:8288
 ; GCN-MAXOCC-NEXT:    ds_write_b128 v3, a[28:31] offset:8304
@@ -673,8 +651,7 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_interleave_spl
 ; GCN-MAXOCC-NEXT:    v_mfma_f32_32x32x1f32 a[0:31], v1, v2, a[0:31]
 ; GCN-MAXOCC-NEXT:    ; sched_group_barrier mask(0x00000100) size(8) SyncID(0)
 ; GCN-MAXOCC-NEXT:    ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
-; GCN-MAXOCC-NEXT:    s_nop 7
-; GCN-MAXOCC-NEXT:    s_nop 7
+; GCN-MAXOCC-NEXT:    s_nop 15
 ; GCN-MAXOCC-NEXT:    s_nop 2
 ; GCN-MAXOCC-NEXT:    ds_write_b128 v3, a[28:31] offset:16496
 ; GCN-MAXOCC-NEXT:    ds_write_b128 v3, a[24:27] offset:16480
@@ -698,8 +675,7 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_interleave_spl
 ; GCN-MAXOCC-NEXT:    ; sched_group_barrier mask(0x00000200) size(8) SyncID(0)
 ; GCN-MAXOCC-NEXT:    ; sched_group_barrier mask(0x00000100) size(8) SyncID(0)
 ; GCN-MAXOCC-NEXT:    ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
-; GCN-MAXOCC-NEXT:    s_nop 7
-; GCN-MAXOCC-NEXT:    s_nop 7
+; GCN-MAXOCC-NEXT:    s_nop 15
 ; GCN-MAXOCC-NEXT:    s_nop 1
 ; GCN-MAXOCC-NEXT:    ds_write_b128 v3, a[28:31] offset:24688
 ; GCN-MAXOCC-NEXT:    ds_write_b128 v3, a[24:27] offset:24672
@@ -722,8 +698,7 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_interleave_spl
 ; GCN-MAXOCC-NEXT:    ; sched_group_barrier mask(0x00000200) size(8) SyncID(0)
 ; GCN-MAXOCC-NEXT:    ; sched_group_barrier mask(0x00000100) size(8) SyncID(0)
 ; GCN-MAXOCC-NEXT:    ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
-; GCN-MAXOCC-NEXT:    s_nop 7
-; GCN-MAXOCC-NEXT:    s_nop 7
+; GCN-MAXOCC-NEXT:    s_nop 15
 ; GCN-MAXOCC-NEXT:    s_nop 2
 ; GCN-MAXOCC-NEXT:    ds_write_b128 v3, a[28:31] offset:32880
 ; GCN-MAXOCC-NEXT:    ds_write_b128 v3, a[24:27] offset:32864
@@ -758,8 +733,7 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_interleave_spl
 ; GCN-ILP-NEXT:    v_add_u32_e32 v2, s1, v2
 ; GCN-ILP-NEXT:    ; sched_group_barrier mask(0x00000100) size(8) SyncID(0)
 ; GCN-ILP-NEXT:    ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
-; GCN-ILP-NEXT:    s_nop 7
-; GCN-ILP-NEXT:    s_nop 7
+; GCN-ILP-NEXT:    s_nop 15
 ; GCN-ILP-NEXT:    s_nop 1
 ; GCN-ILP-NEXT:    ds_write_b128 v2, a[0:3]
 ; GCN-ILP-NEXT:    ds_read_b128 a[0:3], v3 offset:8192
@@ -783,8 +757,7 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_interleave_spl
 ; GCN-ILP-NEXT:    ; sched_group_barrier mask(0x00000100) size(8) SyncID(0)
 ; GCN-ILP-NEXT:    ; sched_group_barrier mask(0x00000200) size(8) SyncID(0)
 ; GCN-ILP-NEXT:    ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
-; GCN-ILP-NEXT:    s_nop 7
-; GCN-ILP-NEXT:    s_nop 7
+; GCN-ILP-NEXT:    s_nop 15
 ; GCN-ILP-NEXT:    s_nop 1
 ; GCN-ILP-NEXT:    ds_write_b128 v2, a[24:27] offset:8288
 ; GCN-ILP-NEXT:    ds_write_b128 v2, a[28:31] offset:8304
@@ -808,8 +781,7 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_interleave_spl
 ; GCN-ILP-NEXT:    v_mfma_f32_32x32x1f32 a[0:31], v0, v1, a[0:31]
 ; GCN-ILP-NEXT:    ; sched_group_barrier mask(0x00000100) size(8) SyncID(0)
 ; GCN-ILP-NEXT:    ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
-; GCN-ILP-NEXT:    s_nop 7
-; GCN-ILP-NEXT:    s_nop 7
+; GCN-ILP-NEXT:    s_nop 15
 ; GCN-ILP-NEXT:    s_nop 2
 ; GCN-ILP-NEXT:    ds_write_b128 v2, a[28:31] offset:16496
 ; GCN-ILP-NEXT:    ds_write_b128 v2, a[24:27] offset:16480
@@ -830,8 +802,7 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_interleave_spl
 ; GCN-ILP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-ILP-NEXT:    v_mfma_f32_32x32x1f32 a[0:31], v0, v1, a[0:31]
 ; GCN-ILP-NEXT:    v_add_u32_e32 v3, 0x6000, v3
-; GCN-ILP-NEXT:    s_nop 7
-; GCN-ILP-NEXT:    s_nop 7
+; GCN-ILP-NEXT:    s_nop 15
 ; GCN-ILP-NEXT:    s_nop 1
 ; GCN-ILP-NEXT:    ds_write_b128 v2, a[0:3] offset:24576
 ; GCN-ILP-NEXT:    ds_read_b128 a[0:3], v3 offset:57344
@@ -855,8 +826,7 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_interleave_spl
 ; GCN-ILP-NEXT:    ; sched_group_barrier mask(0x00000200) size(8) SyncID(0)
 ; GCN-ILP-NEXT:    ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
 ; GCN-ILP-NEXT:    ; sched_group_barrier mask(0x00000100) size(8) SyncID(0)
-; GCN-ILP-NEXT:    s_nop 7
-; GCN-ILP-NEXT:    s_nop 7
+; GCN-ILP-NEXT:    s_nop 15
 ; GCN-ILP-NEXT:    s_nop 2
 ; GCN-ILP-NEXT:    ds_write_b128 v2, a[28:31] offset:32880
 ; GCN-ILP-NEXT:    ds_write_b128 v2, a[24:27] offset:32864
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.ll
index 5b877f5a2bbb7..aa099b60ef16d 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.ll
@@ -678,8 +678,7 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_cluster(ptr ad
 ; GCN-NEXT:    v_mfma_f32_32x32x1f32 a[64:95], v1, v2, a[64:95]
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    v_mfma_f32_32x32x1f32 a[32:63], v1, v2, a[32:63]
-; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    s_nop 4
+; GCN-NEXT:    s_nop 12
 ; GCN-NEXT:    ds_write_b128 v0, a[156:159] offset:112
 ; GCN-NEXT:    ds_write_b128 v0, a[152:155] offset:96
 ; GCN-NEXT:    ds_write_b128 v0, a[148:151] offset:80
@@ -785,8 +784,7 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_cluster(ptr ad
 ; EXACTCUTOFF-NEXT:    v_mfma_f32_32x32x1f32 a[64:95], v1, v2, a[64:95]
 ; EXACTCUTOFF-NEXT:    s_waitcnt lgkmcnt(0)
 ; EXACTCUTOFF-NEXT:    v_mfma_f32_32x32x1f32 a[32:63], v1, v2, a[32:63]
-; EXACTCUTOFF-NEXT:    s_nop 7
-; EXACTCUTOFF-NEXT:    s_nop 4
+; EXACTCUTOFF-NEXT:    s_nop 12
 ; EXACTCUTOFF-NEXT:    ds_write_b128 v0, a[156:159] offset:112
 ; EXACTCUTOFF-NEXT:    ds_write_b128 v0, a[152:155] offset:96
 ; EXACTCUTOFF-NEXT:    ds_write_b128 v0, a[148:151] offset:80
@@ -890,8 +888,7 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_interleave(ptr
 ; GCN-NEXT:    v_add_u32_e32 v0, s1, v0
 ; GCN-NEXT:    ; sched_group_barrier mask(0x00000100) size(8) SyncID(0)
 ; GCN-NEXT:    ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
-; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    s_nop 7
+; GCN-NEXT:    s_nop 15
 ; GCN-NEXT:    s_nop 1
 ; GCN-NEXT:    ds_write_b128 v0, a[28:31] offset:112
 ; GCN-NEXT:    ds_write_b128 v0, a[24:27] offset:96
@@ -915,8 +912,7 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_interleave(ptr
 ; GCN-NEXT:    ; sched_group_barrier mask(0x00000200) size(8) SyncID(0)
 ; GCN-NEXT:    ; sched_group_barrier mask(0x00000100) size(8) SyncID(0)
 ; GCN-NEXT:    ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
-; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    s_nop 7
+; GCN-NEXT:    s_nop 15
 ; GCN-NEXT:    s_nop 1
 ; GCN-NEXT:    ds_write_b128 v0, a[24:27] offset:8288
 ; GCN-NEXT:    ds_write_b128 v0, a[28:31] offset:8304
@@ -939,8 +935,7 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_interleave(ptr
 ; GCN-NEXT:    ; sched_group_barrier mask(0x00000200) size(8) SyncID(0)
 ; GCN-NEXT:    ; sched_group_barrier mask(0x00000100) size(8) SyncID(0)
 ; GCN-NEXT:    ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
-; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    s_nop 7
+; GCN-NEXT:    s_nop 15
 ; GCN-NEXT:    s_nop 2
 ; GCN-NEXT:    ds_write_b128 v0, a[24:27] offset:16480
 ; GCN-NEXT:    ds_write_b128 v0, a[28:31] offset:16496
@@ -964,8 +959,7 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_interleave(ptr
 ; GCN-NEXT:    ; sched_group_barrier mask(0x00000200) size(8) SyncID(0)
 ; GCN-NEXT:    ; sched_group_barrier mask(0x00000100) size(8) SyncID(0)
 ; GCN-NEXT:    ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
-; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    s_nop 7
+; GCN-NEXT:    s_nop 15
 ; GCN-NEXT:    s_nop 1
 ; GCN-NEXT:    ds_write_b128 v0, a[24:27] offset:24672
 ; GCN-NEXT:    ds_write_b128 v0, a[28:31] offset:24688
@@ -988,8 +982,7 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_interleave(ptr
 ; GCN-NEXT:    ; sched_group_barrier mask(0x00000200) size(8) SyncID(0)
 ; GCN-NEXT:    ; sched_group_barrier mask(0x00000100) size(8) SyncID(0)
 ; GCN-NEXT:    ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
-; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    s_nop 7
+; GCN-NEXT:    s_nop 15
 ; GCN-NEXT:    s_nop 2
 ; GCN-NEXT:    ds_write_b128 v0, a[24:27] offset:32864
 ; GCN-NEXT:    ds_write_b128 v0, a[28:31] offset:32880
@@ -1024,8 +1017,7 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_interleave(ptr
 ; EXACTCUTOFF-NEXT:    v_add_u32_e32 v0, s1, v0
 ; EXACTCUTOFF-NEXT:    ; sched_group_barrier mask(0x00000100) size(8) SyncID(0)
 ; EXACTCUTOFF-NEXT:    ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
-; EXACTCUTOFF-NEXT:    s_nop 7
-; EXACTCUTOFF-NEXT:    s_nop 7
+; EXACTCUTOFF-NEXT:    s_nop 15
 ; EXACTCUTOFF-NEXT:    s_nop 1
 ; EXACTCUTOFF-NEXT:    ds_write_b128 v0, a[28:31] offset:112
 ; EXACTCUTOFF-NEXT:    ds_write_b128 v0, a[24:27] offset:96
@@ -1049,8 +1041,7 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_interleave(ptr
 ; EXACTCUTOFF-NEXT:    ; sched_group_barrier mask(0x00000200) size(8) SyncID(0)
 ; EXACTCUTOFF-NEXT:    ; sched_group_barrier mask(0x00000100) size(8) SyncID(0)
 ; EXACTCUTOFF-NEXT:    ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
-; EXACTCUTOFF-NEXT:    s_nop 7
-; EXACTCUTOFF-NEXT:    s_nop 7
+; EXACTCUTOFF-NEXT:    s_nop 15
 ; EXACTCUTOFF-NEXT:    s_nop 1
 ; EXACTCUTOFF-NEXT:    ds_write_b128 v0, a[24:27] offset:8288
 ; EXACTCUTOFF-NEXT:    ds_write_b128 v0, a[28:31] offset:8304
@@ -1073,8 +1064,7 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_interleave(ptr
 ; EXACTCUTOFF-NEXT:    ; sched_group_barrier mask(0x00000200) size(8) SyncID(0)
 ; EXACTCUTOFF-NEXT:    ; sched_group_barrier mask(0x00000100) size(8) SyncID(0)
 ; EXACTCUTOFF-NEXT:    ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
-; EXACTCUTOFF-NEXT:    s_nop 7
-; EXACTCUTOFF-NEXT:    s_nop 7
+; EXACTCUTOFF-NEXT:    s_nop 15
 ; EXACTCUTOFF-NEXT:    s_nop 2
 ; EXACTCUTOFF-NEXT:    ds_write_b128 v0, a[24:27] offset:16480
 ; EXACTCUTOFF-NEXT:    ds_write_b128 v0, a[28:31] offset:16496
@@ -1098,8 +1088,7 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_interleave(ptr
 ; EXACTCUTOFF-NEXT:    ; sched_group_barrier mask(0x00000200) size(8) SyncID(0)
 ; EXACTCUTOFF-NEXT:    ; sched_group_barrier mask(0x00000100) size(8) SyncID(0)
 ; EXACTCUTOFF-NEXT:    ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
-; EXACTCUTOFF-NEXT:    s_nop 7
-; EXACTCUTOFF-NEXT:    s_nop 7
+; EXACTCUTOFF-NEXT:    s_nop 15
 ; EXACTCUTOFF-NEXT:    s_nop 1
 ; EXACTCUTOFF-NEXT:    ds_write_b128 v0, a[24:27] offset:24672
 ; EXACTCUTOFF-NEXT:    ds_write_b128 v0, a[28:31] offset:24688
@@ -1122,8 +1111,7 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_interleave(ptr
 ; EXACTCUTOFF-NEXT:    ; sched_group_barrier mask(0x00000200) size(8) SyncID(0)
 ; EXACTCUTOFF-NEXT:    ; sched_group_barrier mask(0x00000100) size(8) SyncID(0)
 ; EXACTCUTOFF-NEXT:    ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
-; EXACTCUTOFF-NEXT:    s_nop 7
-; EXACTCUTOFF-NEXT:    s_nop 7
+; EXACTCUTOFF-NEXT:    s_nop 15
 ; EXACTCUTOFF-NEXT:    s_nop 2
 ; EXACTCUTOFF-NEXT:    ds_write_b128 v0, a[24:27] offset:32864
 ; EXACTCUTOFF-NEXT:    ds_write_b128 v0, a[28:31] offset:32880
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.smfmac.gfx950.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.smfmac.gfx950.ll
index b25fe8392a60e..6eb9449069a52 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.smfmac.gfx950.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.smfmac.gfx950.ll
@@ -199,8 +199,7 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x32_f16__vgpr(ptr addrspace(1) %
 ; SDAG-NEXT:    s_nop 0
 ; SDAG-NEXT:    v_smfmac_f32_32x32x32_f16 v[0:15], v[24:27], v[16:23], v28 cbsz:1 abid:2
 ; SDAG-NEXT:    v_mov_b32_e32 v16, 0
-; SDAG-NEXT:    s_nop 7
-; SDAG-NEXT:    s_nop 2
+; SDAG-NEXT:    s_nop 10
 ; SDAG-NEXT:    global_store_dwordx4 v16, v[8:11], s[6:7] offset:32
 ; SDAG-NEXT:    global_store_dwordx4 v16, v[12:15], s[6:7] offset:48
 ; SDAG-NEXT:    global_store_dwordx4 v16, v[0:3], s[6:7]
@@ -232,8 +231,7 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x32_f16__vgpr(ptr addrspace(1) %
 ; GISEL-NEXT:    s_nop 0
 ; GISEL-NEXT:    v_smfmac_f32_32x32x32_f16 v[0:15], v[24:27], v[16:23], v28 cbsz:1 abid:2
 ; GISEL-NEXT:    v_mov_b32_e32 v16, 0
-; GISEL-NEXT:    s_nop 7
-; GISEL-NEXT:    s_nop 2
+; GISEL-NEXT:    s_nop 10
 ; GISEL-NEXT:    global_store_dwordx4 v16, v[0:3], s[6:7]
 ; GISEL-NEXT:    global_store_dwordx4 v16, v[4:7], s[6:7] offset:16
 ; GISEL-NEXT:    global_store_dwordx4 v16, v[8:11], s[6:7] offset:32
@@ -253,8 +251,7 @@ define <16 x float> @test_smfmac_f32_32x32x32_f16(<8 x half> %arg0, <16 x half>
 ; SDAG:       ; %bb.0:
 ; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-NEXT:    v_smfmac_f32_32x32x32_f16 v[12:27], v[0:3], v[4:11], v28
-; SDAG-NEXT:    s_nop 7
-; SDAG-NEXT:    s_nop 3
+; SDAG-NEXT:    s_nop 11
 ; SDAG-NEXT:    v_mov_b32_e32 v0, v12
 ; SDAG-NEXT:    v_mov_b32_e32 v1, v13
 ; SDAG-NEXT:    v_mov_b32_e32 v2, v14
@@ -316,8 +313,7 @@ define <16 x float> @test_smfmac_f32_32x32x32_f16__flags0(<8 x half> %arg0, <16
 ; SDAG:       ; %bb.0:
 ; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-NEXT:    v_smfmac_f32_32x32x32_f16 v[12:27], v[0:3], v[4:11], v28 cbsz:1 abid:3
-; SDAG-NEXT:    s_nop 7
-; SDAG-NEXT:    s_nop 3
+; SDAG-NEXT:    s_nop 11
 ; SDAG-NEXT:    v_mov_b32_e32 v0, v12
 ; SDAG-NEXT:    v_mov_b32_e32 v1, v13
 ; SDAG-NEXT:    v_mov_b32_e32 v2, v14
@@ -379,8 +375,7 @@ define <16 x float> @test_smfmac_f32_32x32x32_f16__flags1(<8 x half> %arg0, <16
 ; SDAG:       ; %bb.0:
 ; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-NEXT:    v_smfmac_f32_32x32x32_f16 v[12:27], v[0:3], v[4:11], v28 cbsz:3 abid:1
-; SDAG-NEXT:    s_nop 7
-; SDAG-NEXT:    s_nop 3
+; SDAG-NEXT:    s_nop 11
 ; SDAG-NEXT:    v_mov_b32_e32 v0, v12
 ; SDAG-NEXT:    v_mov_b32_e32 v1, v13
 ; SDAG-NEXT:    v_mov_b32_e32 v2, v14
@@ -471,8 +466,7 @@ define <16 x float> @test_smfmac_f32_32x32x32_f16__sgpr(<8 x half> inreg %arg0,
 ; SDAG-NEXT:    v_mov_b32_e32 v27, v9
 ; SDAG-NEXT:    s_nop 1
 ; SDAG-NEXT:    v_smfmac_f32_32x32x32_f16 v[12:27], v[36:39], v[28:35], v10
-; SDAG-NEXT:    s_nop 7
-; SDAG-NEXT:    s_nop 3
+; SDAG-NEXT:    s_nop 11
 ; SDAG-NEXT:    v_mov_b32_e32 v0, v12
 ; SDAG-NEXT:    v_mov_b32_e32 v1, v13
 ; SDAG-NEXT:    v_mov_b32_e32 v2, v14
@@ -685,8 +679,7 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x32_bf16__vgpr(ptr addrspace(1)
 ; GCN-NEXT:    s_nop 0
 ; GCN-NEXT:    v_smfmac_f32_32x32x32_bf16 v[0:15], v[24:27], v[16:23], v28 cbsz:1 abid:2
 ; GCN-NEXT:    v_mov_b32_e32 v16, 0
-; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    s_nop 2
+; GCN-NEXT:    s_nop 10
 ; GCN-NEXT:    global_store_dwordx4 v16, v[8:11], s[6:7] offset:32
 ; GCN-NEXT:    global_store_dwordx4 v16, v[12:15], s[6:7] offset:48
 ; GCN-NEXT:    global_store_dwordx4 v16, v[0:3], s[6:7]
@@ -706,8 +699,7 @@ define <16 x float> @test_smfmac_f32_32x32x32_bf16(<8 x bfloat> %arg0, <16 x bfl
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_smfmac_f32_32x32x32_bf16 v[12:27], v[0:3], v[4:11], v28
-; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    s_nop 3
+; GCN-NEXT:    s_nop 11
 ; GCN-NEXT:    v_mov_b32_e32 v0, v12
 ; GCN-NEXT:    v_mov_b32_e32 v1, v13
 ; GCN-NEXT:    v_mov_b32_e32 v2, v14
@@ -734,8 +726,7 @@ define <16 x float> @test_smfmac_f32_32x32x32_bf16__flags0(<8 x bfloat> %arg0, <
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_smfmac_f32_32x32x32_bf16 v[12:27], v[0:3], v[4:11], v28 cbsz:1 abid:3
-; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    s_nop 3
+; GCN-NEXT:    s_nop 11
 ; GCN-NEXT:    v_mov_b32_e32 v0, v12
 ; GCN-NEXT:    v_mov_b32_e32 v1, v13
 ; GCN-NEXT:    v_mov_b32_e32 v2, v14
@@ -762,8 +753,7 @@ define <16 x float> @test_smfmac_f32_32x32x32_bf16__flags1(<8 x bfloat> %arg0, <
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_smfmac_f32_32x32x32_bf16 v[12:27], v[0:3], v[4:11], v28 cbsz:3 abid:1
-; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    s_nop 3
+; GCN-NEXT:    s_nop 11
 ; GCN-NEXT:    v_mov_b32_e32 v0, v12
 ; GCN-NEXT:    v_mov_b32_e32 v1, v13
 ; GCN-NEXT:    v_mov_b32_e32 v2, v14
@@ -819,8 +809,7 @@ define <16 x float> @test_smfmac_f32_32x32x32_bf16__sgpr(<8 x bfloat> inreg %arg
 ; GCN-NEXT:    v_mov_b32_e32 v27, v9
 ; GCN-NEXT:    s_nop 1
 ; GCN-NEXT:    v_smfmac_f32_32x32x32_bf16 v[12:27], v[36:39], v[28:35], v10
-; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    s_nop 3
+; GCN-NEXT:    s_nop 11
 ; GCN-NEXT:    v_mov_b32_e32 v0, v12
 ; GCN-NEXT:    v_mov_b32_e32 v1, v13
 ; GCN-NEXT:    v_mov_b32_e32 v2, v14
@@ -1049,8 +1038,7 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x64_i8__vgpr(ptr addrspace(1) %a
 ; SDAG-NEXT:    s_nop 0
 ; SDAG-NEXT:    v_smfmac_i32_32x32x64_i8 v[0:15], v[24:27], v[16:23], v28 cbsz:1 abid:2
 ; SDAG-NEXT:    v_mov_b32_e32 v16, 0
-; SDAG-NEXT:    s_nop 7
-; SDAG-NEXT:    s_nop 2
+; SDAG-NEXT:    s_nop 10
 ; SDAG-NEXT:    global_store_dwordx4 v16, v[8:11], s[6:7] offset:32
 ; SDAG-NEXT:    global_store_dwordx4 v16, v[12:15], s[6:7] offset:48
 ; SDAG-NEXT:    global_store_dwordx4 v16, v[0:3], s[6:7]
@@ -1082,8 +1070,7 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x64_i8__vgpr(ptr addrspace(1) %a
 ; GISEL-NEXT:    s_nop 0
 ; GISEL-NEXT:    v_smfmac_i32_32x32x64_i8 v[0:15], v[24:27], v[16:23], v28 cbsz:1 abid:2
 ; GISEL-NEXT:    v_mov_b32_e32 v16, 0
-; GISEL-NEXT:    s_nop 7
-; GISEL-NEXT:    s_nop 2
+; GISEL-NEXT:    s_nop 10
 ; GISEL-NEXT:    global_store_dwordx4 v16, v[0:3], s[0:1]
 ; GISEL-NEXT:    global_store_dwordx4 v16, v[4:7], s[0:1] offset:16
 ; GISEL-NEXT:    global_store_dwordx4 v16, v[8:11], s[0:1] offset:32
@@ -1103,8 +1090,7 @@ define <16 x i32> @test_smfmac_i32_32x32x64_i8(<4 x i32> %arg0, <8 x i32> %arg1,
 ; SDAG:       ; %bb.0:
 ; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-NEXT:    v_smfmac_i32_32x32x64_i8 v[12:27], v[0:3], v[4:11], v28
-; SDAG-NEXT:    s_nop 7
-; SDAG-NEXT:    s_nop 3
+; SDAG-NEXT:    s_nop 11
 ; SDAG-NEXT:    v_mov_b32_e32 v0, v12
 ; SDAG-NEXT:    v_mov_b32_e32 v1, v13
 ; SDAG-NEXT:    v_mov_b32_e32 v2, v14
@@ -1166,8 +1152,7 @@ define <16 x i32> @test_smfmac_i32_32x32x64_i8__flags0(<4 x i32> %arg0, <8 x i32
 ; SDAG:       ; %bb.0:
 ; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-NEXT:    v_smfmac_i32_32x32x64_i8 v[12:27], v[0:3], v[4:11], v28 cbsz:1 abid:3
-; SDAG-NEXT:    s_nop 7
-; SDAG-NEXT:    s_nop 3
+; SDAG-NEXT:    s_nop 11
 ; SDAG-NEXT:    v_mov_b32_e32 v0, v12
 ; SDAG-NEXT:    v_mov_b32_e32 v1, v13
 ; SDAG-NEXT:    v_mov_b32_e32 v2, v14
@@ -1229,8 +1214,7 @@ define <16 x i32> @test_smfmac_i32_32x32x64_i8__flags1(<4 x i32> %arg0, <8 x i32
 ; SDAG:       ; %bb.0:
 ; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-NEXT:    v_smfmac_i32_32x32x64_i8 v[12:27], v[0:3], v[4:11], v28 cbsz:3 abid:1
-; SDAG-NEXT:    s_nop 7
-; SDAG-NEXT:    s_nop 3
+; SDAG-NEXT:    s_nop 11
 ; SDAG-NEXT:    v_mov_b32_e32 v0, v12
 ; SDAG-NEXT:    v_mov_b32_e32 v1, v13
 ; SDAG-NEXT:    v_mov_b32_e32 v2, v14
@@ -1321,8 +1305,7 @@ define <16 x i32> @test_smfmac_i32_32x32x64_i8__sgpr(<4 x i32> inreg %arg0, <8 x
 ; SDAG-NEXT:    v_mov_b32_e32 v27, v9
 ; SDAG-NEXT:    s_nop 1
 ; SDAG-NEXT:    v_smfmac_i32_32x32x64_i8 v[12:27], v[36:39], v[28:35], v10
-; SDAG-NEXT:    s_nop 7
-; SDAG-NEXT:    s_nop 3
+; SDAG-NEXT:    s_nop 11
 ; SDAG-NEXT:    v_mov_b32_e32 v0, v12
 ; SDAG-NEXT:    v_mov_b32_e32 v1, v13
 ; SDAG-NEXT:    v_mov_b32_e32 v2, v14
@@ -2098,8 +2081,7 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x64_bf8_bf8__vgpr(ptr addrspace(
 ; SDAG-NEXT:    s_nop 0
 ; SDAG-NEXT:    v_smfmac_f32_32x32x64_bf8_bf8 v[0:15], v[24:27], v[16:23], v28 cbsz:1 abid:2
 ; SDAG-NEXT:    v_mov_b32_e32 v16, 0
-; SDAG-NEXT:    s_nop 7
-; SDAG-NEXT:    s_nop 2
+; SDAG-NEXT:    s_nop 10
 ; SDAG-NEXT:    global_store_dwordx4 v16, v[8:11], s[6:7] offset:32
 ; SDAG-NEXT:    global_store_dwordx4 v16, v[12:15], s[6:7] offset:48
 ; SDAG-NEXT:    global_store_dwordx4 v16, v[0:3], s[6:7]
@@ -2131,8 +2113,7 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x64_bf8_bf8__vgpr(ptr addrspace(
 ; GISEL-NEXT:    s_nop 0
 ; GISEL-NEXT:    v_smfmac_f32_32x32x64_bf8_bf8 v[0:15], v[24:27], v[16:23], v28 cbsz:1 abid:2
 ; GISEL-NEXT:    v_mov_b32_e32 v16, 0
-; GISEL-NEXT:    s_nop 7
-; GISEL-NEXT:    s_nop 2
+; GISEL-NEXT:    s_nop 10
 ; GISEL-NEXT:    global_store_dwordx4 v16, v[0:3], s[0:1]
 ; GISEL-NEXT:    global_store_dwordx4 v16, v[4:7], s[0:1] offset:16
 ; GISEL-NEXT:    global_store_dwordx4 v16, v[8:11], s[0:1] offset:32
@@ -2152,8 +2133,7 @@ define <16 x float> @test_smfmac_f32_32x32x64_bf8_bf8(<4 x i32> %arg0, <8 x i32>
 ; SDAG:       ; %bb.0:
 ; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-NEXT:    v_smfmac_f32_32x32x64_bf8_bf8 v[12:27], v[0:3], v[4:11], v28
-; SDAG-NEXT:    s_nop 7
-; SDAG-NEXT:    s_nop 3
+; SDAG-NEXT:    s_nop 11
 ; SDAG-NEXT:    v_mov_b32_e32 v0, v12
 ; SDAG-NEXT:    v_mov_b32_e32 v1, v13
 ; SDAG-NEXT:    v_mov_b32_e32 v2, v14
@@ -2215,8 +2195,7 @@ define <16 x float> @test_smfmac_f32_32x32x64_bf8_bf8__flags0(<4 x i32> %arg0, <
 ; SDAG:       ; %bb.0:
 ; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-NEXT:    v_smfmac_f32_32x32x64_bf8_bf8 v[12:27], v[0:3], v[4:11], v28 cbsz:1 abid:3
-; SDAG-NEXT:    s_nop 7
-; SDAG-NEXT:    s_nop 3
+; SDAG-NEXT:    s_nop 11
 ; SDAG-NEXT:    v_mov_b32_e32 v0, v12
 ; SDAG-NEXT:    v_mov_b32_e32 v1, v13
 ; SDAG-NEXT:    v_mov_b32_e32 v2, v14
@@ -2278,8 +2257,7 @@ define <16 x float> @test_smfmac_f32_32x32x64_bf8_bf8__flags1(<4 x i32> %arg0, <
 ; SDAG:       ; %bb.0:
 ; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-NEXT:    v_smfmac_f32_32x32x64_bf8_bf8 v[12:27], v[0:3], v[4:11], v28 cbsz:3 abid:1
-; SDAG-NEXT:    s_nop 7
-; SDAG-NEXT:    s_nop 3
+; SDAG-NEXT:    s_nop 11
 ; SDAG-NEXT:    v_mov_b32_e32 v0, v12
 ; SDAG-NEXT:    v_mov_b32_e32 v1, v13
 ; SDAG-NEXT:    v_mov_b32_e32 v2, v14
@@ -2370,8 +2348,7 @@ define <16 x float> @test_smfmac_f32_32x32x64_bf8_bf8__sgpr(<4 x i32> inreg %arg
 ; SDAG-NEXT:    v_mov_b32_e32 v27, v9
 ; SDAG-NEXT:    s_nop 1
 ; SDAG-NEXT:    v_smfmac_f32_32x32x64_bf8_bf8 v[12:27], v[36:39], v[28:35], v10
-; SDAG-NEXT:    s_nop 7
-; SDAG-NEXT:    s_nop 3
+; SDAG-NEXT:    s_nop 11
 ; SDAG-NEXT:    v_mov_b32_e32 v0, v12
 ; SDAG-NEXT:    v_mov_b32_e32 v1, v13
 ; SDAG-NEXT:    v_mov_b32_e32 v2, v14
@@ -2471,8 +2448,7 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x64_bf8_fp8__vgpr(ptr addrspace(
 ; SDAG-NEXT:    s_nop 0
 ; SDAG-NEXT:    v_smfmac_f32_32x32x64_bf8_fp8 v[0:15], v[24:27], v[16:23], v28 cbsz:1 abid:2
 ; SDAG-NEXT:    v_mov_b32_e32 v16, 0
-; SDAG-NEXT:    s_nop 7
-; SDAG-NEXT:    s_nop 2
+; SDAG-NEXT:    s_nop 10
 ; SDAG-NEXT:    global_store_dwordx4 v16, v[8:11], s[6:7] offset:32
 ; SDAG-NEXT:    global_store_dwordx4 v16, v[12:15], s[6:7] offset:48
 ; SDAG-NEXT:    global_store_dwordx4 v16, v[0:3], s[6:7]
@@ -2504,8 +2480,7 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x64_bf8_fp8__vgpr(ptr addrspace(
 ; GISEL-NEXT:    s_nop 0
 ; GISEL-NEXT:    v_smfmac_f32_32x32x64_bf8_fp8 v[0:15], v[24:27], v[16:23], v28 cbsz:1 abid:2
 ; GISEL-NEXT:    v_mov_b32_e32 v16, 0
-; GISEL-NEXT:    s_nop 7
-; GISEL-NEXT:    s_nop 2
+; GISEL-NEXT:    s_nop 10
 ; GISEL-NEXT:    global_store_dwordx4 v16, v[0:3], s[0:1]
 ; GISEL-NEXT:    global_store_dwordx4 v16, v[4:7], s[0:1] offset:16
 ; GISEL-NEXT:    global_store_dwordx4 v16, v[8:11], s[0:1] offset:32
@@ -2525,8 +2500,7 @@ define <16 x float> @test_smfmac_f32_32x32x64_bf8_fp8(<4 x i32> %arg0, <8 x i32>
 ; SDAG:       ; %bb.0:
 ; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-NEXT:    v_smfmac_f32_32x32x64_bf8_fp8 v[12:27], v[0:3], v[4:11], v28
-; SDAG-NEXT:    s_nop 7
-; SDAG-NEXT:    s_nop 3
+; SDAG-NEXT:    s_nop 11
 ; SDAG-NEXT:    v_mov_b32_e32 v0, v12
 ; SDAG-NEXT:    v_mov_b32_e32 v1, v13
 ; SDAG-NEXT:    v_mov_b32_e32 v2, v14
@@ -2588,8 +2562,7 @@ define <16 x float> @test_smfmac_f32_32x32x64_bf8_fp8__flags0(<4 x i32> %arg0, <
 ; SDAG:       ; %bb.0:
 ; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-NEXT:    v_smfmac_f32_32x32x64_bf8_fp8 v[12:27], v[0:3], v[4:11], v28 cbsz:1 abid:3
-; SDAG-NEXT:    s_nop 7
-; SDAG-NEXT:    s_nop 3
+; SDAG-NEXT:    s_nop 11
 ; SDAG-NEXT:    v_mov_b32_e32 v0, v12
 ; SDAG-NEXT:    v_mov_b32_e32 v1, v13
 ; SDAG-NEXT:    v_mov_b32_e32 v2, v14
@@ -2651,8 +2624,7 @@ define <16 x float> @test_smfmac_f32_32x32x64_bf8_fp8__flags1(<4 x i32> %arg0, <
 ; SDAG:       ; %bb.0:
 ; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-NEXT:    v_smfmac_f32_32x32x64_bf8_fp8 v[12:27], v[0:3], v[4:11], v28 cbsz:3 abid:1
-; SDAG-NEXT:    s_nop 7
-; SDAG-NEXT:    s_nop 3
+; SDAG-NEXT:    s_nop 11
 ; SDAG-NEXT:    v_mov_b32_e32 v0, v12
 ; SDAG-NEXT:    v_mov_b32_e32 v1, v13
 ; SDAG-NEXT:    v_mov_b32_e32 v2, v14
@@ -2743,8 +2715,7 @@ define <16 x float> @test_smfmac_f32_32x32x64_bf8_fp8__sgpr(<4 x i32> inreg %arg
 ; SDAG-NEXT:    v_mov_b32_e32 v27, v9
 ; SDAG-NEXT:    s_nop 1
 ; SDAG-NEXT:    v_smfmac_f32_32x32x64_bf8_fp8 v[12:27], v[36:39], v[28:35], v10
-; SDAG-NEXT:    s_nop 7
-; SDAG-NEXT:    s_nop 3
+; SDAG-NEXT:    s_nop 11
 ; SDAG-NEXT:    v_mov_b32_e32 v0, v12
 ; SDAG-NEXT:    v_mov_b32_e32 v1, v13
 ; SDAG-NEXT:    v_mov_b32_e32 v2, v14
@@ -2844,8 +2815,7 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x64_fp8_bf8__vgpr(ptr addrspace(
 ; SDAG-NEXT:    s_nop 0
 ; SDAG-NEXT:    v_smfmac_f32_32x32x64_fp8_bf8 v[0:15], v[24:27], v[16:23], v28 cbsz:1 abid:2
 ; SDAG-NEXT:    v_mov_b32_e32 v16, 0
-; SDAG-NEXT:    s_nop 7
-; SDAG-NEXT:    s_nop 2
+; SDAG-NEXT:    s_nop 10
 ; SDAG-NEXT:    global_store_dwordx4 v16, v[8:11], s[6:7] offset:32
 ; SDAG-NEXT:    global_store_dwordx4 v16, v[12:15], s[6:7] offset:48
 ; SDAG-NEXT:    global_store_dwordx4 v16, v[0:3], s[6:7]
@@ -2877,8 +2847,7 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x64_fp8_bf8__vgpr(ptr addrspace(
 ; GISEL-NEXT:    s_nop 0
 ; GISEL-NEXT:    v_smfmac_f32_32x32x64_fp8_bf8 v[0:15], v[24:27], v[16:23], v28 cbsz:1 abid:2
 ; GISEL-NEXT:    v_mov_b32_e32 v16, 0
-; GISEL-NEXT:    s_nop 7
-; GISEL-NEXT:    s_nop 2
+; GISEL-NEXT:    s_nop 10
 ; GISEL-NEXT:    global_store_dwordx4 v16, v[0:3], s[0:1]
 ; GISEL-NEXT:    global_store_dwordx4 v16, v[4:7], s[0:1] offset:16
 ; GISEL-NEXT:    global_store_dwordx4 v16, v[8:11], s[0:1] offset:32
@@ -2898,8 +2867,7 @@ define <16 x float> @test_smfmac_f32_32x32x64_fp8_bf8(<4 x i32> %arg0, <8 x i32>
 ; SDAG:       ; %bb.0:
 ; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-NEXT:    v_smfmac_f32_32x32x64_fp8_bf8 v[12:27], v[0:3], v[4:11], v28
-; SDAG-NEXT:    s_nop 7
-; SDAG-NEXT:    s_nop 3
+; SDAG-NEXT:    s_nop 11
 ; SDAG-NEXT:    v_mov_b32_e32 v0, v12
 ; SDAG-NEXT:    v_mov_b32_e32 v1, v13
 ; SDAG-NEXT:    v_mov_b32_e32 v2, v14
@@ -2961,8 +2929,7 @@ define <16 x float> @test_smfmac_f32_32x32x64_fp8_bf8__flags0(<4 x i32> %arg0, <
 ; SDAG:       ; %bb.0:
 ; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-NEXT:    v_smfmac_f32_32x32x64_fp8_bf8 v[12:27], v[0:3], v[4:11], v28 cbsz:1 abid:3
-; SDAG-NEXT:    s_nop 7
-; SDAG-NEXT:    s_nop 3
+; SDAG-NEXT:    s_nop 11
 ; SDAG-NEXT:    v_mov_b32_e32 v0, v12
 ; SDAG-NEXT:    v_mov_b32_e32 v1, v13
 ; SDAG-NEXT:    v_mov_b32_e32 v2, v14
@@ -3024,8 +2991,7 @@ define <16 x float> @test_smfmac_f32_32x32x64_fp8_bf8__flags1(<4 x i32> %arg0, <
 ; SDAG:       ; %bb.0:
 ; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-NEXT:    v_smfmac_f32_32x32x64_fp8_bf8 v[12:27], v[0:3], v[4:11], v28 cbsz:3 abid:1
-; SDAG-NEXT:    s_nop 7
-; SDAG-NEXT:    s_nop 3
+; SDAG-NEXT:    s_nop 11
 ; SDAG-NEXT:    v_mov_b32_e32 v0, v12
 ; SDAG-NEXT:    v_mov_b32_e32 v1, v13
 ; SDAG-NEXT:    v_mov_b32_e32 v2, v14
@@ -3116,8 +3082,7 @@ define <16 x float> @test_smfmac_f32_32x32x64_fp8_bf8__sgpr(<4 x i32> inreg %arg
 ; SDAG-NEXT:    v_mov_b32_e32 v27, v9
 ; SDAG-NEXT:    s_nop 1
 ; SDAG-NEXT:    v_smfmac_f32_32x32x64_fp8_bf8 v[12:27], v[36:39], v[28:35], v10
-; SDAG-NEXT:    s_nop 7
-; SDAG-NEXT:    s_nop 3
+; SDAG-NEXT:    s_nop 11
 ; SDAG-NEXT:    v_mov_b32_e32 v0, v12
 ; SDAG-NEXT:    v_mov_b32_e32 v1, v13
 ; SDAG-NEXT:    v_mov_b32_e32 v2, v14
@@ -3217,8 +3182,7 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x64_fp8_fp8__vgpr(ptr addrspace(
 ; SDAG-NEXT:    s_nop 0
 ; SDAG-NEXT:    v_smfmac_f32_32x32x64_fp8_fp8 v[0:15], v[24:27], v[16:23], v28 cbsz:1 abid:2
 ; SDAG-NEXT:    v_mov_b32_e32 v16, 0
-; SDAG-NEXT:    s_nop 7
-; SDAG-NEXT:    s_nop 2
+; SDAG-NEXT:    s_nop 10
 ; SDAG-NEXT:    global_store_dwordx4 v16, v[8:11], s[6:7] offset:32
 ; SDAG-NEXT:    global_store_dwordx4 v16, v[12:15], s[6:7] offset:48
 ; SDAG-NEXT:    global_store_dwordx4 v16, v[0:3], s[6:7]
@@ -3250,8 +3214,7 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x64_fp8_fp8__vgpr(ptr addrspace(
 ; GISEL-NEXT:    s_nop 0
 ; GISEL-NEXT:    v_smfmac_f32_32x32x64_fp8_fp8 v[0:15], v[24:27], v[16:23], v28 cbsz:1 abid:2
 ; GISEL-NEXT:    v_mov_b32_e32 v16, 0
-; GISEL-NEXT:    s_nop 7
-; GISEL-NEXT:    s_nop 2
+; GISEL-NEXT:    s_nop 10
 ; GISEL-NEXT:    global_store_dwordx4 v16, v[0:3], s[0:1]
 ; GISEL-NEXT:    global_store_dwordx4 v16, v[4:7], s[0:1] offset:16
 ; GISEL-NEXT:    global_store_dwordx4 v16, v[8:11], s[0:1] offset:32
@@ -3271,8 +3234,7 @@ define <16 x float> @test_smfmac_f32_32x32x64_fp8_fp8(<4 x i32> %arg0, <8 x i32>
 ; SDAG:       ; %bb.0:
 ; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-NEXT:    v_smfmac_f32_32x32x64_fp8_fp8 v[12:27], v[0:3], v[4:11], v28
-; SDAG-NEXT:    s_nop 7
-; SDAG-NEXT:    s_nop 3
+; SDAG-NEXT:    s_nop 11
 ; SDAG-NEXT:    v_mov_b32_e32 v0, v12
 ; SDAG-NEXT:    v_mov_b32_e32 v1, v13
 ; SDAG-NEXT:    v_mov_b32_e32 v2, v14
@@ -3334,8 +3296,7 @@ define <16 x float> @test_smfmac_f32_32x32x64_fp8_fp8__flags0(<4 x i32> %arg0, <
 ; SDAG:       ; %bb.0:
 ; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-NEXT:    v_smfmac_f32_32x32x64_fp8_fp8 v[12:27], v[0:3], v[4:11], v28 cbsz:1 abid:3
-; SDAG-NEXT:    s_nop 7
-; SDAG-NEXT:    s_nop 3
+; SDAG-NEXT:    s_nop 11
 ; SDAG-NEXT:    v_mov_b32_e32 v0, v12
 ; SDAG-NEXT:    v_mov_b32_e32 v1, v13
 ; SDAG-NEXT:    v_mov_b32_e32 v2, v14
@@ -3397,8 +3358,7 @@ define <16 x float> @test_smfmac_f32_32x32x64_fp8_fp8__flags1(<4 x i32> %arg0, <
 ; SDAG:       ; %bb.0:
 ; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-NEXT:    v_smfmac_f32_32x32x64_fp8_fp8 v[12:27], v[0:3], v[4:11], v28 cbsz:3 abid:1
-; SDAG-NEXT:    s_nop 7
-; SDAG-NEXT:    s_nop 3
+; SDAG-NEXT:    s_nop 11
 ; SDAG-NEXT:    v_mov_b32_e32 v0, v12
 ; SDAG-NEXT:    v_mov_b32_e32 v1, v13
 ; SDAG-NEXT:    v_mov_b32_e32 v2, v14
@@ -3489,8 +3449,7 @@ define <16 x float> @test_smfmac_f32_32x32x64_fp8_fp8__sgpr(<4 x i32> inreg %arg
 ; SDAG-NEXT:    v_mov_b32_e32 v27, v9
 ; SDAG-NEXT:    s_nop 1
 ; SDAG-NEXT:    v_smfmac_f32_32x32x64_fp8_fp8 v[12:27], v[36:39], v[28:35], v10
-; SDAG-NEXT:    s_nop 7
-; SDAG-NEXT:    s_nop 3
+; SDAG-NEXT:    s_nop 11
 ; SDAG-NEXT:    v_mov_b32_e32 v0, v12
 ; SDAG-NEXT:    v_mov_b32_e32 v1, v13
 ; SDAG-NEXT:    v_mov_b32_e32 v2, v14
diff --git a/llvm/test/CodeGen/AMDGPU/mai-hazards-gfx90a.mir b/llvm/test/CodeGen/AMDGPU/mai-hazards-gfx90a.mir
index 3feccff715bc1..ddd8a4784ea86 100644
--- a/llvm/test/CodeGen/AMDGPU/mai-hazards-gfx90a.mir
+++ b/llvm/test/CodeGen/AMDGPU/mai-hazards-gfx90a.mir
@@ -125,8 +125,7 @@ body:             |
 ...
 # GCN-LABEL: name: sgemm32x32_mfma_write_agpr_mfma_read_overlap
 # GCN:      V_MFMA
-# GCN-NEXT: S_NOP 7
-# GCN-NEXT: S_NOP 7
+# GCN-NEXT: S_NOP 15
 # GCN-NEXT: V_MFMA
 name:            sgemm32x32_mfma_write_agpr_mfma_read_overlap
 body:             |
@@ -136,8 +135,7 @@ body:             |
 ...
 # GCN-LABEL: name: sgemm32x32_mfma_write_vgpr_mfma_read_overlap
 # GCN:      V_MFMA
-# GCN-NEXT: S_NOP 7
-# GCN-NEXT: S_NOP 7
+# GCN-NEXT: S_NOP 15
 # GCN-NEXT: V_MFMA
 name:            sgemm32x32_mfma_write_vgpr_mfma_read_overlap
 body:             |
@@ -147,8 +145,7 @@ body:             |
 ...
 # GCN-LABEL: name: dgemm16x16_mfma_write_vgpr_mfma_read_overlap
 # GCN:      V_MFMA
-# GCN-NEXT: S_NOP 7
-# GCN-NEXT: S_NOP 0
+# GCN-NEXT: S_NOP 8
 # GCN-NEXT: V_MFMA
 name:            dgemm16x16_mfma_write_vgpr_mfma_read_overlap
 body:             |
@@ -196,8 +193,7 @@ body:             |
 ...
 # GCN-LABEL: name: sgemm16x16_mfma_write_vgpr_dgemm_mfma_read_overlap
 # GCN:      V_MFMA
-# GCN-NEXT: S_NOP 7
-# GCN-NEXT: S_NOP 0
+# GCN-NEXT: S_NOP 8
 # GCN-NEXT: V_MFMA
 name:            sgemm16x16_mfma_write_vgpr_dgemm_mfma_read_overlap
 body:             |
@@ -207,8 +203,7 @@ body:             |
 ...
 # GCN-LABEL: name: sgemm32x32_mfma_write_vgpr_dgemm_mfma_read_overlap
 # GCN:      V_MFMA
-# GCN-NEXT: S_NOP 7
-# GCN-NEXT: S_NOP 7
+# GCN-NEXT: S_NOP 15
 # GCN-NEXT: S_NOP 0
 # GCN-NEXT: V_MFMA
 name:            sgemm32x32_mfma_write_vgpr_dgemm_mfma_read_overlap
@@ -249,8 +244,7 @@ body:             |
 ...
 # GCN-LABEL: name: sgemm16x16_mfma_write_agpr_mfma_srca_read_overlap
 # GCN:      V_MFMA
-# GCN-NEXT: S_NOP 7
-# GCN-NEXT: S_NOP 2
+# GCN-NEXT: S_NOP 10
 # GCN-NEXT: V_MFMA
 name:            sgemm16x16_mfma_write_agpr_mfma_srca_read_overlap
 body:             |
@@ -260,8 +254,7 @@ body:             |
 ...
 # GCN-LABEL: name: sgemm32x32_mfma_write_agpr_mfma_srca_read_overlap
 # GCN:      V_MFMA
-# GCN-NEXT: S_NOP 7
-# GCN-NEXT: S_NOP 7
+# GCN-NEXT: S_NOP 15
 # GCN-NEXT: S_NOP 2
 # GCN-NEXT: V_MFMA
 name:            sgemm32x32_mfma_write_agpr_mfma_srca_read_overlap
@@ -312,8 +305,7 @@ body:             |
 ...
 # GCN-LABEL: name: dgemm16x16_mfma_write_vgpr_mfma_srca_read_overlap
 # GCN:      V_MFMA
-# GCN-NEXT: S_NOP 7
-# GCN-NEXT: S_NOP 2
+# GCN-NEXT: S_NOP 10
 # GCN-NEXT: V_MFMA
 name:            dgemm16x16_mfma_write_vgpr_mfma_srca_read_overlap
 body:             |
@@ -333,8 +325,7 @@ body:             |
 ...
 # GCN-LABEL: name: dgemm16x16_mfma_write_vgpr_sgemm_mfma_srca_read_overlap
 # GCN:      V_MFMA
-# GCN-NEXT: S_NOP 7
-# GCN-NEXT: S_NOP 2
+# GCN-NEXT: S_NOP 10
 # GCN-NEXT: V_MFMA
 name:            dgemm16x16_mfma_write_vgpr_sgemm_mfma_srca_read_overlap
 body:             |
@@ -384,8 +375,7 @@ body:             |
 ...
 # GCN-LABEL: name: dgemm16x16_mfma_write_vgpr_mfma_srcb_read_overlap
 # GCN:      V_MFMA
-# GCN-NEXT: S_NOP 7
-# GCN-NEXT: S_NOP 2
+# GCN-NEXT: S_NOP 10
 # GCN-NEXT: V_MFMA
 name:            dgemm16x16_mfma_write_vgpr_mfma_srcb_read_overlap
 body:             |
@@ -435,8 +425,7 @@ body:             |
 ...
 # GCN-LABEL: name: smfma16x16_write_vgpr_flat_read
 # GCN:      V_MFMA
-# GCN-NEXT: S_NOP 7
-# GCN-NEXT: S_NOP 2
+# GCN-NEXT: S_NOP 10
 # GCN-NEXT: FLAT_STORE_DWORD
 name:            smfma16x16_write_vgpr_flat_read
 body:             |
@@ -446,8 +435,7 @@ body:             |
 ...
 # GCN-LABEL: name: smfma32x32_write_vgpr_flat_read
 # GCN:      V_MFMA
-# GCN-NEXT: S_NOP 7
-# GCN-NEXT: S_NOP 7
+# GCN-NEXT: S_NOP 15
 # GCN-NEXT: S_NOP 2
 # GCN-NEXT: FLAT_STORE_DWORD
 name:            smfma32x32_write_vgpr_flat_read
@@ -458,8 +446,7 @@ body:             |
 ...
 # GCN-LABEL: name: dmfma4x4_write_vgpr_flat_read_overlap
 # GCN:      V_MFMA
-# GCN-NEXT: S_NOP 7
-# GCN-NEXT: S_NOP 0
+# GCN-NEXT: S_NOP 8
 # GCN-NEXT: FLAT_STORE_DWORD
 name:            dmfma4x4_write_vgpr_flat_read_overlap
 body:             |
@@ -469,8 +456,7 @@ body:             |
 ...
 # GCN-LABEL: name: dmfma4x4_write_vgpr_flat_read_full
 # GCN:      V_MFMA
-# GCN-NEXT: S_NOP 7
-# GCN-NEXT: S_NOP 0
+# GCN-NEXT: S_NOP 8
 # GCN-NEXT: FLAT_STORE_DWORD
 name:            dmfma4x4_write_vgpr_flat_read_full
 body:             |
@@ -480,8 +466,7 @@ body:             |
 ...
 # GCN-LABEL: name: dmfma16x16_write_vgpr_flat_read
 # GCN:      V_MFMA
-# GCN-NEXT: S_NOP 7
-# GCN-NEXT: S_NOP 7
+# GCN-NEXT: S_NOP 15
 # GCN-NEXT: S_NOP 1
 # GCN-NEXT: FLAT_STORE_DWORD
 name:            dmfma16x16_write_vgpr_flat_read
@@ -502,8 +487,7 @@ body:             |
 ...
 # GCN-LABEL: name: smfma16x16_write_vgpr_valu_read
 # GCN:      V_MFMA
-# GCN-NEXT: S_NOP 7
-# GCN-NEXT: S_NOP 2
+# GCN-NEXT: S_NOP 10
 # GCN-NEXT: V_MOV_B32
 name:            smfma16x16_write_vgpr_valu_read
 body:             |
@@ -513,8 +497,7 @@ body:             |
 ...
 # GCN-LABEL: name: smfma32x32_write_vgpr_valu_read
 # GCN:      V_MFMA
-# GCN-NEXT: S_NOP 7
-# GCN-NEXT: S_NOP 7
+# GCN-NEXT: S_NOP 15
 # GCN-NEXT: S_NOP 2
 # GCN-NEXT: V_MOV_B32
 name:            smfma32x32_write_vgpr_valu_read
@@ -535,8 +518,7 @@ body:             |
 ...
 # GCN-LABEL: name: dmfma16x16_write_vgpr_valu_read
 # GCN:      V_MFMA
-# GCN-NEXT: S_NOP 7
-# GCN-NEXT: S_NOP 2
+# GCN-NEXT: S_NOP 10
 # GCN-NEXT: V_MOV_B32
 name:            dmfma16x16_write_vgpr_valu_read
 body:             |
@@ -556,8 +538,7 @@ body:             |
 ...
 # GCN-LABEL: name: smfma16x16_write_vgpr_accv_read
 # GCN:      V_MFMA
-# GCN-NEXT: S_NOP 7
-# GCN-NEXT: S_NOP 2
+# GCN-NEXT: S_NOP 10
 # GCN-NEXT: V_ACCVGPR_WRITE_B32_e64
 name:            smfma16x16_write_vgpr_accv_read
 body:             |
@@ -567,8 +548,7 @@ body:             |
 ...
 # GCN-LABEL: name: smfma32x32_write_vgpr_accv_read
 # GCN:      V_MFMA
-# GCN-NEXT: S_NOP 7
-# GCN-NEXT: S_NOP 7
+# GCN-NEXT: S_NOP 15
 # GCN-NEXT: S_NOP 2
 # GCN-NEXT: V_ACCVGPR_WRITE_B32_e64
 name:            smfma32x32_write_vgpr_accv_read
@@ -599,8 +579,7 @@ body:             |
 ...
 # GCN-LABEL: name: dmfma16x16_write_vgpr_dot_read
 # GCN:      V_MFMA
-# GCN-NEXT: S_NOP 7
-# GCN-NEXT: S_NOP 2
+# GCN-NEXT: S_NOP 10
 # GCN-NEXT: V_DOT
 name:            dmfma16x16_write_vgpr_dot_read
 body:             |
@@ -620,8 +599,7 @@ body:             |
 ...
 # GCN-LABEL: name: smfma16x16_write_vgpr_valu_write
 # GCN:      V_MFMA
-# GCN-NEXT: S_NOP 7
-# GCN-NEXT: S_NOP 2
+# GCN-NEXT: S_NOP 10
 # GCN-NEXT: V_MOV_B32
 name:            smfma16x16_write_vgpr_valu_write
 body:             |
@@ -631,8 +609,7 @@ body:             |
 ...
 # GCN-LABEL: name: smfma32x32_write_vgpr_valu_write
 # GCN:      V_MFMA
-# GCN-NEXT: S_NOP 7
-# GCN-NEXT: S_NOP 7
+# GCN-NEXT: S_NOP 15
 # GCN-NEXT: S_NOP 2
 # GCN-NEXT: V_MOV_B32
 name:            smfma32x32_write_vgpr_valu_write
@@ -653,8 +630,7 @@ body:             |
 ...
 # GCN-LABEL: name: smfma16x16_write_vgpr_valu_f16_write
 # GCN:      V_MFMA
-# GCN-NEXT: S_NOP 7
-# GCN-NEXT: S_NOP 2
+# GCN-NEXT: S_NOP 10
 # GCN-NEXT: V_FMA_F16_e64
 name:            smfma16x16_write_vgpr_valu_f16_write
 body:             |
@@ -664,8 +640,7 @@ body:             |
 ...
 # GCN-LABEL: name: smfma32x32_write_vgpr_valu_f16_write
 # GCN:      V_MFMA
-# GCN-NEXT: S_NOP 7
-# GCN-NEXT: S_NOP 7
+# GCN-NEXT: S_NOP 15
 # GCN-NEXT: S_NOP 2
 # GCN-NEXT: V_FMA_F16_e64
 name:            smfma32x32_write_vgpr_valu_f16_write
@@ -686,8 +661,7 @@ body:             |
 ...
 # GCN-LABEL: name: smfma16x16_write_vgpr_valu_sdwa_write
 # GCN:      V_MFMA
-# GCN-NEXT: S_NOP 7
-# GCN-NEXT: S_NOP 2
+# GCN-NEXT: S_NOP 10
 # GCN-NEXT: V_MOV_B32_sdwa
 name:            smfma16x16_write_vgpr_valu_sdwa_write
 body:             |
@@ -697,8 +671,7 @@ body:             |
 ...
 # GCN-LABEL: name: smfma32x32_write_vgpr_valu_sdwa_write
 # GCN:      V_MFMA
-# GCN-NEXT: S_NOP 7
-# GCN-NEXT: S_NOP 7
+# GCN-NEXT: S_NOP 15
 # GCN-NEXT: S_NOP 2
 # GCN-NEXT: V_MOV_B32_sdwa
 name:            smfma32x32_write_vgpr_valu_sdwa_write
@@ -719,8 +692,7 @@ body:             |
 ...
 # GCN-LABEL: name: dmfma16x16_write_vgpr_valu_write
 # GCN:      V_MFMA
-# GCN-NEXT: S_NOP 7
-# GCN-NEXT: S_NOP 2
+# GCN-NEXT: S_NOP 10
 # GCN-NEXT: V_MOV_B32
 name:            dmfma16x16_write_vgpr_valu_write
 body:             |
@@ -770,8 +742,7 @@ body:             |
 ...
 # GCN-LABEL: name: smfma32x32_read_srcc_vgpr_valu_write
 # GCN:      V_MFMA
-# GCN-NEXT: S_NOP 7
-# GCN-NEXT: S_NOP 6
+# GCN-NEXT: S_NOP 14
 # GCN-NEXT: V_MOV_B32
 name:            smfma32x32_read_srcc_vgpr_valu_write
 body:             |
@@ -1040,8 +1011,7 @@ body:             |
 ...
 # GCN-LABEL: name: dgemm16x16_mfma_write_agpr_mfma_read_overlap
 # GCN:      V_MFMA
-# GCN-NEXT: S_NOP 7
-# GCN-NEXT: S_NOP 0
+# GCN-NEXT: S_NOP 8
 # GCN-NEXT: V_MFMA
 name:            dgemm16x16_mfma_write_agpr_mfma_read_overlap
 body:             |
@@ -1080,8 +1050,7 @@ body:             |
 ...
 # GCN-LABEL: name: sgemm16x16_mfma_write_sgpr_dgemm_mfma_read_overlap
 # GCN:      V_MFMA
-# GCN-NEXT: S_NOP 7
-# GCN-NEXT: S_NOP 0
+# GCN-NEXT: S_NOP 8
 # GCN-NEXT: V_MFMA
 name:            sgemm16x16_mfma_write_sgpr_dgemm_mfma_read_overlap
 body:             |
@@ -1091,8 +1060,7 @@ body:             |
 ...
 # GCN-LABEL: name: sgemm32x32_mfma_write_agpr_dgemm_mfma_read_overlap
 # GCN:      V_MFMA
-# GCN-NEXT: S_NOP 7
-# GCN-NEXT: S_NOP 7
+# GCN-NEXT: S_NOP 15
 # GCN-NEXT: S_NOP 0
 # GCN-NEXT: V_MFMA
 name:            sgemm32x32_mfma_write_agpr_dgemm_mfma_read_overlap
@@ -1133,8 +1101,7 @@ body:             |
 ...
 # GCN-LABEL: name: dgemm16x16_mfma_write_agpr_mfma_srca_read_overlap
 # GCN:      V_MFMA
-# GCN-NEXT: S_NOP 7
-# GCN-NEXT: S_NOP 2
+# GCN-NEXT: S_NOP 10
 # GCN-NEXT: V_MFMA
 name:            dgemm16x16_mfma_write_agpr_mfma_srca_read_overlap
 body:             |
@@ -1154,8 +1121,7 @@ body:             |
 ...
 # GCN-LABEL: name: dgemm16x16_mfma_write_agpr_sgemm_mfma_srca_read_overlap
 # GCN:      V_MFMA
-# GCN-NEXT: S_NOP 7
-# GCN-NEXT: S_NOP 2
+# GCN-NEXT: S_NOP 10
 # GCN-NEXT: V_MFMA
 name:            dgemm16x16_mfma_write_agpr_sgemm_mfma_srca_read_overlap
 body:             |
@@ -1185,8 +1151,7 @@ body:             |
 ...
 # GCN-LABEL: name: dgemm16x16_mfma_write_agpr_mfma_srcb_read_overlap
 # GCN:      V_MFMA
-# GCN-NEXT: S_NOP 7
-# GCN-NEXT: S_NOP 2
+# GCN-NEXT: S_NOP 10
 # GCN-NEXT: V_MFMA
 name:            dgemm16x16_mfma_write_agpr_mfma_srcb_read_overlap
 body:             |
@@ -1196,8 +1161,7 @@ body:             |
 ...
 # GCN-LABEL: name: dmfma4x4_write_agpr_flat_read_overlap
 # GCN:      V_MFMA
-# GCN-NEXT: S_NOP 7
-# GCN-NEXT: S_NOP 0
+# GCN-NEXT: S_NOP 8
 # GCN-NEXT: FLAT_STORE_DWORD
 name:            dmfma4x4_write_agpr_flat_read_overlap
 body:             |
@@ -1207,8 +1171,7 @@ body:             |
 ...
 # GCN-LABEL: name: dmfma4x4_write_agpr_flat_read_full
 # GCN:      V_MFMA
-# GCN-NEXT: S_NOP 7
-# GCN-NEXT: S_NOP 0
+# GCN-NEXT: S_NOP 8
 # GCN-NEXT: FLAT_STORE_DWORD
 name:            dmfma4x4_write_agpr_flat_read_full
 body:             |
@@ -1218,8 +1181,7 @@ body:             |
 ...
 # GCN-LABEL: name: dmfma16x16_write_agpr_flat_read
 # GCN:      V_MFMA
-# GCN-NEXT: S_NOP 7
-# GCN-NEXT: S_NOP 7
+# GCN-NEXT: S_NOP 15
 # GCN-NEXT: S_NOP 1
 # GCN-NEXT: FLAT_STORE_DWORD
 name:            dmfma16x16_write_agpr_flat_read
@@ -1240,8 +1202,7 @@ body:             |
 ...
 # GCN-LABEL: name: dmfma16x16_write_agpr_valu_read
 # GCN:      V_MFMA
-# GCN-NEXT: S_NOP 7
-# GCN-NEXT: S_NOP 2
+# GCN-NEXT: S_NOP 10
 # GCN-NEXT: V_ACCVGPR_READ_B32_e64
 name:            dmfma16x16_write_agpr_valu_read
 body:             |
@@ -1261,8 +1222,7 @@ body:             |
 ...
 # GCN-LABEL: name: dmfma16x16_write_agpr_valu_write
 # GCN:      V_MFMA
-# GCN-NEXT: S_NOP 7
-# GCN-NEXT: S_NOP 2
+# GCN-NEXT: S_NOP 10
 # GCN-NEXT: V_ACCVGPR_WRITE_B32_e64
 name:            dmfma16x16_write_agpr_valu_write
 body:             |
diff --git a/llvm/test/CodeGen/AMDGPU/mai-hazards-gfx942.mir b/llvm/test/CodeGen/AMDGPU/mai-hazards-gfx942.mir
index 8f4f57a5d37c5..1ef6b4c844c93 100644
--- a/llvm/test/CodeGen/AMDGPU/mai-hazards-gfx942.mir
+++ b/llvm/test/CodeGen/AMDGPU/mai-hazards-gfx942.mir
@@ -178,11 +178,8 @@ body:             |
 ...
 # GCN-LABEL: name: xdl_sgemm16x16_mfma_write_agpr_mfma_read_overlap
 # GCN:      V_MFMA
-# GFX942-NEXT: S_NOP 7
-# GFX942-NEXT: S_NOP 0
-
-# GFX950-NEXT: S_NOP 7
-# GFX950-NEXT: S_NOP 1
+# GFX942-NEXT: S_NOP 8
+# GFX950-NEXT: S_NOP 9
 # GCN-NEXT: V_MFMA
 name:            xdl_sgemm16x16_mfma_write_agpr_mfma_read_overlap
 body:             |
@@ -192,11 +189,8 @@ body:             |
 ...
 # GCN-LABEL: name: xdl_sgemm16x16_mfma_write_vgpr_mfma_read_overlap
 # GCN:      V_MFMA
-# GFX942-NEXT: S_NOP 7
-# GFX942-NEXT: S_NOP 0
-
-# GFX950-NEXT: S_NOP 7
-# GFX950-NEXT: S_NOP 1
+# GFX942-NEXT: S_NOP 8
+# GFX950-NEXT: S_NOP 9
 # GCN-NEXT: V_MFMA
 name:            xdl_sgemm16x16_mfma_write_vgpr_mfma_read_overlap
 body:             |
@@ -225,11 +219,8 @@ body:             |
 ...
 # GCN-LABEL: name: xdl_sgemm16x16_mfma_write_agpr_smfmac_read_overlap
 # GCN:      V_MFMA
-# GFX942-NEXT: S_NOP 7
-# GFX942-NEXT: S_NOP 0
-
-# GFX950-NEXT: S_NOP 7
-# GFX950-NEXT: S_NOP 1
+# GFX942-NEXT: S_NOP 8
+# GFX950-NEXT: S_NOP 9
 # GCN-NEXT: V_SMFMAC
 name:            xdl_sgemm16x16_mfma_write_agpr_smfmac_read_overlap
 body:             |
@@ -239,8 +230,7 @@ body:             |
 ...
 # GCN-LABEL: name: xdl_sgemm32x32_mfma_write_agpr_mfma_read_overlap
 # GCN:      V_MFMA
-# GCN-NEXT: S_NOP 7
-# GCN-NEXT: S_NOP 7
+# GCN-NEXT: S_NOP 15
 # GFX942-NEXT: S_NOP 0
 # GFX950-NEXT: S_NOP 1
 # GCN-NEXT: V_MFMA
@@ -252,8 +242,7 @@ body:             |
 ...
 # GCN-LABEL: name: xdl_sgemm32x32_mfma_write_vgpr_mfma_read_overlap
 # GCN:      V_MFMA
-# GCN-NEXT: S_NOP 7
-# GCN-NEXT: S_NOP 7
+# GCN-NEXT: S_NOP 15
 # GFX942-NEXT: S_NOP 0
 # GFX950-NEXT: S_NOP 1
 # GCN-NEXT: V_MFMA
@@ -274,8 +263,7 @@ body:             |
 ...
 # GCN-LABEL: name: nonxdl_sgemm32x32_mfma_write_agpr_nonxdl_mfma_read_overlap
 # GCN:      V_MFMA
-# GCN-NEXT: S_NOP 7
-# GCN-NEXT: S_NOP 7
+# GCN-NEXT: S_NOP 15
 # GCN-NEXT: V_MFMA
 name:            nonxdl_sgemm32x32_mfma_write_agpr_nonxdl_mfma_read_overlap
 body:             |
@@ -285,8 +273,7 @@ body:             |
 ...
 # GCN-LABEL: name: xdl_sgemm32x32_mfma_write_agpr_smfmac_read_overlap
 # GCN:      V_MFMA
-# GCN-NEXT: S_NOP 7
-# GCN-NEXT: S_NOP 7
+# GCN-NEXT: S_NOP 15
 # GFX942-NEXT: S_NOP 0
 # GFX950-NEXT: S_NOP 1
 # GCN-NEXT: V_SMFMAC
@@ -298,11 +285,8 @@ body:             |
 ...
 # GCN-LABEL: name: dgemm16x16_mfma_write_vgpr_mfma_read_overlap
 # GCN:      V_MFMA
-# GFX942-NEXT: S_NOP 7
-# GFX942-NEXT: S_NOP 0
-
-# GFX950-NEXT: S_NOP 7
-# GFX950-NEXT: S_NOP 7
+# GFX942-NEXT: S_NOP 8
+# GFX950-NEXT: S_NOP 15
 # GFX950-NEXT: S_NOP 0
 # GCN-NEXT: V_MFMA
 name:            dgemm16x16_mfma_write_vgpr_mfma_read_overlap
@@ -323,11 +307,8 @@ body:             |
 ...
 # GCN-LABEL: name: dgemm16x16_mfma_write_vgpr_sgemm_mfma_read_overlap
 # GCN:      V_MFMA
-# GFX942-NEXT: S_NOP 7
-# GFX942-NEXT: S_NOP 0
-
-# GFX950-NEXT: S_NOP 7
-# GFX950-NEXT: S_NOP 7
+# GFX942-NEXT: S_NOP 8
+# GFX950-NEXT: S_NOP 15
 # GFX950-NEXT: S_NOP 0
 # GCN-NEXT: V_MFMA
 name:            dgemm16x16_mfma_write_vgpr_sgemm_mfma_read_overlap
@@ -358,9 +339,8 @@ body:             |
 ...
 # GCN-LABEL: name: xdl_sgemm16x16_mfma_write_vgpr_dgemm_mfma_read_overlap
 # GCN:      V_MFMA
-# GCN-NEXT: S_NOP 7
-# GFX942-NEXT: S_NOP 0
-# GFX950-NEXT: S_NOP 1
+# GFX942-NEXT: S_NOP 8
+# GFX950-NEXT: S_NOP 9
 # GCN-NEXT: V_MFMA
 name:            xdl_sgemm16x16_mfma_write_vgpr_dgemm_mfma_read_overlap
 body:             |
@@ -370,8 +350,7 @@ body:             |
 ...
 # GCN-LABEL: name: xdl_sgemm32x32_mfma_write_vgpr_dgemm_mfma_read_overlap
 # GCN:      V_MFMA
-# GCN-NEXT: S_NOP 7
-# GCN-NEXT: S_NOP 7
+# GCN-NEXT: S_NOP 15
 # GFX942-NEXT: S_NOP 0
 # GFX950-NEXT: S_NOP 1
 # GCN-NEXT: V_MFMA
@@ -383,9 +362,8 @@ body:             |
 ...
 # GCN-LABEL: name: xdl_sgemm16x16_mfma_write_agpr_mfma_read_partial
 # GCN:      V_MFMA
-# GCN-NEXT: S_NOP 7
-# GFX942-NEXT: S_NOP 0
-# GFX950-NEXT: S_NOP 1
+# GFX942-NEXT: S_NOP 8
+# GFX950-NEXT: S_NOP 9
 # GCN-NEXT: V_MFMA
 name:            xdl_sgemm16x16_mfma_write_agpr_mfma_read_partial
 body:             |
@@ -395,9 +373,8 @@ body:             |
 ...
 # GCN-LABEL: name: xdl_sgemm16x16_mfma_write_vgpr_mfma_read_partial
 # GCN:      V_MFMA
-# GCN-NEXT: S_NOP 7
-# GFX942-NEXT: S_NOP 0
-# GFX950-NEXT: S_NOP 1
+# GFX942-NEXT: S_NOP 8
+# GFX950-NEXT: S_NOP 9
 # GCN-NEXT: V_MFMA
 name:            xdl_sgemm16x16_mfma_write_vgpr_mfma_read_partial
 body:             |
@@ -417,9 +394,8 @@ body:             |
 ...
 # GCN-LABEL: name: xdl_sgemm16x16_mfma_write_agpr_mfma_srca_read_overlap
 # GCN:      V_MFMA
-# GCN-NEXT: S_NOP 7
-# GFX942-NEXT: S_NOP 2
-# GFX950-NEXT: S_NOP 3
+# GFX942-NEXT: S_NOP 10
+# GFX950-NEXT: S_NOP 11
 # GCN-NEXT: V_MFMA
 name:            xdl_sgemm16x16_mfma_write_agpr_mfma_srca_read_overlap
 body:             |
@@ -429,8 +405,7 @@ body:             |
 ...
 # GCN-LABEL: name: nonxdl_sgemm16x16_mfma_write_agpr_mfma_srca_read_overlap
 # GCN:      V_MFMA
-# GCN-NEXT: S_NOP 7
-# GCN-NEXT: S_NOP 1
+# GCN-NEXT: S_NOP 9
 # GCN-NEXT: V_MFMA
 name:            nonxdl_sgemm16x16_mfma_write_agpr_mfma_srca_read_overlap
 body:             |
@@ -440,9 +415,8 @@ body:             |
 ...
 # GCN-LABEL: name: smfmac32x32_write_agpr_mfma_srca_read_overlap
 # GCN:      V_SMFMAC
-# GCN-NEXT: S_NOP 7
-# GFX942-NEXT: S_NOP 2
-# GFX950-NEXT: S_NOP 3
+# GFX942-NEXT: S_NOP 10
+# GFX950-NEXT: S_NOP 11
 # GCN-NEXT: V_MFMA
 name:            smfmac32x32_write_agpr_mfma_srca_read_overlap
 body:             |
@@ -452,9 +426,8 @@ body:             |
 ...
 # GCN-LABEL: name: smfmac32x32_write_agpr_smfmac_srcc_read_overlap
 # GCN:      V_SMFMAC
-# GCN-NEXT: S_NOP 7
-# GFX942-NEXT: S_NOP 2
-# GFX950-NEXT: S_NOP 3
+# GFX942-NEXT: S_NOP 10
+# GFX950-NEXT: S_NOP 11
 # GCN-NEXT: V_SMFMAC
 name:            smfmac32x32_write_agpr_smfmac_srcc_read_overlap
 body:             |
@@ -464,8 +437,7 @@ body:             |
 ...
 # GCN-LABEL: name: xdl_sgemm32x32_mfma_write_agpr_mfma_srca_read_overlap
 # GCN:      V_MFMA
-# GCN-NEXT: S_NOP 7
-# GCN-NEXT: S_NOP 7
+# GCN-NEXT: S_NOP 15
 # GFX942-NEXT: S_NOP 2
 # GFX950-NEXT: S_NOP 3
 # GCN-NEXT: V_MFMA
@@ -477,8 +449,7 @@ body:             |
 ...
 # GCN-LABEL: name: nonxdl_sgemm32x32_mfma_write_agpr_mfma_srca_read_overlap
 # GCN:      V_MFMA
-# GCN-NEXT: S_NOP 7
-# GCN-NEXT: S_NOP 7
+# GCN-NEXT: S_NOP 15
 # GCN-NEXT: S_NOP 1
 # GCN-NEXT: V_MFMA
 name:            nonxdl_sgemm32x32_mfma_write_agpr_mfma_srca_read_overlap
@@ -539,11 +510,8 @@ body:             |
 ...
 # GCN-LABEL: name: dgemm16x16_mfma_write_vgpr_mfma_srca_read_overlap
 # GCN:      V_MFMA
-# GFX942-NEXT: S_NOP 7
-# GFX942-NEXT: S_NOP 2
-
-# GFX950-NEXT: S_NOP 7
-# GFX950-NEXT: S_NOP 7
+# GFX942-NEXT: S_NOP 10
+# GFX950-NEXT: S_NOP 15
 # GFX950-NEXT: S_NOP 2
 # GCN-NEXT: V_MFMA
 name:            dgemm16x16_mfma_write_vgpr_mfma_srca_read_overlap
@@ -564,11 +532,8 @@ body:             |
 ...
 # GCN-LABEL: name: dgemm16x16_mfma_write_vgpr_sgemm_mfma_srca_read_overlap
 # GCN:      V_MFMA
-# GFX942-NEXT: S_NOP 7
-# GFX942-NEXT: S_NOP 2
-
-# GFX950-NEXT: S_NOP 7
-# GFX950-NEXT: S_NOP 7
+# GFX942-NEXT: S_NOP 10
+# GFX950-NEXT: S_NOP 15
 # GFX950-NEXT: S_NOP 2
 # GCN-NEXT: V_MFMA
 name:            dgemm16x16_mfma_write_vgpr_sgemm_mfma_srca_read_overlap
@@ -639,11 +604,8 @@ body:             |
 ...
 # GCN-LABEL: name: dgemm16x16_mfma_write_vgpr_mfma_srcb_read_overlap
 # GCN:      V_MFMA
-# GFX942-NEXT: S_NOP 7
-# GFX942-NEXT: S_NOP 2
-
-# GFX950-NEXT: S_NOP 7
-# GFX950-NEXT: S_NOP 7
+# GFX942-NEXT: S_NOP 10
+# GFX950-NEXT: S_NOP 15
 # GFX950-NEXT: S_NOP 2
 # GCN-NEXT: V_MFMA
 name:            dgemm16x16_mfma_write_vgpr_mfma_srcb_read_overlap
@@ -654,11 +616,8 @@ body:             |
 ...
 # GCN-LABEL: name: dgemm16x16_mfma_write_vgpr_smfmac_srcb_read_overlap
 # GCN:      V_MFMA
-# GFX942-NEXT: S_NOP 7
-# GFX942-NEXT: S_NOP 2
-
-# GFX950-NEXT: S_NOP 7
-# GFX950-NEXT: S_NOP 7
+# GFX942-NEXT: S_NOP 10
+# GFX950-NEXT: S_NOP 15
 # GFX950-NEXT: S_NOP 2
 # GCN-NEXT: V_SMFMAC
 name:            dgemm16x16_mfma_write_vgpr_smfmac_srcb_read_overlap
@@ -669,11 +628,8 @@ body:             |
 ...
 # GCN-LABEL: name: dgemm16x16_mfma_write_vgpr_smfmac_srcc_read_overlap
 # GCN:      V_MFMA
-# GFX942-NEXT: S_NOP 7
-# GFX942-NEXT: S_NOP 2
-
-# GFX950-NEXT: S_NOP 7
-# GFX950-NEXT: S_NOP 7
+# GFX942-NEXT: S_NOP 10
+# GFX950-NEXT: S_NOP 15
 # GFX950-NEXT: S_NOP 2
 
 # GCN-NEXT: V_SMFMAC
@@ -746,9 +702,8 @@ body:             |
 ...
 # GCN-LABEL: name: xdl_smfma16x16_write_vgpr_flat_read
 # GCN:      V_MFMA
-# GCN-NEXT: S_NOP 7
-# GFX942-NEXT: S_NOP 2
-# GFX950-NEXT: S_NOP 3
+# GFX942-NEXT: S_NOP 10
+# GFX950-NEXT: S_NOP 11
 # GCN-NEXT: FLAT_STORE_DWORD
 name:            xdl_smfma16x16_write_vgpr_flat_read
 body:             |
@@ -758,9 +713,8 @@ body:             |
 ...
 # GCN-LABEL: name: smfmac32x32_write_vgpr_flat_read
 # GCN:      V_SMFMAC
-# GCN-NEXT: S_NOP 7
-# GFX942-NEXT: S_NOP 2
-# GFX950-NEXT: S_NOP 3
+# GFX942-NEXT: S_NOP 10
+# GFX950-NEXT: S_NOP 11
 # GCN-NEXT: FLAT_STORE_DWORD
 name:            smfmac32x32_write_vgpr_flat_read
 body:             |
@@ -770,8 +724,7 @@ body:             |
 ...
 # GCN-LABEL: name: xdl_smfma32x32_write_vgpr_flat_read
 # GCN:      V_MFMA
-# GCN-NEXT: S_NOP 7
-# GCN-NEXT: S_NOP 7
+# GCN-NEXT: S_NOP 15
 # GFX942-NEXT: S_NOP 2
 # GFX950-NEXT: S_NOP 3
 # GCN-NEXT: FLAT_STORE_DWORD
@@ -783,8 +736,7 @@ body:             |
 ...
 # GCN-LABEL: name: dmfma4x4_write_vgpr_flat_read_overlap
 # GCN:      V_MFMA
-# GCN-NEXT: S_NOP 7
-# GCN-NEXT: S_NOP 0
+# GCN-NEXT: S_NOP 8
 # GCN-NEXT: FLAT_STORE_DWORD
 name:            dmfma4x4_write_vgpr_flat_read_overlap
 body:             |
@@ -794,8 +746,7 @@ body:             |
 ...
 # GCN-LABEL: name: dmfma4x4_write_vgpr_flat_read_full
 # GCN:      V_MFMA
-# GCN-NEXT: S_NOP 7
-# GCN-NEXT: S_NOP 0
+# GCN-NEXT: S_NOP 8
 # GCN-NEXT: FLAT_STORE_DWORD
 name:            dmfma4x4_write_vgpr_flat_read_full
 body:             |
@@ -805,8 +756,7 @@ body:             |
 ...
 # GCN-LABEL: name: dmfma16x16_write_vgpr_flat_read
 # GCN:      V_MFMA
-# GCN-NEXT: S_NOP 7
-# GCN-NEXT: S_NOP 7
+# GCN-NEXT: S_NOP 15
 # GCN-NEXT: S_NOP 1
 # GCN-NEXT: FLAT_STORE_DWORD
 name:            dmfma16x16_write_vgpr_flat_read
@@ -827,9 +777,8 @@ body:             |
 ...
 # GCN-LABEL: name: xdl_smfma16x16_write_vgpr_valu_read
 # GCN:      V_MFMA
-# GCN-NEXT: S_NOP 7
-# GFX942-NEXT: S_NOP 2
-# GFX950-NEXT: S_NOP 3
+# GFX942-NEXT: S_NOP 10
+# GFX950-NEXT: S_NOP 11
 # GCN-NEXT: V_MOV_B32
 name:            xdl_smfma16x16_write_vgpr_valu_read
 body:             |
@@ -839,8 +788,7 @@ body:             |
 ...
 # GCN-LABEL: name: xdl_smfma32x32_write_vgpr_valu_read
 # GCN:      V_MFMA
-# GCN-NEXT: S_NOP 7
-# GCN-NEXT: S_NOP 7
+# GCN-NEXT: S_NOP 15
 # GFX942-NEXT: S_NOP 2
 # GFX950-NEXT: S_NOP 3
 # GCN-NEXT: V_MOV_B32
@@ -862,11 +810,8 @@ body:             |
 ...
 # GCN-LABEL: name: dmfma16x16_write_vgpr_valu_read
 # GCN:      V_MFMA
-# GFX942-NEXT: S_NOP 7
-# GFX942-NEXT: S_NOP 2
-
-# GFX950-NEXT: S_NOP 7
-# GFX950-NEXT: S_NOP 7
+# GFX942-NEXT: S_NOP 10
+# GFX950-NEXT: S_NOP 15
 # GFX950-NEXT: S_NOP 2
 # GCN-NEXT: V_MOV_B32
 name:            dmfma16x16_write_vgpr_valu_read
@@ -887,9 +832,8 @@ body:             |
 ...
 # GCN-LABEL: name: xdl_smfma16x16_write_vgpr_accv_read
 # GCN:      V_MFMA
-# GCN-NEXT: S_NOP 7
-# GFX942-NEXT: S_NOP 2
-# GFX950-NEXT: S_NOP 3
+# GFX942-NEXT: S_NOP 10
+# GFX950-NEXT: S_NOP 11
 # GCN-NEXT: V_ACCVGPR_WRITE_B32_e64
 name:            xdl_smfma16x16_write_vgpr_accv_read
 body:             |
@@ -899,8 +843,7 @@ body:             |
 ...
 # GCN-LABEL: name: xdl_smfma32x32_write_vgpr_accv_read
 # GCN:      V_MFMA
-# GCN-NEXT: S_NOP 7
-# GCN-NEXT: S_NOP 7
+# GCN-NEXT: S_NOP 15
 # GFX942-NEXT: S_NOP 2
 # GFX950-NEXT: S_NOP 3
 # GCN-NEXT: V_ACCVGPR_WRITE_B32_e64
@@ -932,11 +875,8 @@ body:             |
 ...
 # GCN-LABEL: name: dmfma16x16_write_vgpr_dot_read
 # GCN:      V_MFMA
-# GFX942-NEXT: S_NOP 7
-# GFX942-NEXT: S_NOP 2
-
-# GFX950-NEXT: S_NOP 7
-# GFX950-NEXT: S_NOP 7
+# GFX942-NEXT: S_NOP 10
+# GFX950-NEXT: S_NOP 15
 # GFX950-NEXT: S_NOP 2
 
 # GCN-NEXT: V_DOT
@@ -958,9 +898,8 @@ body:             |
 ...
 # GCN-LABEL: name: xdl_smfma16x16_write_vgpr_valu_write
 # GCN:      V_MFMA
-# GCN-NEXT: S_NOP 7
-# GFX942-NEXT: S_NOP 2
-# GFX950-NEXT: S_NOP 3
+# GFX942-NEXT: S_NOP 10
+# GFX950-NEXT: S_NOP 11
 # GCN-NEXT: V_MOV_B32
 name:            xdl_smfma16x16_write_vgpr_valu_write
 body:             |
@@ -970,8 +909,7 @@ body:             |
 ...
 # GCN-LABEL: name: xdl_smfma32x32_write_vgpr_valu_write
 # GCN:      V_MFMA
-# GCN-NEXT: S_NOP 7
-# GCN-NEXT: S_NOP 7
+# GCN-NEXT: S_NOP 15
 # GFX942-NEXT: S_NOP 2
 # GFX950-NEXT: S_NOP 3
 # GCN-NEXT: V_MOV_B32
@@ -993,9 +931,8 @@ body:             |
 ...
 # GCN-LABEL: name: xdl_smfma16x16_write_vgpr_valu_f16_write
 # GCN:      V_MFMA
-# GCN-NEXT: S_NOP 7
-# GFX942-NEXT: S_NOP 2
-# GFX950-NEXT: S_NOP 3
+# GFX942-NEXT: S_NOP 10
+# GFX950-NEXT: S_NOP 11
 # GCN-NEXT: V_FMA_F16_e64
 name:            xdl_smfma16x16_write_vgpr_valu_f16_write
 body:             |
@@ -1005,8 +942,7 @@ body:             |
 ...
 # GCN-LABEL: name: xdl_smfma32x32_write_vgpr_valu_f16_write
 # GCN:      V_MFMA
-# GCN-NEXT: S_NOP 7
-# GCN-NEXT: S_NOP 7
+# GCN-NEXT: S_NOP 15
 # GFX942-NEXT: S_NOP 2
 # GFX950-NEXT: S_NOP 3
 # GCN-NEXT: V_FMA_F16_e64
@@ -1028,9 +964,8 @@ body:             |
 ...
 # GCN-LABEL: name: xdl_smfma16x16_write_vgpr_valu_sdwa_write
 # GCN:      V_MFMA
-# GCN-NEXT: S_NOP 7
-# GFX942-NEXT: S_NOP 2
-# GFX950-NEXT: S_NOP 3
+# GFX942-NEXT: S_NOP 10
+# GFX950-NEXT: S_NOP 11
 # GCN-NEXT: V_MOV_B32_sdwa
 name:            xdl_smfma16x16_write_vgpr_valu_sdwa_write
 body:             |
@@ -1040,8 +975,7 @@ body:             |
 ...
 # GCN-LABEL: name: xdl_smfma32x32_write_vgpr_valu_sdwa_write
 # GCN:      V_MFMA
-# GCN-NEXT: S_NOP 7
-# GCN-NEXT: S_NOP 7
+# GCN-NEXT: S_NOP 15
 # GFX942-NEXT: S_NOP 2
 # GFX950-NEXT: S_NOP 3
 # GCN-NEXT: V_MOV_B32_sdwa
@@ -1063,8 +997,7 @@ body:             |
 ...
 # GCN-LABEL: name: dmfma16x16_write_vgpr_valu_write
 # GCN:      V_MFMA
-# GCN-NEXT: S_NOP 7
-# GCN-NEXT: S_NOP 2
+# GCN-NEXT: S_NOP 10
 # GCN-NEXT: V_MOV_B32
 name:            dmfma16x16_write_vgpr_valu_write
 body:             |
@@ -1379,11 +1312,8 @@ body:             |
 ...
 # GCN-LABEL: name: dgemm16x16_mfma_write_agpr_mfma_read_overlap
 # GCN:      V_MFMA
-# GFX942-NEXT: S_NOP 7
-# GFX942-NEXT: S_NOP 0
-
-# GFX950-NEXT: S_NOP 7
-# GFX950-NEXT: S_NOP 7
+# GFX942-NEXT: S_NOP 8
+# GFX950-NEXT: S_NOP 15
 # GFX950-NEXT: S_NOP 0
 # GCN-NEXT: V_MFMA
 name:            dgemm16x16_mfma_write_agpr_mfma_read_overlap
@@ -1404,11 +1334,8 @@ body:             |
 ...
 # GCN-LABEL: name: dgemm16x16_mfma_write_agpr_sgemm_mfma_read_overlap
 # GCN:      V_MFMA
-# GFX942-NEXT: S_NOP 7
-# GFX942-NEXT: S_NOP 0
-
-# GFX950-NEXT: S_NOP 7
-# GFX950-NEXT: S_NOP 7
+# GFX942-NEXT: S_NOP 8
+# GFX950-NEXT: S_NOP 15
 # GFX950-NEXT: S_NOP 0
 
 # GCN-NEXT: V_MFMA
@@ -1430,9 +1357,8 @@ body:             |
 ...
 # GCN-LABEL: name: xdl_sgemm16x16_mfma_write_sgpr_dgemm_mfma_read_overlap
 # GCN:      V_MFMA
-# GCN-NEXT: S_NOP 7
-# GFX942-NEXT: S_NOP 0
-# GFX950-NEXT: S_NOP 1
+# GFX942-NEXT: S_NOP 8
+# GFX950-NEXT: S_NOP 9
 # GCN-NEXT: V_MFMA
 name:            xdl_sgemm16x16_mfma_write_sgpr_dgemm_mfma_read_overlap
 body:             |
@@ -1442,8 +1368,7 @@ body:             |
 ...
 # GCN-LABEL: name: xdl_sgemm32x32_mfma_write_agpr_dgemm_mfma_read_overlap
 # GCN:      V_MFMA
-# GCN-NEXT: S_NOP 7
-# GCN-NEXT: S_NOP 7
+# GCN-NEXT: S_NOP 15
 # GFX942-NEXT: S_NOP 0
 # GFX950-NEXT: S_NOP 1
 # GCN-NEXT: V_MFMA
@@ -1485,11 +1410,8 @@ body:             |
 ...
 # GCN-LABEL: name: dgemm16x16_mfma_write_agpr_mfma_srca_read_overlap
 # GCN:      V_MFMA
-# GFX942-NEXT: S_NOP 7
-# GFX942-NEXT: S_NOP 2
-
-# GFX950-NEXT: S_NOP 7
-# GFX950-NEXT: S_NOP 7
+# GFX942-NEXT: S_NOP 10
+# GFX950-NEXT: S_NOP 15
 # GFX950-NEXT: S_NOP 2
 # GCN-NEXT: V_MFMA
 name:            dgemm16x16_mfma_write_agpr_mfma_srca_read_overlap
@@ -1510,11 +1432,8 @@ body:             |
 ...
 # GCN-LABEL: name: dgemm16x16_mfma_write_agpr_sgemm_mfma_srca_read_overlap
 # GCN:      V_MFMA
-# GFX942-NEXT: S_NOP 7
-# GFX942-NEXT: S_NOP 2
-
-# GFX950-NEXT: S_NOP 7
-# GFX950-NEXT: S_NOP 7
+# GFX942-NEXT: S_NOP 10
+# GFX950-NEXT: S_NOP 15
 # GFX950-NEXT: S_NOP 2
 
 # GCN-NEXT: V_MFMA
@@ -1546,11 +1465,8 @@ body:             |
 ...
 # GCN-LABEL: name: dgemm16x16_mfma_write_agpr_mfma_srcb_read_overlap
 # GCN:      V_MFMA
-# GFX942-NEXT: S_NOP 7
-# GFX942-NEXT: S_NOP 2
-
-# GFX950-NEXT: S_NOP 7
-# GFX950-NEXT: S_NOP 7
+# GFX942-NEXT: S_NOP 10
+# GFX950-NEXT: S_NOP 15
 # GFX950-NEXT: S_NOP 2
 # GCN-NEXT: V_MFMA
 name:            dgemm16x16_mfma_write_agpr_mfma_srcb_read_overlap
@@ -1561,8 +1477,7 @@ body:             |
 ...
 # GCN-LABEL: name: dmfma4x4_write_agpr_flat_read_overlap
 # GCN:      V_MFMA
-# GCN-NEXT: S_NOP 7
-# GCN-NEXT: S_NOP 0
+# GCN-NEXT: S_NOP 8
 # GCN-NEXT: FLAT_STORE_DWORD
 name:            dmfma4x4_write_agpr_flat_read_overlap
 body:             |
@@ -1572,8 +1487,7 @@ body:             |
 ...
 # GCN-LABEL: name: dmfma4x4_write_agpr_flat_read_full
 # GCN:      V_MFMA
-# GCN-NEXT: S_NOP 7
-# GCN-NEXT: S_NOP 0
+# GCN-NEXT: S_NOP 8
 # GCN-NEXT: FLAT_STORE_DWORD
 name:            dmfma4x4_write_agpr_flat_read_full
 body:             |
@@ -1583,8 +1497,7 @@ body:             |
 ...
 # GCN-LABEL: name: dmfma16x16_write_agpr_flat_read
 # GCN:      V_MFMA
-# GCN-NEXT: S_NOP 7
-# GCN-NEXT: S_NOP 7
+# GCN-NEXT: S_NOP 15
 # GCN-NEXT: S_NOP 1
 # GCN-NEXT: FLAT_STORE_DWORD
 name:            dmfma16x16_write_agpr_flat_read
@@ -1605,11 +1518,8 @@ body:             |
 ...
 # GCN-LABEL: name: dmfma16x16_write_agpr_valu_read
 # GCN:      V_MFMA
-# GFX942-NEXT: S_NOP 7
-# GFX942-NEXT: S_NOP 2
-
-# GFX950-NEXT: S_NOP 7
-# GFX950-NEXT: S_NOP 7
+# GFX942-NEXT: S_NOP 10
+# GFX950-NEXT: S_NOP 15
 # GFX950-NEXT: S_NOP 2
 # GCN-NEXT: V_ACCVGPR_READ_B32_e64
 name:            dmfma16x16_write_agpr_valu_read
@@ -1630,8 +1540,7 @@ body:             |
 ...
 # GCN-LABEL: name: dmfma16x16_write_agpr_valu_write
 # GCN:      V_MFMA
-# GCN-NEXT: S_NOP 7
-# GCN-NEXT: S_NOP 2
+# GCN-NEXT: S_NOP 10
 # GCN-NEXT: V_ACCVGPR_WRITE_B32_e64
 name:            dmfma16x16_write_agpr_valu_write
 body:             |
@@ -1840,9 +1749,8 @@ body:             |
 ...
 # GCN-LABEL: name: smfmac32x32x32_mfma_write_agpr_mfma_read_overlap
 # GCN:      V_SMFMAC
-# GCN-NEXT: S_NOP 7
-# GFX942-NEXT: S_NOP 0
-# GFX950-NEXT: S_NOP 1
+# GFX942-NEXT: S_NOP 8
+# GFX950-NEXT: S_NOP 9
 # GCN-NEXT: V_SMFMAC
 name:            smfmac32x32x32_mfma_write_agpr_mfma_read_overlap
 body:             |
@@ -1959,8 +1867,7 @@ body:             |
 ...
 # GCN-LABEL: name: nonxdl_8pass_smfma16x16_write_vgpr_vm_read
 # GCN:      V_MFMA
-# GCN-NEXT: S_NOP 7
-# GCN-NEXT: S_NOP 1
+# GCN-NEXT: S_NOP 9
 # GCN-NEXT: BUFFER_STORE_DWORD
 name:            nonxdl_8pass_smfma16x16_write_vgpr_vm_read
 body:             |
@@ -1970,8 +1877,7 @@ body:             |
 ...
 # GCN-LABEL: name: nonxdl_8pass_smfma16x16_write_vgpr_valu_read
 # GCN:      V_MFMA
-# GCN-NEXT: S_NOP 7
-# GCN-NEXT: S_NOP 1
+# GCN-NEXT: S_NOP 9
 # GCN-NEXT: V_MOV_B32
 name:            nonxdl_8pass_smfma16x16_write_vgpr_valu_read
 body:             |
@@ -1981,8 +1887,7 @@ body:             |
 ...
 # GCN-LABEL: name: nonxdl_8pass_smfma16x16_write_vgpr_valu_write
 # GCN:      V_MFMA
-# GCN-NEXT: S_NOP 7
-# GCN-NEXT: S_NOP 1
+# GCN-NEXT: S_NOP 9
 # GCN-NEXT: V_MOV_B32
 name:            nonxdl_8pass_smfma16x16_write_vgpr_valu_write
 body:             |
@@ -1992,8 +1897,7 @@ body:             |
 ...
 # GCN-LABEL: name: nonxdl_smfma32x32_write_vgpr_vm_read
 # GCN:      V_MFMA
-# GCN-NEXT: S_NOP 7
-# GCN-NEXT: S_NOP 7
+# GCN-NEXT: S_NOP 15
 # GCN-NEXT: S_NOP 1
 # GCN-NEXT: BUFFER_STORE_DWORD
 name:            nonxdl_smfma32x32_write_vgpr_vm_read
@@ -2004,8 +1908,7 @@ body:             |
 ...
 # GCN-LABEL: name: nonxdl_smfma32x32_write_vgpr_valu_read
 # GCN:      V_MFMA
-# GCN-NEXT: S_NOP 7
-# GCN-NEXT: S_NOP 7
+# GCN-NEXT: S_NOP 15
 # GCN-NEXT: S_NOP 1
 # GCN-NEXT: V_MOV_B32
 name:            nonxdl_smfma32x32_write_vgpr_valu_read
@@ -2016,8 +1919,7 @@ body:             |
 ...
 # GCN-LABEL: name: nonxdl_smfma32x32_write_vgpr_valu_write
 # GCN:      V_MFMA
-# GCN-NEXT: S_NOP 7
-# GCN-NEXT: S_NOP 7
+# GCN-NEXT: S_NOP 15
 # GCN-NEXT: S_NOP 1
 # GCN-NEXT: V_MOV_B32
 name:            nonxdl_smfma32x32_write_vgpr_valu_write
@@ -2109,9 +2011,8 @@ body:             |
 ...
 # GCN-LABEL: name: smfmac32x32_read_vgpr_srcc_valu_write
 # GCN:      V_SMFMAC
-# GCN-NEXT: S_NOP 7
-# GFX942-NEXT: S_NOP 2
-# GFX950-NEXT: S_NOP 3
+# GFX942-NEXT: S_NOP 10
+# GFX950-NEXT: S_NOP 11
 # GCN-NEXT: V_MOV_B32
 name:            smfmac32x32_read_vgpr_srcc_valu_write
 body:             |
@@ -2121,8 +2022,7 @@ body:             |
 ...
 # GCN-LABEL: name: xdl_sgemm32x32_mfma_read_vgpr_srcc_valu_write
 # GCN:      V_MFMA
-# GCN-NEXT: S_NOP 7
-# GCN-NEXT: S_NOP 6
+# GCN-NEXT: S_NOP 14
 # GCN-NEXT: V_MOV_B32
 name:            xdl_sgemm32x32_mfma_read_vgpr_srcc_valu_write
 body:             |
@@ -2337,9 +2237,8 @@ body:             |
 # 8 pass source
 # GCN-LABEL: name: xdl_mfma_8pass_write_vgpr_nonxdl_sgemm_mfma_read_overlap_srcc
 # GCN:      V_MFMA
-# GCN-NEXT: S_NOP 7
-# GFX942-NEXT: S_NOP 0
-# GFX950-NEXT: S_NOP 1
+# GFX942-NEXT: S_NOP 8
+# GFX950-NEXT: S_NOP 9
 # GCN-NEXT: V_MFMA
 name: xdl_mfma_8pass_write_vgpr_nonxdl_sgemm_mfma_read_overlap_srcc
 body:             |
@@ -2353,9 +2252,8 @@ body:             |
 # 8 pass source
 # GCN-LABEL: name: xdl_mfma_8pass_write_vgpr_nonxdl_sgemm_mfma_read_overlap_srca
 # GCN:      V_MFMA
-# GCN-NEXT: S_NOP 7
-# GFX942-NEXT: S_NOP 2
-# GFX950-NEXT: S_NOP 3
+# GFX942-NEXT: S_NOP 10
+# GFX950-NEXT: S_NOP 11
 # GCN-NEXT: V_MFMA
 name: xdl_mfma_8pass_write_vgpr_nonxdl_sgemm_mfma_read_overlap_srca
 body:             |
@@ -2369,9 +2267,8 @@ body:             |
 # 8 pass source
 # GCN-LABEL: name: xdl_mfma_8pass_write_vgpr_nonxdl_sgemm_mfma_read_overlap_srcb
 # GCN:      V_MFMA
-# GCN-NEXT: S_NOP 7
-# GFX942-NEXT: S_NOP 2
-# GFX950-NEXT: S_NOP 3
+# GFX942-NEXT: S_NOP 10
+# GFX950-NEXT: S_NOP 11
 # GCN-NEXT: V_MFMA
 name: xdl_mfma_8pass_write_vgpr_nonxdl_sgemm_mfma_read_overlap_srcb
 body:             |
@@ -2385,8 +2282,7 @@ body:             |
 # 16 pass source
 # GCN-LABEL: name: xdl_16pass_write_vgpr_nonxdl_sgemm_mfma_read_overlap_srcc
 # GCN:      V_MFMA
-# GCN-NEXT: S_NOP 7
-# GCN-NEXT: S_NOP 7
+# GCN-NEXT: S_NOP 15
 # GFX942-NEXT: S_NOP 0
 # GFX950-NEXT: S_NOP 1
 # GCN-NEXT: V_MFMA
@@ -2403,8 +2299,7 @@ body:             |
 # 16 pass source
 # GCN-LABEL: name: xdl_16pass_write_vgpr_nonxdl_sgemm_mfma_read_overlap_srca
 # GCN:      V_MFMA
-# GCN-NEXT: S_NOP 7
-# GCN-NEXT: S_NOP 7
+# GCN-NEXT: S_NOP 15
 # GFX942-NEXT: S_NOP 2
 # GFX950-NEXT: S_NOP 3
 # GCN-NEXT: V_MFMA
@@ -2420,8 +2315,7 @@ body:             |
 # 16 pass source
 # GCN-LABEL: name: xdl_16pass_write_vgpr_nonxdl_sgemm_mfma_read_overlap_srcb
 # GCN:      V_MFMA
-# GCN-NEXT: S_NOP 7
-# GCN-NEXT: S_NOP 7
+# GCN-NEXT: S_NOP 15
 # GFX942-NEXT: S_NOP 2
 # GFX950-NEXT: S_NOP 3
 # GCN-NEXT: V_MFMA
@@ -2450,8 +2344,7 @@ body:             |
 # 8 pass source
 # GCN-LABEL: name: nonxdl_8pass_write_vgpr_nonxdl_sgemm_mfma_read_overlap_srca
 # GCN:      V_MFMA
-# GCN-NEXT: S_NOP 7
-# GCN-NEXT: S_NOP 1
+# GCN-NEXT: S_NOP 9
 # GCN-NEXT: V_MFMA
 name:            nonxdl_8pass_write_vgpr_nonxdl_sgemm_mfma_read_overlap_srca
 body:             |
@@ -2464,8 +2357,7 @@ body:             |
 # 8 pass source
 # GCN-LABEL: name: nonxdl_8pass_write_vgpr_nonxdl_sgemm_mfma_read_overlap_srcb
 # GCN:      V_MFMA
-# GCN-NEXT: S_NOP 7
-# GCN-NEXT: S_NOP 1
+# GCN-NEXT: S_NOP 9
 # GCN-NEXT: V_MFMA
 name:            nonxdl_8pass_write_vgpr_nonxdl_sgemm_mfma_read_overlap_srcb
 body:             |
@@ -2477,9 +2369,8 @@ body:             |
 # 8 pass source
 # GCN-LABEL: name: xdl_mfma_8pass_write_vgpr_xdl_mfma_read_overlap_srcc
 # GCN:      V_MFMA
-# GCN-NEXT: S_NOP 7
-# GFX942-NEXT: S_NOP 0
-# GFX950-NEXT: S_NOP 1
+# GFX942-NEXT: S_NOP 8
+# GFX950-NEXT: S_NOP 9
 # GCN-NEXT: V_MFMA
 name:            xdl_mfma_8pass_write_vgpr_xdl_mfma_read_overlap_srcc
 body:             |
@@ -2492,9 +2383,8 @@ body:             |
 # 8 pass source
 # GCN-LABEL: name: xdl_mfma_8pass_write_vgpr_xdl_mfma_read_overlap_srca
 # GCN:      V_MFMA
-# GCN-NEXT: S_NOP 7
-# GFX942-NEXT: S_NOP 2
-# GFX950-NEXT: S_NOP 3
+# GFX942-NEXT: S_NOP 10
+# GFX950-NEXT: S_NOP 11
 # GCN-NEXT: V_MFMA
 name:            xdl_mfma_8pass_write_vgpr_xdl_mfma_read_overlap_srca
 body:             |
@@ -2507,9 +2397,8 @@ body:             |
 # 8 pass source
 # GCN-LABEL: name: xdl_mfma_8pass_write_vgpr_xdl_mfma_read_overlap_srcb
 # GCN:      V_MFMA
-# GCN-NEXT: S_NOP 7
-# GFX942-NEXT: S_NOP 2
-# GFX950-NEXT: S_NOP 3
+# GFX942-NEXT: S_NOP 10
+# GFX950-NEXT: S_NOP 11
 # GCN-NEXT: V_MFMA
 name:            xdl_mfma_8pass_write_vgpr_xdl_mfma_read_overlap_srcb
 body:             |
@@ -2522,8 +2411,7 @@ body:             |
 # 16 pass source
 # GCN-LABEL: name: xdl_16pass_write_vgpr_xdl_mfma_read_overlap_srcc
 # GCN:      V_MFMA
-# GCN-NEXT: S_NOP 7
-# GCN-NEXT: S_NOP 7
+# GCN-NEXT: S_NOP 15
 # GFX942-NEXT: S_NOP 0
 # GFX950-NEXT: S_NOP 1
 # GCN-NEXT: V_MFMA
@@ -2539,8 +2427,7 @@ body:             |
 # 16 pass source
 # GCN-LABEL: name: xdl_16pass_write_vgpr_xdl_mfma_read_overlap_srca
 # GCN:      V_MFMA
-# GCN-NEXT: S_NOP 7
-# GCN-NEXT: S_NOP 7
+# GCN-NEXT: S_NOP 15
 # GFX942-NEXT: S_NOP 2
 # GFX950-NEXT: S_NOP 3
 # GCN-NEXT: V_MFMA
@@ -2557,8 +2444,7 @@ body:             |
 # 16 pass source
 # GCN-LABEL: name: xdl_16pass_write_vgpr_xdl_mfma_read_overlap_srcb
 # GCN:      V_MFMA
-# GCN-NEXT: S_NOP 7
-# GCN-NEXT: S_NOP 7
+# GCN-NEXT: S_NOP 15
 # GFX942-NEXT: S_NOP 2
 # GFX950-NEXT: S_NOP 3
 # GCN-NEXT: V_MFMA
@@ -2603,9 +2489,8 @@ body:             |
 ...
 # GCN-LABEL: name: xdl_8pass_mfma_write_agpr_smfmac_read_overlap_srcc
 # GCN:      V_MFMA
-# GCN-NEXT: S_NOP 7
-# GFX942-NEXT: S_NOP 0
-# GFX950-NEXT: S_NOP 1
+# GFX942-NEXT: S_NOP 8
+# GFX950-NEXT: S_NOP 9
 # GCN-NEXT: V_SMFMAC_
 name:            xdl_8pass_mfma_write_agpr_smfmac_read_overlap_srcc
 body:             |
@@ -2617,8 +2502,7 @@ body:             |
 ...
 # GCN-LABEL: name: xdl_16pass_mfma_write_agpr_smfmac_read_overlap_srcc
 # GCN:      V_MFMA
-# GCN-NEXT: S_NOP 7
-# GCN-NEXT: S_NOP 7
+# GCN-NEXT: S_NOP 15
 # GFX942-NEXT: S_NOP 0
 # GFX950-NEXT: S_NOP 1
 # GCN-NEXT: V_SMFMAC_
diff --git a/llvm/test/CodeGen/AMDGPU/mai-hazards-mfma-scale.gfx950.mir b/llvm/test/CodeGen/AMDGPU/mai-hazards-mfma-scale.gfx950.mir
index c01c2be23b83f..7708c8fc00609 100644
--- a/llvm/test/CodeGen/AMDGPU/mai-hazards-mfma-scale.gfx950.mir
+++ b/llvm/test/CodeGen/AMDGPU/mai-hazards-mfma-scale.gfx950.mir
@@ -15,8 +15,7 @@ body:             |
     ; GCN: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21
     ; GCN-NEXT: {{  $}}
     ; GCN-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3 = nofpexcept V_MFMA_F32_16X16X128_F8F6F4_f8_f8_vgprcd_e64 $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11, $vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $mode, implicit $exec
-    ; GCN-NEXT: S_NOP 7
-    ; GCN-NEXT: S_NOP 1
+    ; GCN-NEXT: S_NOP 9
     ; GCN-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3 = nofpexcept V_MFMA_F32_16X16X128_F8F6F4_f8_f8_vgprcd_e64 killed $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11, killed $vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19, killed $vgpr2_vgpr3_vgpr4_vgpr5, 0, 0, implicit $mode, implicit $exec
     ; GCN-NEXT: S_SETPC_B64_return undef $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
     renamable $vgpr0_vgpr1_vgpr2_vgpr3 = nofpexcept V_MFMA_F32_16X16X128_F8F6F4_f8_f8_vgprcd_e64 $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11, $vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $mode, implicit $exec
@@ -37,8 +36,7 @@ body:             |
     ; GCN: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21
     ; GCN-NEXT: {{  $}}
     ; GCN-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3 = nofpexcept V_MFMA_F32_16X16X128_F8F6F4_f8_f8_vgprcd_e64 $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11, $vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19, killed $vgpr0_vgpr1_vgpr2_vgpr3, 1, 1, implicit $mode, implicit $exec
-    ; GCN-NEXT: S_NOP 7
-    ; GCN-NEXT: S_NOP 1
+    ; GCN-NEXT: S_NOP 9
     ; GCN-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3 = nofpexcept V_MFMA_F32_16X16X128_F8F6F4_f8_f8_vgprcd_e64 killed $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11, killed $vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19, killed $vgpr2_vgpr3_vgpr4_vgpr5, 0, 0, implicit $mode, implicit $exec
     ; GCN-NEXT: S_SETPC_B64_return undef $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
     renamable $vgpr0_vgpr1_vgpr2_vgpr3 = nofpexcept V_MFMA_F32_16X16X128_F8F6F4_f8_f8_vgprcd_e64 $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11, $vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19, killed $vgpr0_vgpr1_vgpr2_vgpr3, 1, 1, implicit $mode, implicit $exec
@@ -59,8 +57,7 @@ body:             |
     ; GCN: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21
     ; GCN-NEXT: {{  $}}
     ; GCN-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3 = nofpexcept V_MFMA_F32_16X16X128_F8F6F4_f8_f8_vgprcd_e64 $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11, $vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19, killed $vgpr0_vgpr1_vgpr2_vgpr3, 2, 0, implicit $mode, implicit $exec
-    ; GCN-NEXT: S_NOP 7
-    ; GCN-NEXT: S_NOP 1
+    ; GCN-NEXT: S_NOP 9
     ; GCN-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3 = nofpexcept V_MFMA_F32_16X16X128_F8F6F4_f8_f8_vgprcd_e64 killed $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11, killed $vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19, killed $vgpr2_vgpr3_vgpr4_vgpr5, 0, 0, implicit $mode, implicit $exec
     ; GCN-NEXT: S_SETPC_B64_return undef $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
     renamable $vgpr0_vgpr1_vgpr2_vgpr3 = nofpexcept V_MFMA_F32_16X16X128_F8F6F4_f8_f8_vgprcd_e64 $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11, $vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19, killed $vgpr0_vgpr1_vgpr2_vgpr3, 2, 0, implicit $mode, implicit $exec
@@ -81,8 +78,7 @@ body:             |
     ; GCN: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21
     ; GCN-NEXT: {{  $}}
     ; GCN-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3 = nofpexcept V_MFMA_F32_16X16X128_F8F6F4_f8_f8_vgprcd_e64 $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11, $vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 2, implicit $mode, implicit $exec
-    ; GCN-NEXT: S_NOP 7
-    ; GCN-NEXT: S_NOP 1
+    ; GCN-NEXT: S_NOP 9
     ; GCN-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3 = nofpexcept V_MFMA_F32_16X16X128_F8F6F4_f8_f8_vgprcd_e64 killed $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11, killed $vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19, killed $vgpr2_vgpr3_vgpr4_vgpr5, 0, 0, implicit $mode, implicit $exec
     ; GCN-NEXT: S_SETPC_B64_return undef $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
     renamable $vgpr0_vgpr1_vgpr2_vgpr3 = nofpexcept V_MFMA_F32_16X16X128_F8F6F4_f8_f8_vgprcd_e64 $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11, $vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 2, implicit $mode, implicit $exec
@@ -163,8 +159,7 @@ body:             |
     ; GCN: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33
     ; GCN-NEXT: {{  $}}
     ; GCN-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = nofpexcept V_MFMA_SCALE_F32_32X32X64_F8F6F4_f8_f8_vgprcd_e64 $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 0, 0, $vgpr33, $vgpr32, 12, 4, implicit $mode, implicit $exec
-    ; GCN-NEXT: S_NOP 7
-    ; GCN-NEXT: S_NOP 7
+    ; GCN-NEXT: S_NOP 15
     ; GCN-NEXT: S_NOP 1
     ; GCN-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = nofpexcept V_MFMA_SCALE_F32_32X32X64_F8F6F4_f8_f8_vgprcd_e64 $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17, 0, 0, $vgpr33, killed $vgpr32, 12, 4, implicit $mode, implicit $exec
     ; GCN-NEXT: S_SETPC_B64_return undef $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
@@ -186,8 +181,7 @@ body:             |
     ; GCN: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33
     ; GCN-NEXT: {{  $}}
     ; GCN-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = nofpexcept V_MFMA_SCALE_F32_32X32X64_F8F6F4_f8_f8_vgprcd_e64 $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 2, 2, $vgpr33, $vgpr32, 12, 4, implicit $mode, implicit $exec
-    ; GCN-NEXT: S_NOP 7
-    ; GCN-NEXT: S_NOP 1
+    ; GCN-NEXT: S_NOP 9
     ; GCN-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = nofpexcept V_MFMA_SCALE_F32_32X32X64_F8F6F4_f8_f8_vgprcd_e64 $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17, 0, 0, $vgpr33, killed $vgpr32, 12, 4, implicit $mode, implicit $exec
     ; GCN-NEXT: S_SETPC_B64_return undef $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
     renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = nofpexcept V_MFMA_SCALE_F32_32X32X64_F8F6F4_f8_f8_vgprcd_e64 $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 2, 2, $vgpr33, $vgpr32, 12, 4, implicit $mode, implicit $exec
@@ -208,8 +202,7 @@ body:             |
     ; GCN: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33
     ; GCN-NEXT: {{  $}}
     ; GCN-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = nofpexcept V_MFMA_SCALE_F32_32X32X64_F8F6F4_f8_f8_vgprcd_e64 $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 0, 0, $vgpr33, $vgpr32, 12, 4, implicit $mode, implicit $exec
-    ; GCN-NEXT: S_NOP 7
-    ; GCN-NEXT: S_NOP 7
+    ; GCN-NEXT: S_NOP 15
     ; GCN-NEXT: S_NOP 1
     ; GCN-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = nofpexcept V_MFMA_SCALE_F32_32X32X64_F8F6F4_f8_f8_vgprcd_e64 $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17, 0, 0, $vgpr33, $vgpr32, 12, 4, implicit $mode, implicit $exec
     ; GCN-NEXT: S_SETPC_B64_return undef $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
@@ -231,8 +224,7 @@ body:             |
     ; GCN: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33
     ; GCN-NEXT: {{  $}}
     ; GCN-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = nofpexcept V_MFMA_SCALE_F32_32X32X64_F8F6F4_f8_f8_vgprcd_e64 $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 2, 2, $vgpr33, $vgpr32, 12, 4, implicit $mode, implicit $exec
-    ; GCN-NEXT: S_NOP 7
-    ; GCN-NEXT: S_NOP 1
+    ; GCN-NEXT: S_NOP 9
     ; GCN-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = nofpexcept V_MFMA_SCALE_F32_32X32X64_F8F6F4_f8_f8_vgprcd_e64 $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17, 0, 0, $vgpr33, $vgpr32, 12, 4, implicit $mode, implicit $exec
     ; GCN-NEXT: S_SETPC_B64_return undef $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
     renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = nofpexcept V_MFMA_SCALE_F32_32X32X64_F8F6F4_f8_f8_vgprcd_e64 $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 2, 2, $vgpr33, $vgpr32, 12, 4, implicit $mode, implicit $exec
@@ -253,8 +245,7 @@ body:             |
     ; GCN: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33
     ; GCN-NEXT: {{  $}}
     ; GCN-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3 = nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_vgprcd_e64 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19, 0, 0, $vgpr33, $vgpr21, 12, 4, implicit $mode, implicit $exec
-    ; GCN-NEXT: S_NOP 7
-    ; GCN-NEXT: S_NOP 3
+    ; GCN-NEXT: S_NOP 11
     ; GCN-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3 = nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_vgprcd_e64 killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, killed $vgpr2_vgpr3_vgpr4_vgpr5, 0, 0, killed $vgpr33, killed $vgpr21, 12, 4, implicit $mode, implicit $exec
     ; GCN-NEXT: S_SETPC_B64_return undef $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
     renamable $vgpr0_vgpr1_vgpr2_vgpr3 = nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_vgprcd_e64 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19, 0, 0, $vgpr33, $vgpr21, 12, 4, implicit $mode, implicit $exec
diff --git a/llvm/test/CodeGen/AMDGPU/mai-hazards.mir b/llvm/test/CodeGen/AMDGPU/mai-hazards.mir
index ce67a2eec93bc..61f2629dded83 100644
--- a/llvm/test/CodeGen/AMDGPU/mai-hazards.mir
+++ b/llvm/test/CodeGen/AMDGPU/mai-hazards.mir
@@ -157,8 +157,7 @@ body:             |
 
 # GCN-LABEL: name: mfma_16x16_write_agpr_accvgpr_read
 # GCN:      V_MFMA_F32_16X16X1F32
-# GCN-NEXT: S_NOP 7
-# GCN-NEXT: S_NOP 1
+# GCN-NEXT: S_NOP 9
 # GCN-NEXT: V_ACCVGPR_READ_B32_e64
 name:            mfma_16x16_write_agpr_accvgpr_read
 body:             |
@@ -170,8 +169,7 @@ body:             |
 
 # GCN-LABEL: name: mfma_32x32_write_agpr_accvgpr_read
 # GCN:      V_MFMA_F32_32X32X2F32
-# GCN-NEXT: S_NOP 7
-# GCN-NEXT: S_NOP 7
+# GCN-NEXT: S_NOP 15
 # GCN-NEXT: S_NOP 1
 # GCN-NEXT: V_ACCVGPR_READ_B32_e64
 name:            mfma_32x32_write_agpr_accvgpr_read
@@ -208,8 +206,7 @@ body:             |
 
 # GCN-LABEL: name: mfma_32x32_write_agpr_accvgpr_write
 # GCN:      V_MFMA_F32_32X32X2F32
-# GCN-NEXT: S_NOP 7
-# GCN-NEXT: S_NOP 6
+# GCN-NEXT: S_NOP 14
 # GCN-NEXT: V_ACCVGPR_WRITE_B32_e64
 name:            mfma_32x32_write_agpr_accvgpr_write
 body:             |
@@ -244,8 +241,7 @@ body:             |
 
 # GCN-LABEL: name: mfma_32x32_read_srcc_accvgpr_write
 # GCN:      V_MFMA_F32_32X32X2F32
-# GCN-NEXT: S_NOP 7
-# GCN-NEXT: S_NOP 4
+# GCN-NEXT: S_NOP 12
 # GCN-NEXT: V_ACCVGPR_WRITE_B32_e64
 name:            mfma_32x32_read_srcc_accvgpr_write
 body:             |
diff --git a/llvm/test/CodeGen/AMDGPU/mfma-cd-select.ll b/llvm/test/CodeGen/AMDGPU/mfma-cd-select.ll
index f7aaa3ec4d0ed..9585c486aeb9e 100644
--- a/llvm/test/CodeGen/AMDGPU/mfma-cd-select.ll
+++ b/llvm/test/CodeGen/AMDGPU/mfma-cd-select.ll
@@ -84,8 +84,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32_vgpr(ptr addrspace(1) %arg)
 ; GFX908-NEXT:    v_mov_b32_e32 v0, 2.0
 ; GFX908-NEXT:    s_nop 1
 ; GFX908-NEXT:    v_mfma_f32_32x32x1f32 a[0:31], v3, v0, a[0:31]
-; GFX908-NEXT:    s_nop 7
-; GFX908-NEXT:    s_nop 7
+; GFX908-NEXT:    s_nop 15
 ; GFX908-NEXT:    s_nop 1
 ; GFX908-NEXT:    v_accvgpr_read_b32 v3, a27
 ; GFX908-NEXT:    v_accvgpr_read_b32 v2, a26
@@ -227,8 +226,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32_agpr(ptr addrspace(1) %arg)
 ; GFX908-NEXT:    v_mov_b32_e32 v0, 2.0
 ; GFX908-NEXT:    s_nop 1
 ; GFX908-NEXT:    v_mfma_f32_32x32x1f32 a[0:31], v3, v0, a[0:31]
-; GFX908-NEXT:    s_nop 7
-; GFX908-NEXT:    s_nop 7
+; GFX908-NEXT:    s_nop 15
 ; GFX908-NEXT:    s_nop 1
 ; GFX908-NEXT:    v_accvgpr_read_b32 v3, a27
 ; GFX908-NEXT:    v_accvgpr_read_b32 v2, a26
@@ -347,8 +345,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32_inline_asm_virtual_agpr(ptr
 ; GFX908-NEXT:    v_mov_b32_e32 v1, 2.0
 ; GFX908-NEXT:    s_nop 1
 ; GFX908-NEXT:    v_mfma_f32_32x32x1f32 a[0:31], v0, v1, a[0:31]
-; GFX908-NEXT:    s_nop 7
-; GFX908-NEXT:    s_nop 7
+; GFX908-NEXT:    s_nop 15
 ; GFX908-NEXT:    s_nop 1
 ; GFX908-NEXT:    v_accvgpr_read_b32 v3, a27
 ; GFX908-NEXT:    v_accvgpr_read_b32 v2, a26
@@ -454,8 +451,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32_inline_asm_phys_agpr(ptr add
 ; GFX908-NEXT:    v_mov_b32_e32 v1, 2.0
 ; GFX908-NEXT:    s_nop 1
 ; GFX908-NEXT:    v_mfma_f32_32x32x1f32 a[0:31], v0, v1, a[0:31]
-; GFX908-NEXT:    s_nop 7
-; GFX908-NEXT:    s_nop 7
+; GFX908-NEXT:    s_nop 15
 ; GFX908-NEXT:    s_nop 1
 ; GFX908-NEXT:    v_accvgpr_read_b32 v3, a27
 ; GFX908-NEXT:    v_accvgpr_read_b32 v2, a26
@@ -561,8 +557,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32_inline_asm_no_agprs(ptr addr
 ; GFX908-NEXT:    v_mov_b32_e32 v1, 2.0
 ; GFX908-NEXT:    s_nop 1
 ; GFX908-NEXT:    v_mfma_f32_32x32x1f32 a[0:31], v0, v1, a[0:31]
-; GFX908-NEXT:    s_nop 7
-; GFX908-NEXT:    s_nop 7
+; GFX908-NEXT:    s_nop 15
 ; GFX908-NEXT:    s_nop 1
 ; GFX908-NEXT:    v_accvgpr_read_b32 v3, a27
 ; GFX908-NEXT:    v_accvgpr_read_b32 v2, a26
@@ -690,8 +685,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32_call(ptr addrspace(1) %arg)
 ; GFX908-NEXT:    v_mov_b32_e32 v1, 2.0
 ; GFX908-NEXT:    s_nop 1
 ; GFX908-NEXT:    v_mfma_f32_32x32x1f32 a[0:31], v0, v1, a[0:31]
-; GFX908-NEXT:    s_nop 7
-; GFX908-NEXT:    s_nop 7
+; GFX908-NEXT:    s_nop 15
 ; GFX908-NEXT:    s_nop 1
 ; GFX908-NEXT:    v_accvgpr_read_b32 v3, a27
 ; GFX908-NEXT:    v_accvgpr_read_b32 v2, a26
@@ -835,8 +829,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32_call_multi_bb(ptr addrspace(
 ; GFX908-NEXT:    v_mov_b32_e32 v3, 2.0
 ; GFX908-NEXT:    s_nop 1
 ; GFX908-NEXT:    v_mfma_f32_32x32x1f32 a[0:31], v6, v3, a[0:31] cbsz:1 abid:2 blgp:3
-; GFX908-NEXT:    s_nop 7
-; GFX908-NEXT:    s_nop 7
+; GFX908-NEXT:    s_nop 15
 ; GFX908-NEXT:    s_nop 1
 ; GFX908-NEXT:    v_accvgpr_read_b32 v6, a27
 ; GFX908-NEXT:    v_accvgpr_read_b32 v5, a26
@@ -977,8 +970,7 @@ define void @test_mfma_f32_32x32x1f32_nonentry_noagpr(ptr addrspace(1) %arg) #0
 ; GFX908-NEXT:    v_mov_b32_e32 v3, 2.0
 ; GFX908-NEXT:    s_nop 1
 ; GFX908-NEXT:    v_mfma_f32_32x32x1f32 a[0:31], v2, v3, a[0:31]
-; GFX908-NEXT:    s_nop 7
-; GFX908-NEXT:    s_nop 7
+; GFX908-NEXT:    s_nop 15
 ; GFX908-NEXT:    s_nop 1
 ; GFX908-NEXT:    v_accvgpr_read_b32 v5, a27
 ; GFX908-NEXT:    v_accvgpr_read_b32 v4, a26
@@ -1079,8 +1071,7 @@ define void @test_mfma_f32_32x32x1f32_nonentry_with_agpr(ptr addrspace(1) %arg)
 ; GFX908-NEXT:    v_mov_b32_e32 v3, 2.0
 ; GFX908-NEXT:    s_nop 1
 ; GFX908-NEXT:    v_mfma_f32_32x32x1f32 a[0:31], v2, v3, a[0:31]
-; GFX908-NEXT:    s_nop 7
-; GFX908-NEXT:    s_nop 7
+; GFX908-NEXT:    s_nop 15
 ; GFX908-NEXT:    s_nop 1
 ; GFX908-NEXT:    v_accvgpr_read_b32 v5, a27
 ; GFX908-NEXT:    v_accvgpr_read_b32 v4, a26
diff --git a/llvm/test/CodeGen/AMDGPU/mfma-loop.ll b/llvm/test/CodeGen/AMDGPU/mfma-loop.ll
index d39daaade677f..3b8efafba06f4 100644
--- a/llvm/test/CodeGen/AMDGPU/mfma-loop.ll
+++ b/llvm/test/CodeGen/AMDGPU/mfma-loop.ll
@@ -54,8 +54,7 @@ define amdgpu_kernel void @test_mfma_loop_zeroinit(ptr addrspace(1) %arg) #0 {
 ; GFX908-NEXT:    s_cbranch_scc1 .LBB0_1
 ; GFX908-NEXT:  ; %bb.2: ; %exit
 ; GFX908-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
-; GFX908-NEXT:    s_nop 7
-; GFX908-NEXT:    s_nop 5
+; GFX908-NEXT:    s_nop 13
 ; GFX908-NEXT:    v_accvgpr_read_b32 v0, a0
 ; GFX908-NEXT:    v_accvgpr_read_b32 v28, a28
 ; GFX908-NEXT:    v_accvgpr_read_b32 v29, a29
@@ -148,8 +147,7 @@ define amdgpu_kernel void @test_mfma_loop_zeroinit(ptr addrspace(1) %arg) #0 {
 ; GFX90A-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
 ; GFX90A-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT:    s_nop 7
-; GFX90A-NEXT:    s_nop 4
+; GFX90A-NEXT:    s_nop 12
 ; GFX90A-NEXT:    global_store_dwordx4 v0, a[28:31], s[0:1] offset:112
 ; GFX90A-NEXT:    global_store_dwordx4 v0, a[24:27], s[0:1] offset:96
 ; GFX90A-NEXT:    global_store_dwordx4 v0, a[20:23], s[0:1] offset:80
@@ -208,8 +206,7 @@ define amdgpu_kernel void @test_mfma_loop_zeroinit(ptr addrspace(1) %arg) #0 {
 ; GFX942-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
 ; GFX942-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX942-NEXT:    s_nop 7
-; GFX942-NEXT:    s_nop 3
+; GFX942-NEXT:    s_nop 11
 ; GFX942-NEXT:    global_store_dwordx4 v0, a[28:31], s[0:1] offset:112
 ; GFX942-NEXT:    global_store_dwordx4 v0, a[24:27], s[0:1] offset:96
 ; GFX942-NEXT:    global_store_dwordx4 v0, a[20:23], s[0:1] offset:80
@@ -288,8 +285,7 @@ define amdgpu_kernel void @test_mfma_loop_unfoldable_splat(ptr addrspace(1) %arg
 ; GFX908-NEXT:    s_cbranch_scc1 .LBB1_1
 ; GFX908-NEXT:  ; %bb.2: ; %exit
 ; GFX908-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
-; GFX908-NEXT:    s_nop 7
-; GFX908-NEXT:    s_nop 5
+; GFX908-NEXT:    s_nop 13
 ; GFX908-NEXT:    v_accvgpr_read_b32 v0, a0
 ; GFX908-NEXT:    v_accvgpr_read_b32 v28, a28
 ; GFX908-NEXT:    v_accvgpr_read_b32 v29, a29
@@ -383,8 +379,7 @@ define amdgpu_kernel void @test_mfma_loop_unfoldable_splat(ptr addrspace(1) %arg
 ; GFX90A-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
 ; GFX90A-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT:    s_nop 7
-; GFX90A-NEXT:    s_nop 4
+; GFX90A-NEXT:    s_nop 12
 ; GFX90A-NEXT:    global_store_dwordx4 v0, a[28:31], s[0:1] offset:112
 ; GFX90A-NEXT:    global_store_dwordx4 v0, a[24:27], s[0:1] offset:96
 ; GFX90A-NEXT:    global_store_dwordx4 v0, a[20:23], s[0:1] offset:80
@@ -444,8 +439,7 @@ define amdgpu_kernel void @test_mfma_loop_unfoldable_splat(ptr addrspace(1) %arg
 ; GFX942-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
 ; GFX942-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX942-NEXT:    s_nop 7
-; GFX942-NEXT:    s_nop 3
+; GFX942-NEXT:    s_nop 11
 ; GFX942-NEXT:    global_store_dwordx4 v0, a[28:31], s[0:1] offset:112
 ; GFX942-NEXT:    global_store_dwordx4 v0, a[24:27], s[0:1] offset:96
 ; GFX942-NEXT:    global_store_dwordx4 v0, a[20:23], s[0:1] offset:80
@@ -518,8 +512,7 @@ define amdgpu_kernel void @test_mfma_loop_non_splat(ptr addrspace(1) %arg) #0 {
 ; GFX908-NEXT:    s_cbranch_scc1 .LBB2_1
 ; GFX908-NEXT:  ; %bb.2: ; %exit
 ; GFX908-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
-; GFX908-NEXT:    s_nop 7
-; GFX908-NEXT:    s_nop 5
+; GFX908-NEXT:    s_nop 13
 ; GFX908-NEXT:    v_accvgpr_read_b32 v0, a0
 ; GFX908-NEXT:    v_accvgpr_read_b32 v28, a28
 ; GFX908-NEXT:    v_accvgpr_read_b32 v29, a29
@@ -612,8 +605,7 @@ define amdgpu_kernel void @test_mfma_loop_non_splat(ptr addrspace(1) %arg) #0 {
 ; GFX90A-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
 ; GFX90A-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT:    s_nop 7
-; GFX90A-NEXT:    s_nop 4
+; GFX90A-NEXT:    s_nop 12
 ; GFX90A-NEXT:    global_store_dwordx4 v0, a[28:31], s[0:1] offset:112
 ; GFX90A-NEXT:    global_store_dwordx4 v0, a[24:27], s[0:1] offset:96
 ; GFX90A-NEXT:    global_store_dwordx4 v0, a[20:23], s[0:1] offset:80
@@ -672,8 +664,7 @@ define amdgpu_kernel void @test_mfma_loop_non_splat(ptr addrspace(1) %arg) #0 {
 ; GFX942-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
 ; GFX942-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX942-NEXT:    s_nop 7
-; GFX942-NEXT:    s_nop 3
+; GFX942-NEXT:    s_nop 11
 ; GFX942-NEXT:    global_store_dwordx4 v0, a[28:31], s[0:1] offset:112
 ; GFX942-NEXT:    global_store_dwordx4 v0, a[24:27], s[0:1] offset:96
 ; GFX942-NEXT:    global_store_dwordx4 v0, a[20:23], s[0:1] offset:80
@@ -783,8 +774,7 @@ define amdgpu_kernel void @test_mfma_loop_unfoldable_seq(ptr addrspace(1) %arg)
 ; GFX908-NEXT:    s_cbranch_scc1 .LBB3_1
 ; GFX908-NEXT:  ; %bb.2: ; %exit
 ; GFX908-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
-; GFX908-NEXT:    s_nop 7
-; GFX908-NEXT:    s_nop 5
+; GFX908-NEXT:    s_nop 13
 ; GFX908-NEXT:    v_accvgpr_read_b32 v0, a0
 ; GFX908-NEXT:    v_accvgpr_read_b32 v28, a28
 ; GFX908-NEXT:    v_accvgpr_read_b32 v29, a29
@@ -909,8 +899,7 @@ define amdgpu_kernel void @test_mfma_loop_unfoldable_seq(ptr addrspace(1) %arg)
 ; GFX90A-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
 ; GFX90A-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT:    s_nop 7
-; GFX90A-NEXT:    s_nop 4
+; GFX90A-NEXT:    s_nop 12
 ; GFX90A-NEXT:    global_store_dwordx4 v0, a[28:31], s[0:1] offset:112
 ; GFX90A-NEXT:    global_store_dwordx4 v0, a[24:27], s[0:1] offset:96
 ; GFX90A-NEXT:    global_store_dwordx4 v0, a[20:23], s[0:1] offset:80
@@ -1001,8 +990,7 @@ define amdgpu_kernel void @test_mfma_loop_unfoldable_seq(ptr addrspace(1) %arg)
 ; GFX942-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
 ; GFX942-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX942-NEXT:    s_nop 7
-; GFX942-NEXT:    s_nop 3
+; GFX942-NEXT:    s_nop 11
 ; GFX942-NEXT:    global_store_dwordx4 v0, a[28:31], s[0:1] offset:112
 ; GFX942-NEXT:    global_store_dwordx4 v0, a[24:27], s[0:1] offset:96
 ; GFX942-NEXT:    global_store_dwordx4 v0, a[20:23], s[0:1] offset:80
@@ -1075,8 +1063,7 @@ define amdgpu_kernel void @test_mfma_loop_vgpr_init(ptr addrspace(1) %arg) #0 {
 ; GFX908-NEXT:    s_cbranch_scc1 .LBB4_1
 ; GFX908-NEXT:  ; %bb.2: ; %exit
 ; GFX908-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
-; GFX908-NEXT:    s_nop 7
-; GFX908-NEXT:    s_nop 5
+; GFX908-NEXT:    s_nop 13
 ; GFX908-NEXT:    v_accvgpr_read_b32 v0, a0
 ; GFX908-NEXT:    v_accvgpr_read_b32 v28, a28
 ; GFX908-NEXT:    v_accvgpr_read_b32 v29, a29
@@ -1170,8 +1157,7 @@ define amdgpu_kernel void @test_mfma_loop_vgpr_init(ptr addrspace(1) %arg) #0 {
 ; GFX90A-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
 ; GFX90A-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT:    s_nop 7
-; GFX90A-NEXT:    s_nop 4
+; GFX90A-NEXT:    s_nop 12
 ; GFX90A-NEXT:    global_store_dwordx4 v0, a[28:31], s[0:1] offset:112
 ; GFX90A-NEXT:    global_store_dwordx4 v0, a[24:27], s[0:1] offset:96
 ; GFX90A-NEXT:    global_store_dwordx4 v0, a[20:23], s[0:1] offset:80
@@ -1231,8 +1217,7 @@ define amdgpu_kernel void @test_mfma_loop_vgpr_init(ptr addrspace(1) %arg) #0 {
 ; GFX942-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
 ; GFX942-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX942-NEXT:    s_nop 7
-; GFX942-NEXT:    s_nop 3
+; GFX942-NEXT:    s_nop 11
 ; GFX942-NEXT:    global_store_dwordx4 v0, a[28:31], s[0:1] offset:112
 ; GFX942-NEXT:    global_store_dwordx4 v0, a[24:27], s[0:1] offset:96
 ; GFX942-NEXT:    global_store_dwordx4 v0, a[20:23], s[0:1] offset:80
@@ -1344,8 +1329,7 @@ define amdgpu_kernel void @test_mfma_loop_sgpr_init(ptr addrspace(1) %arg, float
 ; GFX908-NEXT:    s_cbranch_scc1 .LBB5_1
 ; GFX908-NEXT:  ; %bb.2: ; %exit
 ; GFX908-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
-; GFX908-NEXT:    s_nop 7
-; GFX908-NEXT:    s_nop 5
+; GFX908-NEXT:    s_nop 13
 ; GFX908-NEXT:    v_accvgpr_read_b32 v0, a0
 ; GFX908-NEXT:    v_accvgpr_read_b32 v28, a28
 ; GFX908-NEXT:    v_accvgpr_read_b32 v29, a29
@@ -1441,8 +1425,7 @@ define amdgpu_kernel void @test_mfma_loop_sgpr_init(ptr addrspace(1) %arg, float
 ; GFX90A-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
 ; GFX90A-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT:    s_nop 7
-; GFX90A-NEXT:    s_nop 4
+; GFX90A-NEXT:    s_nop 12
 ; GFX90A-NEXT:    global_store_dwordx4 v0, a[28:31], s[0:1] offset:112
 ; GFX90A-NEXT:    global_store_dwordx4 v0, a[24:27], s[0:1] offset:96
 ; GFX90A-NEXT:    global_store_dwordx4 v0, a[20:23], s[0:1] offset:80
@@ -1504,8 +1487,7 @@ define amdgpu_kernel void @test_mfma_loop_sgpr_init(ptr addrspace(1) %arg, float
 ; GFX942-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
 ; GFX942-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX942-NEXT:    s_nop 7
-; GFX942-NEXT:    s_nop 3
+; GFX942-NEXT:    s_nop 11
 ; GFX942-NEXT:    global_store_dwordx4 v0, a[28:31], s[0:1] offset:112
 ; GFX942-NEXT:    global_store_dwordx4 v0, a[24:27], s[0:1] offset:96
 ; GFX942-NEXT:    global_store_dwordx4 v0, a[20:23], s[0:1] offset:80
@@ -1614,8 +1596,7 @@ define amdgpu_kernel void @test_mfma_loop_mixed_init(ptr addrspace(1) %arg, floa
 ; GFX908-NEXT:    s_cbranch_scc1 .LBB6_1
 ; GFX908-NEXT:  ; %bb.2: ; %exit
 ; GFX908-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
-; GFX908-NEXT:    s_nop 7
-; GFX908-NEXT:    s_nop 5
+; GFX908-NEXT:    s_nop 13
 ; GFX908-NEXT:    v_accvgpr_read_b32 v0, a0
 ; GFX908-NEXT:    v_accvgpr_read_b32 v28, a28
 ; GFX908-NEXT:    v_accvgpr_read_b32 v29, a29
@@ -1712,8 +1693,7 @@ define amdgpu_kernel void @test_mfma_loop_mixed_init(ptr addrspace(1) %arg, floa
 ; GFX90A-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
 ; GFX90A-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT:    s_nop 7
-; GFX90A-NEXT:    s_nop 4
+; GFX90A-NEXT:    s_nop 12
 ; GFX90A-NEXT:    global_store_dwordx4 v0, a[28:31], s[0:1] offset:112
 ; GFX90A-NEXT:    global_store_dwordx4 v0, a[24:27], s[0:1] offset:96
 ; GFX90A-NEXT:    global_store_dwordx4 v0, a[20:23], s[0:1] offset:80
@@ -1776,8 +1756,7 @@ define amdgpu_kernel void @test_mfma_loop_mixed_init(ptr addrspace(1) %arg, floa
 ; GFX942-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
 ; GFX942-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX942-NEXT:    s_nop 7
-; GFX942-NEXT:    s_nop 3
+; GFX942-NEXT:    s_nop 11
 ; GFX942-NEXT:    global_store_dwordx4 v0, a[28:31], s[0:1] offset:112
 ; GFX942-NEXT:    global_store_dwordx4 v0, a[24:27], s[0:1] offset:96
 ; GFX942-NEXT:    global_store_dwordx4 v0, a[20:23], s[0:1] offset:80
@@ -1856,8 +1835,7 @@ define amdgpu_kernel void @test_mfma_loop_mfma_forward_init(ptr addrspace(1) %ar
 ; GFX908-NEXT:    s_cbranch_scc1 .LBB7_1
 ; GFX908-NEXT:  ; %bb.2: ; %exit
 ; GFX908-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
-; GFX908-NEXT:    s_nop 7
-; GFX908-NEXT:    s_nop 5
+; GFX908-NEXT:    s_nop 13
 ; GFX908-NEXT:    v_accvgpr_read_b32 v0, a0
 ; GFX908-NEXT:    v_accvgpr_read_b32 v28, a28
 ; GFX908-NEXT:    v_accvgpr_read_b32 v29, a29
@@ -1919,8 +1897,7 @@ define amdgpu_kernel void @test_mfma_loop_mfma_forward_init(ptr addrspace(1) %ar
 ; GFX90A-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
 ; GFX90A-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT:    s_nop 7
-; GFX90A-NEXT:    s_nop 4
+; GFX90A-NEXT:    s_nop 12
 ; GFX90A-NEXT:    global_store_dwordx4 v0, a[28:31], s[0:1] offset:112
 ; GFX90A-NEXT:    global_store_dwordx4 v0, a[24:27], s[0:1] offset:96
 ; GFX90A-NEXT:    global_store_dwordx4 v0, a[20:23], s[0:1] offset:80
@@ -1948,8 +1925,7 @@ define amdgpu_kernel void @test_mfma_loop_mfma_forward_init(ptr addrspace(1) %ar
 ; GFX942-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
 ; GFX942-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX942-NEXT:    s_nop 7
-; GFX942-NEXT:    s_nop 3
+; GFX942-NEXT:    s_nop 11
 ; GFX942-NEXT:    global_store_dwordx4 v0, a[28:31], s[0:1] offset:112
 ; GFX942-NEXT:    global_store_dwordx4 v0, a[24:27], s[0:1] offset:96
 ; GFX942-NEXT:    global_store_dwordx4 v0, a[20:23], s[0:1] offset:80
@@ -2019,8 +1995,7 @@ define amdgpu_kernel void @test_mfma_loop_agpr_init(ptr addrspace(1) %arg) #0 {
 ; GFX908-NEXT:    s_mov_b32 s0, 16
 ; GFX908-NEXT:    s_nop 0
 ; GFX908-NEXT:    v_mfma_f32_32x32x1f32 a[0:31], v0, v1, a[0:31]
-; GFX908-NEXT:    s_nop 7
-; GFX908-NEXT:    s_nop 7
+; GFX908-NEXT:    s_nop 15
 ; GFX908-NEXT:    s_nop 1
 ; GFX908-NEXT:    v_accvgpr_read_b32 v2, a0
 ; GFX908-NEXT:    s_nop 1
@@ -2065,8 +2040,7 @@ define amdgpu_kernel void @test_mfma_loop_agpr_init(ptr addrspace(1) %arg) #0 {
 ; GFX908-NEXT:    s_cbranch_scc1 .LBB8_1
 ; GFX908-NEXT:  ; %bb.2: ; %exit
 ; GFX908-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
-; GFX908-NEXT:    s_nop 7
-; GFX908-NEXT:    s_nop 5
+; GFX908-NEXT:    s_nop 13
 ; GFX908-NEXT:    v_accvgpr_read_b32 v0, a0
 ; GFX908-NEXT:    v_accvgpr_read_b32 v28, a28
 ; GFX908-NEXT:    v_accvgpr_read_b32 v29, a29
@@ -2118,8 +2092,7 @@ define amdgpu_kernel void @test_mfma_loop_agpr_init(ptr addrspace(1) %arg) #0 {
 ; GFX90A-NEXT:    s_mov_b32 s0, 16
 ; GFX90A-NEXT:    s_nop 0
 ; GFX90A-NEXT:    v_mfma_f32_32x32x1f32 a[0:31], v0, v1, 0
-; GFX90A-NEXT:    s_nop 7
-; GFX90A-NEXT:    s_nop 7
+; GFX90A-NEXT:    s_nop 15
 ; GFX90A-NEXT:    s_nop 2
 ; GFX90A-NEXT:    v_accvgpr_mov_b32 a1, a0
 ; GFX90A-NEXT:    v_accvgpr_mov_b32 a2, a0
@@ -2163,8 +2136,7 @@ define amdgpu_kernel void @test_mfma_loop_agpr_init(ptr addrspace(1) %arg) #0 {
 ; GFX90A-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
 ; GFX90A-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT:    s_nop 7
-; GFX90A-NEXT:    s_nop 4
+; GFX90A-NEXT:    s_nop 12
 ; GFX90A-NEXT:    global_store_dwordx4 v0, a[28:31], s[0:1] offset:112
 ; GFX90A-NEXT:    global_store_dwordx4 v0, a[24:27], s[0:1] offset:96
 ; GFX90A-NEXT:    global_store_dwordx4 v0, a[20:23], s[0:1] offset:80
@@ -2182,8 +2154,7 @@ define amdgpu_kernel void @test_mfma_loop_agpr_init(ptr addrspace(1) %arg) #0 {
 ; GFX942-NEXT:    s_mov_b32 s0, 16
 ; GFX942-NEXT:    s_nop 0
 ; GFX942-NEXT:    v_mfma_f32_32x32x1_2b_f32 a[0:31], v0, v1, 0
-; GFX942-NEXT:    s_nop 7
-; GFX942-NEXT:    s_nop 7
+; GFX942-NEXT:    s_nop 15
 ; GFX942-NEXT:    s_nop 1
 ; GFX942-NEXT:    v_accvgpr_mov_b32 a1, a0
 ; GFX942-NEXT:    v_accvgpr_mov_b32 a2, a0
@@ -2227,8 +2198,7 @@ define amdgpu_kernel void @test_mfma_loop_agpr_init(ptr addrspace(1) %arg) #0 {
 ; GFX942-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
 ; GFX942-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX942-NEXT:    s_nop 7
-; GFX942-NEXT:    s_nop 3
+; GFX942-NEXT:    s_nop 11
 ; GFX942-NEXT:    global_store_dwordx4 v0, a[28:31], s[0:1] offset:112
 ; GFX942-NEXT:    global_store_dwordx4 v0, a[24:27], s[0:1] offset:96
 ; GFX942-NEXT:    global_store_dwordx4 v0, a[20:23], s[0:1] offset:80
@@ -2349,8 +2319,7 @@ define amdgpu_kernel void @test_mfma_nested_loop_zeroinit(ptr addrspace(1) %arg)
 ; GFX908-NEXT:    s_cbranch_scc1 .LBB9_1
 ; GFX908-NEXT:  ; %bb.4: ; %exit
 ; GFX908-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
-; GFX908-NEXT:    s_nop 7
-; GFX908-NEXT:    s_nop 2
+; GFX908-NEXT:    s_nop 10
 ; GFX908-NEXT:    v_accvgpr_read_b32 v0, a0
 ; GFX908-NEXT:    v_accvgpr_read_b32 v28, a28
 ; GFX908-NEXT:    v_accvgpr_read_b32 v29, a29
@@ -2453,8 +2422,7 @@ define amdgpu_kernel void @test_mfma_nested_loop_zeroinit(ptr addrspace(1) %arg)
 ; GFX90A-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
 ; GFX90A-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT:    s_nop 7
-; GFX90A-NEXT:    s_nop 1
+; GFX90A-NEXT:    s_nop 9
 ; GFX90A-NEXT:    global_store_dwordx4 v0, a[28:31], s[0:1] offset:112
 ; GFX90A-NEXT:    global_store_dwordx4 v0, a[24:27], s[0:1] offset:96
 ; GFX90A-NEXT:    global_store_dwordx4 v0, a[20:23], s[0:1] offset:80
@@ -2523,8 +2491,7 @@ define amdgpu_kernel void @test_mfma_nested_loop_zeroinit(ptr addrspace(1) %arg)
 ; GFX942-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
 ; GFX942-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX942-NEXT:    s_nop 7
-; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    s_nop 8
 ; GFX942-NEXT:    global_store_dwordx4 v0, a[28:31], s[0:1] offset:112
 ; GFX942-NEXT:    global_store_dwordx4 v0, a[24:27], s[0:1] offset:96
 ; GFX942-NEXT:    global_store_dwordx4 v0, a[20:23], s[0:1] offset:80
diff --git a/llvm/test/CodeGen/AMDGPU/mfma-no-register-aliasing.ll b/llvm/test/CodeGen/AMDGPU/mfma-no-register-aliasing.ll
index e6d7b14381d7a..51cd564bdece3 100644
--- a/llvm/test/CodeGen/AMDGPU/mfma-no-register-aliasing.ll
+++ b/llvm/test/CodeGen/AMDGPU/mfma-no-register-aliasing.ll
@@ -93,8 +93,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32(ptr addrspace(1) %arg) #0 {
 ; GREEDY908-NEXT:    s_nop 0
 ; GREEDY908-NEXT:    v_mfma_f32_32x32x1f32 a[0:31], v3, v0, a[0:31]
 ; GREEDY908-NEXT:    v_mfma_f32_32x32x1f32 a[32:63], v3, v0, a[0:31]
-; GREEDY908-NEXT:    s_nop 7
-; GREEDY908-NEXT:    s_nop 7
+; GREEDY908-NEXT:    s_nop 15
 ; GREEDY908-NEXT:    s_nop 1
 ; GREEDY908-NEXT:    v_accvgpr_read_b32 v1, a32
 ; GREEDY908-NEXT:    v_accvgpr_read_b32 v5, a61
@@ -158,8 +157,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32(ptr addrspace(1) %arg) #0 {
 ; GREEDY908-NEXT:    v_accvgpr_write_b32 a31, v5
 ; GREEDY908-NEXT:    s_nop 0
 ; GREEDY908-NEXT:    v_mfma_f32_32x32x1f32 a[0:31], v3, v0, a[0:31]
-; GREEDY908-NEXT:    s_nop 7
-; GREEDY908-NEXT:    s_nop 7
+; GREEDY908-NEXT:    s_nop 15
 ; GREEDY908-NEXT:    s_nop 1
 ; GREEDY908-NEXT:    v_accvgpr_read_b32 v3, a27
 ; GREEDY908-NEXT:    v_accvgpr_read_b32 v2, a26
@@ -263,8 +261,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32(ptr addrspace(1) %arg) #0 {
 ; GREEDY90A-NEXT:    s_nop 1
 ; GREEDY90A-NEXT:    v_mfma_f32_32x32x1f32 a[0:31], v0, v1, a[0:31]
 ; GREEDY90A-NEXT:    v_mfma_f32_32x32x1f32 a[32:63], v0, v1, a[0:31]
-; GREEDY90A-NEXT:    s_nop 7
-; GREEDY90A-NEXT:    s_nop 7
+; GREEDY90A-NEXT:    s_nop 15
 ; GREEDY90A-NEXT:    s_nop 2
 ; GREEDY90A-NEXT:    v_accvgpr_mov_b32 a2, a32
 ; GREEDY90A-NEXT:    v_accvgpr_mov_b32 a3, a33
@@ -298,8 +295,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32(ptr addrspace(1) %arg) #0 {
 ; GREEDY90A-NEXT:    v_accvgpr_mov_b32 a31, a61
 ; GREEDY90A-NEXT:    s_nop 1
 ; GREEDY90A-NEXT:    v_mfma_f32_32x32x1f32 a[0:31], v0, v1, a[0:31]
-; GREEDY90A-NEXT:    s_nop 7
-; GREEDY90A-NEXT:    s_nop 7
+; GREEDY90A-NEXT:    s_nop 15
 ; GREEDY90A-NEXT:    s_nop 2
 ; GREEDY90A-NEXT:    global_store_dwordx4 v2, a[24:27], s[34:35] offset:96
 ; GREEDY90A-NEXT:    global_store_dwordx4 v2, a[28:31], s[34:35] offset:112
@@ -356,8 +352,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32(ptr addrspace(1) %arg) #0 {
 ; GREEDY942-NEXT:    s_nop 1
 ; GREEDY942-NEXT:    v_mfma_f32_32x32x1_2b_f32 a[0:31], v0, v1, a[0:31]
 ; GREEDY942-NEXT:    v_mfma_f32_32x32x1_2b_f32 a[32:63], v0, v1, a[0:31]
-; GREEDY942-NEXT:    s_nop 7
-; GREEDY942-NEXT:    s_nop 7
+; GREEDY942-NEXT:    s_nop 15
 ; GREEDY942-NEXT:    s_nop 1
 ; GREEDY942-NEXT:    v_accvgpr_mov_b32 a2, a32
 ; GREEDY942-NEXT:    v_accvgpr_mov_b32 a3, a33
@@ -391,8 +386,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32(ptr addrspace(1) %arg) #0 {
 ; GREEDY942-NEXT:    v_accvgpr_mov_b32 a31, a61
 ; GREEDY942-NEXT:    s_nop 1
 ; GREEDY942-NEXT:    v_mfma_f32_32x32x1_2b_f32 a[0:31], v0, v1, a[0:31]
-; GREEDY942-NEXT:    s_nop 7
-; GREEDY942-NEXT:    s_nop 7
+; GREEDY942-NEXT:    s_nop 15
 ; GREEDY942-NEXT:    s_nop 1
 ; GREEDY942-NEXT:    global_store_dwordx4 v2, a[24:27], s[34:35] offset:96
 ; GREEDY942-NEXT:    global_store_dwordx4 v2, a[28:31], s[34:35] offset:112
@@ -448,8 +442,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32(ptr addrspace(1) %arg) #0 {
 ; GREEDY90A-GISEL-NEXT:    s_nop 1
 ; GREEDY90A-GISEL-NEXT:    v_mfma_f32_32x32x1f32 a[0:31], v0, v1, a[0:31]
 ; GREEDY90A-GISEL-NEXT:    v_mfma_f32_32x32x1f32 a[32:63], v0, v1, a[0:31]
-; GREEDY90A-GISEL-NEXT:    s_nop 7
-; GREEDY90A-GISEL-NEXT:    s_nop 7
+; GREEDY90A-GISEL-NEXT:    s_nop 15
 ; GREEDY90A-GISEL-NEXT:    s_nop 2
 ; GREEDY90A-GISEL-NEXT:    v_accvgpr_mov_b32 a2, a32
 ; GREEDY90A-GISEL-NEXT:    v_accvgpr_mov_b32 a3, a33
@@ -484,8 +477,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32(ptr addrspace(1) %arg) #0 {
 ; GREEDY90A-GISEL-NEXT:    s_nop 1
 ; GREEDY90A-GISEL-NEXT:    v_mfma_f32_32x32x1f32 a[0:31], v0, v1, a[0:31]
 ; GREEDY90A-GISEL-NEXT:    v_mov_b32_e32 v0, 0
-; GREEDY90A-GISEL-NEXT:    s_nop 7
-; GREEDY90A-GISEL-NEXT:    s_nop 7
+; GREEDY90A-GISEL-NEXT:    s_nop 15
 ; GREEDY90A-GISEL-NEXT:    s_nop 1
 ; GREEDY90A-GISEL-NEXT:    global_store_dwordx4 v0, a[0:3], s[34:35]
 ; GREEDY90A-GISEL-NEXT:    global_store_dwordx4 v0, a[4:7], s[34:35] offset:16
@@ -542,8 +534,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32(ptr addrspace(1) %arg) #0 {
 ; FAST90A-NEXT:    s_nop 1
 ; FAST90A-NEXT:    v_mfma_f32_32x32x1f32 a[32:63], v1, v2, a[32:63]
 ; FAST90A-NEXT:    v_mfma_f32_32x32x1f32 a[0:31], v1, v2, a[32:63]
-; FAST90A-NEXT:    s_nop 7
-; FAST90A-NEXT:    s_nop 7
+; FAST90A-NEXT:    s_nop 15
 ; FAST90A-NEXT:    s_nop 2
 ; FAST90A-NEXT:    v_accvgpr_read_b32 v3, a29
 ; FAST90A-NEXT:    v_accvgpr_read_b32 v4, a28
@@ -609,8 +600,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32(ptr addrspace(1) %arg) #0 {
 ; FAST90A-NEXT:    v_accvgpr_write_b32 a31, v3
 ; FAST90A-NEXT:    s_nop 1
 ; FAST90A-NEXT:    v_mfma_f32_32x32x1f32 a[0:31], v1, v2, a[0:31]
-; FAST90A-NEXT:    s_nop 7
-; FAST90A-NEXT:    s_nop 7
+; FAST90A-NEXT:    s_nop 15
 ; FAST90A-NEXT:    s_nop 2
 ; FAST90A-NEXT:    global_store_dwordx4 v0, a[24:27], s[0:1] offset:96
 ; FAST90A-NEXT:    global_store_dwordx4 v0, a[28:31], s[0:1] offset:112
@@ -676,8 +666,7 @@ define amdgpu_kernel void @test_mfma_f32_16x16x1f32(ptr addrspace(1) %arg) #0 {
 ; GREEDY908-NEXT:    s_nop 1
 ; GREEDY908-NEXT:    v_mfma_f32_16x16x1f32 a[18:33], v0, v1, a[18:33]
 ; GREEDY908-NEXT:    v_mfma_f32_16x16x1f32 a[2:17], v0, v1, a[18:33]
-; GREEDY908-NEXT:    s_nop 7
-; GREEDY908-NEXT:    s_nop 0
+; GREEDY908-NEXT:    s_nop 8
 ; GREEDY908-NEXT:    v_accvgpr_read_b32 v2, a19
 ; GREEDY908-NEXT:    v_accvgpr_read_b32 v3, a18
 ; GREEDY908-NEXT:    s_nop 0
@@ -685,8 +674,7 @@ define amdgpu_kernel void @test_mfma_f32_16x16x1f32(ptr addrspace(1) %arg) #0 {
 ; GREEDY908-NEXT:    v_accvgpr_write_b32 a0, v3
 ; GREEDY908-NEXT:    s_nop 0
 ; GREEDY908-NEXT:    v_mfma_f32_16x16x1f32 a[0:15], v0, v1, a[0:15]
-; GREEDY908-NEXT:    s_nop 7
-; GREEDY908-NEXT:    s_nop 1
+; GREEDY908-NEXT:    s_nop 9
 ; GREEDY908-NEXT:    v_accvgpr_read_b32 v3, a15
 ; GREEDY908-NEXT:    v_accvgpr_read_b32 v2, a14
 ; GREEDY908-NEXT:    v_accvgpr_read_b32 v1, a13
@@ -744,14 +732,12 @@ define amdgpu_kernel void @test_mfma_f32_16x16x1f32(ptr addrspace(1) %arg) #0 {
 ; GREEDY90A-NEXT:    s_nop 1
 ; GREEDY90A-NEXT:    v_mfma_f32_16x16x1f32 a[18:33], v0, v1, a[18:33]
 ; GREEDY90A-NEXT:    v_mfma_f32_16x16x1f32 a[2:17], v0, v1, a[18:33]
-; GREEDY90A-NEXT:    s_nop 7
-; GREEDY90A-NEXT:    s_nop 1
+; GREEDY90A-NEXT:    s_nop 9
 ; GREEDY90A-NEXT:    v_accvgpr_mov_b32 a0, a18
 ; GREEDY90A-NEXT:    v_accvgpr_mov_b32 a1, a19
 ; GREEDY90A-NEXT:    s_nop 1
 ; GREEDY90A-NEXT:    v_mfma_f32_16x16x1f32 a[0:15], v0, v1, a[0:15]
-; GREEDY90A-NEXT:    s_nop 7
-; GREEDY90A-NEXT:    s_nop 2
+; GREEDY90A-NEXT:    s_nop 10
 ; GREEDY90A-NEXT:    global_store_dwordx4 v2, a[12:15], s[16:17] offset:48
 ; GREEDY90A-NEXT:    global_store_dwordx4 v2, a[8:11], s[16:17] offset:32
 ; GREEDY90A-NEXT:    global_store_dwordx4 v2, a[4:7], s[16:17] offset:16
@@ -786,14 +772,12 @@ define amdgpu_kernel void @test_mfma_f32_16x16x1f32(ptr addrspace(1) %arg) #0 {
 ; GREEDY942-NEXT:    s_nop 1
 ; GREEDY942-NEXT:    v_mfma_f32_16x16x1_4b_f32 a[18:33], v0, v1, a[18:33]
 ; GREEDY942-NEXT:    v_mfma_f32_16x16x1_4b_f32 a[2:17], v0, v1, a[18:33]
-; GREEDY942-NEXT:    s_nop 7
-; GREEDY942-NEXT:    s_nop 0
+; GREEDY942-NEXT:    s_nop 8
 ; GREEDY942-NEXT:    v_accvgpr_mov_b32 a0, a18
 ; GREEDY942-NEXT:    v_accvgpr_mov_b32 a1, a19
 ; GREEDY942-NEXT:    s_nop 1
 ; GREEDY942-NEXT:    v_mfma_f32_16x16x1_4b_f32 a[0:15], v0, v1, a[0:15]
-; GREEDY942-NEXT:    s_nop 7
-; GREEDY942-NEXT:    s_nop 1
+; GREEDY942-NEXT:    s_nop 9
 ; GREEDY942-NEXT:    global_store_dwordx4 v2, a[12:15], s[16:17] offset:48
 ; GREEDY942-NEXT:    global_store_dwordx4 v2, a[8:11], s[16:17] offset:32
 ; GREEDY942-NEXT:    global_store_dwordx4 v2, a[4:7], s[16:17] offset:16
@@ -827,8 +811,7 @@ define amdgpu_kernel void @test_mfma_f32_16x16x1f32(ptr addrspace(1) %arg) #0 {
 ; GREEDY90A-GISEL-NEXT:    s_nop 1
 ; GREEDY90A-GISEL-NEXT:    v_mfma_f32_16x16x1f32 a[0:15], v0, v1, a[0:15]
 ; GREEDY90A-GISEL-NEXT:    v_mfma_f32_16x16x1f32 a[16:31], v0, v1, a[0:15]
-; GREEDY90A-GISEL-NEXT:    s_nop 7
-; GREEDY90A-GISEL-NEXT:    s_nop 2
+; GREEDY90A-GISEL-NEXT:    s_nop 10
 ; GREEDY90A-GISEL-NEXT:    v_accvgpr_mov_b32 a2, a16
 ; GREEDY90A-GISEL-NEXT:    v_accvgpr_mov_b32 a3, a17
 ; GREEDY90A-GISEL-NEXT:    v_accvgpr_mov_b32 a4, a18
@@ -846,8 +829,7 @@ define amdgpu_kernel void @test_mfma_f32_16x16x1f32(ptr addrspace(1) %arg) #0 {
 ; GREEDY90A-GISEL-NEXT:    s_nop 1
 ; GREEDY90A-GISEL-NEXT:    v_mfma_f32_16x16x1f32 a[0:15], v0, v1, a[0:15]
 ; GREEDY90A-GISEL-NEXT:    v_mov_b32_e32 v0, 0
-; GREEDY90A-GISEL-NEXT:    s_nop 7
-; GREEDY90A-GISEL-NEXT:    s_nop 1
+; GREEDY90A-GISEL-NEXT:    s_nop 9
 ; GREEDY90A-GISEL-NEXT:    global_store_dwordx4 v0, a[0:3], s[16:17]
 ; GREEDY90A-GISEL-NEXT:    global_store_dwordx4 v0, a[4:7], s[16:17] offset:16
 ; GREEDY90A-GISEL-NEXT:    global_store_dwordx4 v0, a[8:11], s[16:17] offset:32
@@ -882,8 +864,7 @@ define amdgpu_kernel void @test_mfma_f32_16x16x1f32(ptr addrspace(1) %arg) #0 {
 ; FAST90A-NEXT:    s_nop 1
 ; FAST90A-NEXT:    v_mfma_f32_16x16x1f32 a[0:15], v1, v2, a[0:15]
 ; FAST90A-NEXT:    v_mfma_f32_16x16x1f32 a[16:31], v1, v2, a[0:15]
-; FAST90A-NEXT:    s_nop 7
-; FAST90A-NEXT:    s_nop 2
+; FAST90A-NEXT:    s_nop 10
 ; FAST90A-NEXT:    v_accvgpr_mov_b32 a2, a16
 ; FAST90A-NEXT:    v_accvgpr_mov_b32 a3, a17
 ; FAST90A-NEXT:    v_accvgpr_mov_b32 a4, a18
@@ -900,8 +881,7 @@ define amdgpu_kernel void @test_mfma_f32_16x16x1f32(ptr addrspace(1) %arg) #0 {
 ; FAST90A-NEXT:    v_accvgpr_mov_b32 a15, a29
 ; FAST90A-NEXT:    s_nop 1
 ; FAST90A-NEXT:    v_mfma_f32_16x16x1f32 a[0:15], v1, v2, a[0:15]
-; FAST90A-NEXT:    s_nop 7
-; FAST90A-NEXT:    s_nop 2
+; FAST90A-NEXT:    s_nop 10
 ; FAST90A-NEXT:    global_store_dwordx4 v0, a[12:15], s[0:1] offset:48
 ; FAST90A-NEXT:    global_store_dwordx4 v0, a[8:11], s[0:1] offset:32
 ; FAST90A-NEXT:    global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
diff --git a/llvm/test/CodeGen/AMDGPU/neighboring-mfma-padding.mir b/llvm/test/CodeGen/AMDGPU/neighboring-mfma-padding.mir
index df3dd7292b7f8..4d1a663aace42 100644
--- a/llvm/test/CodeGen/AMDGPU/neighboring-mfma-padding.mir
+++ b/llvm/test/CodeGen/AMDGPU/neighboring-mfma-padding.mir
@@ -372,14 +372,12 @@ body: |
     ;
     ; gfx908-PAD75-LABEL: name: mfma_padding_16_pass
     ; gfx908-PAD75: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X2F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec
-    ; gfx908-PAD75-NEXT: S_NOP 7
-    ; gfx908-PAD75-NEXT: S_NOP 3
+    ; gfx908-PAD75-NEXT: S_NOP 11
     ; gfx908-PAD75-NEXT: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X2F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec
     ;
     ; gfx908-PAD100-LABEL: name: mfma_padding_16_pass
     ; gfx908-PAD100: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X2F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec
-    ; gfx908-PAD100-NEXT: S_NOP 7
-    ; gfx908-PAD100-NEXT: S_NOP 7
+    ; gfx908-PAD100-NEXT: S_NOP 15
     ; gfx908-PAD100-NEXT: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X2F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec
     ;
     ; gfx90a-DEFAULT-LABEL: name: mfma_padding_16_pass
@@ -393,8 +391,7 @@ body: |
     ;
     ; gfx90a-PAD100-LABEL: name: mfma_padding_16_pass
     ; gfx90a-PAD100: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X2F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec
-    ; gfx90a-PAD100-NEXT: S_NOP 7
-    ; gfx90a-PAD100-NEXT: S_NOP 7
+    ; gfx90a-PAD100-NEXT: S_NOP 15
     ; gfx90a-PAD100-NEXT: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X2F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec
     ;
     ; gfx942-DEFAULT-LABEL: name: mfma_padding_16_pass
@@ -408,8 +405,7 @@ body: |
     ;
     ; gfx942-PAD100-LABEL: name: mfma_padding_16_pass
     ; gfx942-PAD100: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X2F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec
-    ; gfx942-PAD100-NEXT: S_NOP 7
-    ; gfx942-PAD100-NEXT: S_NOP 7
+    ; gfx942-PAD100-NEXT: S_NOP 15
     ; gfx942-PAD100-NEXT: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X2F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec
     $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X2F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec
     $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X2F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec
@@ -459,8 +455,7 @@ body: |
     ; gfx908-PAD100-NEXT: $vgpr3 = V_MOV_B32_e32 1, implicit $exec
     ; gfx908-PAD100-NEXT: $vgpr4 = V_MOV_B32_e32 1, implicit $exec
     ; gfx908-PAD100-NEXT: $vgpr5 = V_MOV_B32_e32 1, implicit $exec
-    ; gfx908-PAD100-NEXT: S_NOP 7
-    ; gfx908-PAD100-NEXT: S_NOP 3
+    ; gfx908-PAD100-NEXT: S_NOP 11
     ; gfx908-PAD100-NEXT: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X2F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec
     ;
     ; gfx90a-DEFAULT-LABEL: name: mfma_padding_16_pass_4_intervening_valu
@@ -486,8 +481,7 @@ body: |
     ; gfx90a-PAD100-NEXT: $vgpr3 = V_MOV_B32_e32 1, implicit $exec
     ; gfx90a-PAD100-NEXT: $vgpr4 = V_MOV_B32_e32 1, implicit $exec
     ; gfx90a-PAD100-NEXT: $vgpr5 = V_MOV_B32_e32 1, implicit $exec
-    ; gfx90a-PAD100-NEXT: S_NOP 7
-    ; gfx90a-PAD100-NEXT: S_NOP 3
+    ; gfx90a-PAD100-NEXT: S_NOP 11
     ; gfx90a-PAD100-NEXT: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X2F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec
     ;
     ; gfx942-DEFAULT-LABEL: name: mfma_padding_16_pass_4_intervening_valu
@@ -513,8 +507,7 @@ body: |
     ; gfx942-PAD100-NEXT: $vgpr3 = V_MOV_B32_e32 1, implicit $exec
     ; gfx942-PAD100-NEXT: $vgpr4 = V_MOV_B32_e32 1, implicit $exec
     ; gfx942-PAD100-NEXT: $vgpr5 = V_MOV_B32_e32 1, implicit $exec
-    ; gfx942-PAD100-NEXT: S_NOP 7
-    ; gfx942-PAD100-NEXT: S_NOP 3
+    ; gfx942-PAD100-NEXT: S_NOP 11
     ; gfx942-PAD100-NEXT: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X2F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec
     $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X2F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec
     $vgpr2 = V_MOV_B32_e32 1, implicit $exec
@@ -887,8 +880,7 @@ body: |
   ; gfx908-PAD75-NEXT: {{  $}}
   ; gfx908-PAD75-NEXT: bb.2:
   ; gfx908-PAD75-NEXT:   $vgpr3 = V_MOV_B32_e32 1, implicit $exec
-  ; gfx908-PAD75-NEXT:   S_NOP 7
-  ; gfx908-PAD75-NEXT:   S_NOP 1
+  ; gfx908-PAD75-NEXT:   S_NOP 9
   ; gfx908-PAD75-NEXT:   early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X2F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec
   ;
   ; gfx908-PAD100-LABEL: name: mfma_padding_16_pass_2_preds
@@ -905,8 +897,7 @@ body: |
   ; gfx908-PAD100-NEXT: {{  $}}
   ; gfx908-PAD100-NEXT: bb.2:
   ; gfx908-PAD100-NEXT:   $vgpr3 = V_MOV_B32_e32 1, implicit $exec
-  ; gfx908-PAD100-NEXT:   S_NOP 7
-  ; gfx908-PAD100-NEXT:   S_NOP 5
+  ; gfx908-PAD100-NEXT:   S_NOP 13
   ; gfx908-PAD100-NEXT:   early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X2F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec
   ;
   ; gfx90a-DEFAULT-LABEL: name: mfma_padding_16_pass_2_preds
@@ -956,8 +947,7 @@ body: |
   ; gfx90a-PAD100-NEXT: {{  $}}
   ; gfx90a-PAD100-NEXT: bb.2:
   ; gfx90a-PAD100-NEXT:   $vgpr3 = V_MOV_B32_e32 1, implicit $exec
-  ; gfx90a-PAD100-NEXT:   S_NOP 7
-  ; gfx90a-PAD100-NEXT:   S_NOP 5
+  ; gfx90a-PAD100-NEXT:   S_NOP 13
   ; gfx90a-PAD100-NEXT:   early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X2F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec
   ;
   ; gfx942-DEFAULT-LABEL: name: mfma_padding_16_pass_2_preds
@@ -1007,8 +997,7 @@ body: |
   ; gfx942-PAD100-NEXT: {{  $}}
   ; gfx942-PAD100-NEXT: bb.2:
   ; gfx942-PAD100-NEXT:   $vgpr3 = V_MOV_B32_e32 1, implicit $exec
-  ; gfx942-PAD100-NEXT:   S_NOP 7
-  ; gfx942-PAD100-NEXT:   S_NOP 5
+  ; gfx942-PAD100-NEXT:   S_NOP 13
   ; gfx942-PAD100-NEXT:   early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X2F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec
   bb.0:
     $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X2F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec
diff --git a/llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr-phi.ll b/llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr-phi.ll
index cefcd7e0d2651..fc154604b8700 100644
--- a/llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr-phi.ll
+++ b/llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr-phi.ll
@@ -33,8 +33,7 @@ define amdgpu_kernel void @test_rewrite_mfma_copy_to_agpr_phi(ptr addrspace(1) %
 ; CHECK-NEXT:  .LBB0_2:
 ; CHECK-NEXT:    ; implicit-def: $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31
 ; CHECK-NEXT:  .LBB0_3: ; %if
-; CHECK-NEXT:    s_nop 7
-; CHECK-NEXT:    s_nop 7
+; CHECK-NEXT:    s_nop 15
 ; CHECK-NEXT:    global_load_dwordx4 a[28:31], v32, s[0:1] offset:112
 ; CHECK-NEXT:    global_load_dwordx4 a[24:27], v32, s[0:1] offset:96
 ; CHECK-NEXT:    global_load_dwordx4 a[20:23], v32, s[0:1] offset:80
@@ -98,8 +97,7 @@ define amdgpu_kernel void @test_rewrite_mfma_copy_to_agpr_phi_loop(ptr addrspace
 ; CHECK-NEXT:  .LBB1_1: ; %loop
 ; CHECK-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    s_nop 7
-; CHECK-NEXT:    s_nop 7
+; CHECK-NEXT:    s_nop 15
 ; CHECK-NEXT:    v_mov_b64_e32 v[62:63], v[30:31]
 ; CHECK-NEXT:    v_mov_b64_e32 v[60:61], v[28:29]
 ; CHECK-NEXT:    v_mov_b64_e32 v[58:59], v[26:27]
diff --git a/llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr.ll b/llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr.ll
index 5f42abbeae253..b9e9893ede4e2 100644
--- a/llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr.ll
+++ b/llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr.ll
@@ -60,8 +60,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32_rewrite_vgpr_mfma(ptr addrsp
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
 ; CHECK-NEXT:    v_mfma_f32_32x32x1_2b_f32 v[0:31], v32, v33, v[0:31]
 ; CHECK-NEXT:    v_mfma_f32_32x32x1_2b_f32 v[32:63], a0, a1, v[0:31]
-; CHECK-NEXT:    s_nop 7
-; CHECK-NEXT:    s_nop 7
+; CHECK-NEXT:    s_nop 15
 ; CHECK-NEXT:    s_nop 1
 ; CHECK-NEXT:    v_mov_b32_e32 v2, v32
 ; CHECK-NEXT:    v_mov_b32_e32 v3, v33
@@ -96,8 +95,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32_rewrite_vgpr_mfma(ptr addrsp
 ; CHECK-NEXT:    v_mov_b32_e32 v32, 0
 ; CHECK-NEXT:    s_nop 0
 ; CHECK-NEXT:    v_mfma_f32_32x32x1_2b_f32 v[0:31], a0, a1, v[0:31]
-; CHECK-NEXT:    s_nop 7
-; CHECK-NEXT:    s_nop 7
+; CHECK-NEXT:    s_nop 15
 ; CHECK-NEXT:    s_nop 1
 ; CHECK-NEXT:    global_store_dwordx4 v32, v[24:27], s[0:1] offset:96
 ; CHECK-NEXT:    global_store_dwordx4 v32, v[28:31], s[0:1] offset:112
@@ -143,8 +141,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32_rewrite_vgpr_mfma_noshuffle(
 ; CHECK-NEXT:    v_mfma_f32_32x32x1_2b_f32 v[0:31], v32, v33, v[0:31]
 ; CHECK-NEXT:    v_mfma_f32_32x32x1_2b_f32 v[0:31], v32, v33, v[0:31]
 ; CHECK-NEXT:    v_mov_b32_e32 v32, 0
-; CHECK-NEXT:    s_nop 7
-; CHECK-NEXT:    s_nop 7
+; CHECK-NEXT:    s_nop 15
 ; CHECK-NEXT:    s_nop 0
 ; CHECK-NEXT:    global_store_dwordx4 v32, v[24:27], s[0:1] offset:96
 ; CHECK-NEXT:    global_store_dwordx4 v32, v[28:31], s[0:1] offset:112
@@ -178,8 +175,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32_rewrite_vgpr_mfma_imm0_src2(
 ; CHECK-NEXT:    v_mfma_f32_32x32x1_2b_f32 v[0:31], v32, v33, v[0:31]
 ; CHECK-NEXT:    v_mov_b32_e32 v32, 0
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
-; CHECK-NEXT:    s_nop 7
-; CHECK-NEXT:    s_nop 7
+; CHECK-NEXT:    s_nop 15
 ; CHECK-NEXT:    global_store_dwordx4 v32, v[28:31], s[0:1] offset:112
 ; CHECK-NEXT:    global_store_dwordx4 v32, v[24:27], s[0:1] offset:96
 ; CHECK-NEXT:    global_store_dwordx4 v32, v[20:23], s[0:1] offset:80
@@ -212,8 +208,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32_rewrite_vgpr_mfma_imm1_src2(
 ; CHECK-NEXT:    v_mfma_f32_32x32x1_2b_f32 v[0:31], v32, v33, v[0:31]
 ; CHECK-NEXT:    v_mov_b32_e32 v32, 0
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
-; CHECK-NEXT:    s_nop 7
-; CHECK-NEXT:    s_nop 7
+; CHECK-NEXT:    s_nop 15
 ; CHECK-NEXT:    global_store_dwordx4 v32, v[28:31], s[0:1] offset:112
 ; CHECK-NEXT:    global_store_dwordx4 v32, v[24:27], s[0:1] offset:96
 ; CHECK-NEXT:    global_store_dwordx4 v32, v[20:23], s[0:1] offset:80
@@ -351,8 +346,7 @@ define void @test_rewrite_mfma_subreg_extract2(float %arg0, float %arg1, ptr add
 ; CHECK-NEXT:    global_load_dwordx4 a[0:3], v[2:3], off
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
 ; CHECK-NEXT:    v_mfma_f32_32x32x1_2b_f32 a[0:31], v0, v1, a[0:31]
-; CHECK-NEXT:    s_nop 7
-; CHECK-NEXT:    s_nop 7
+; CHECK-NEXT:    s_nop 15
 ; CHECK-NEXT:    s_nop 1
 ; CHECK-NEXT:    v_accvgpr_mov_b32 a0, a1
 ; CHECK-NEXT:    v_accvgpr_mov_b32 a1, a2
@@ -717,8 +711,7 @@ define amdgpu_kernel void @test_rewrite_mfma_direct_copy_from_agpr_class_chain(p
 ; CHECK-NEXT:    s_nop 0
 ; CHECK-NEXT:    v_mfma_f32_32x32x1_2b_f32 a[0:31], v1, v34, a[0:31]
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
-; CHECK-NEXT:    s_nop 7
-; CHECK-NEXT:    s_nop 7
+; CHECK-NEXT:    s_nop 15
 ; CHECK-NEXT:    s_nop 0
 ; CHECK-NEXT:    global_store_dwordx4 v0, a[28:31], s[0:1] offset:112
 ; CHECK-NEXT:    global_store_dwordx4 v0, a[24:27], s[0:1] offset:96
@@ -777,8 +770,7 @@ define void @test_rewrite_mfma_copy_from_agpr_class_f64_4x4x4f64_chain(double %a
 ; CHECK-NEXT:    v_mov_b32_e32 v3, 0
 ; CHECK-NEXT:    v_lshl_add_u64 v[2:3], v[8:9], 0, v[2:3]
 ; CHECK-NEXT:    v_mfma_f64_4x4x4_4b_f64 a[0:1], v[4:5], v[6:7], a[0:1]
-; CHECK-NEXT:    s_nop 7
-; CHECK-NEXT:    s_nop 0
+; CHECK-NEXT:    s_nop 8
 ; CHECK-NEXT:    global_store_dwordx2 v[2:3], a[0:1], off
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
diff --git a/llvm/test/CodeGen/AMDGPU/spill-agpr.ll b/llvm/test/CodeGen/AMDGPU/spill-agpr.ll
index 2040e2b26cb15..290d9c5401154 100644
--- a/llvm/test/CodeGen/AMDGPU/spill-agpr.ll
+++ b/llvm/test/CodeGen/AMDGPU/spill-agpr.ll
@@ -258,8 +258,7 @@ define amdgpu_kernel void @max_32regs_mfma32(ptr addrspace(1) %arg) #3 {
 ; GFX908-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
 ; GFX908-NEXT:    v_mfma_f32_32x32x1f32 a[0:31], v4, v4, a[0:31]
 ; GFX908-NEXT:    v_mov_b32_e32 v0, 0
-; GFX908-NEXT:    s_nop 7
-; GFX908-NEXT:    s_nop 5
+; GFX908-NEXT:    s_nop 13
 ; GFX908-NEXT:    v_accvgpr_write_b32 a1, v5
 ; GFX908-NEXT:    ;;#ASMSTART
 ; GFX908-NEXT:    ;;#ASMEND
@@ -339,8 +338,7 @@ define amdgpu_kernel void @max_32regs_mfma32(ptr addrspace(1) %arg) #3 {
 ; GFX90A-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
 ; GFX90A-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX90A-NEXT:    v_mfma_f32_32x32x1f32 a[0:31], v2, v2, a[0:31]
-; GFX90A-NEXT:    s_nop 7
-; GFX90A-NEXT:    s_nop 7
+; GFX90A-NEXT:    s_nop 15
 ; GFX90A-NEXT:    s_nop 2
 ; GFX90A-NEXT:    v_accvgpr_write_b32 a1, v3
 ; GFX90A-NEXT:    ;;#ASMSTART
diff --git a/llvm/test/CodeGen/AMDGPU/vni8-across-blocks.ll b/llvm/test/CodeGen/AMDGPU/vni8-across-blocks.ll
index d8264b5a091e1..b045c761436de 100644
--- a/llvm/test/CodeGen/AMDGPU/vni8-across-blocks.ll
+++ b/llvm/test/CodeGen/AMDGPU/vni8-across-blocks.ll
@@ -958,8 +958,7 @@ define amdgpu_kernel void @v8i8_mfma_half(ptr addrspace(1) %src1, ptr addrspace(
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_nop 0
 ; GFX942-NEXT:    v_mfma_f32_32x32x4_2b_f16 a[0:31], v[2:3], v[2:3], a[0:31] cbsz:1 abid:2 blgp:3
-; GFX942-NEXT:    s_nop 7
-; GFX942-NEXT:    s_nop 7
+; GFX942-NEXT:    s_nop 15
 ; GFX942-NEXT:    s_nop 2
 ; GFX942-NEXT:    global_store_dwordx4 v0, a[28:31], s[40:41] offset:112
 ; GFX942-NEXT:    global_store_dwordx4 v0, a[24:27], s[40:41] offset:96



More information about the llvm-commits mailing list