[llvm] 0b6db77 - [AMDGPU] Handle AV classes in SIFixSGPRCopies::processPHINode (#169038)

via llvm-commits llvm-commits at lists.llvm.org
Fri Nov 21 07:18:00 PST 2025


Author: Jay Foad
Date: 2025-11-21T15:17:55Z
New Revision: 0b6db777ba9821bc17b969ddf6fefee54519c4f4

URL: https://github.com/llvm/llvm-project/commit/0b6db777ba9821bc17b969ddf6fefee54519c4f4
DIFF: https://github.com/llvm/llvm-project/commit/0b6db777ba9821bc17b969ddf6fefee54519c4f4.diff

LOG: [AMDGPU] Handle AV classes in SIFixSGPRCopies::processPHINode (#169038)

Fix a problem exposed by #166483, which uses AV classes in more places.
`isVectorRegister` only accepts registers of VGPR or AGPR classes, whereas
`hasVectorRegisters` additionally accepts the combined AV classes, so the
PHI legalization check in `processPHINode` now uses the latter.

Fixes: #168761

Added: 
    llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-phi-regression-av-classes.ll

Modified: 
    llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp
    llvm/test/CodeGen/AMDGPU/a-v-flat-atomicrmw.ll
    llvm/test/CodeGen/AMDGPU/av-split-dead-valno-crash.ll
    llvm/test/CodeGen/AMDGPU/branch-folding-implicit-def-subreg.ll
    llvm/test/CodeGen/AMDGPU/masked-load-vectortypes.ll
    llvm/test/CodeGen/AMDGPU/mfma-loop.ll
    llvm/test/CodeGen/AMDGPU/no-fold-accvgpr-mov.ll
    llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll
    llvm/test/CodeGen/AMDGPU/tuple-allocation-failure.ll

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp b/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp
index e1647b76702c4..39a6a7762eea5 100644
--- a/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp
@@ -856,8 +856,8 @@ void SIFixSGPRCopies::processPHINode(MachineInstr &MI) {
     }
   }
 
-  if (TRI->isVectorRegister(*MRI, PHIRes) ||
-       RC0 == &AMDGPU::VReg_1RegClass) {
+  if (TRI->hasVectorRegisters(MRI->getRegClass(PHIRes)) ||
+      RC0 == &AMDGPU::VReg_1RegClass) {
     LLVM_DEBUG(dbgs() << "Legalizing PHI: " << MI);
     TII->legalizeOperands(MI, MDT);
   }

diff  --git a/llvm/test/CodeGen/AMDGPU/a-v-flat-atomicrmw.ll b/llvm/test/CodeGen/AMDGPU/a-v-flat-atomicrmw.ll
index 5cceb918b755e..b8962fa29e8f1 100644
--- a/llvm/test/CodeGen/AMDGPU/a-v-flat-atomicrmw.ll
+++ b/llvm/test/CodeGen/AMDGPU/a-v-flat-atomicrmw.ll
@@ -926,12 +926,12 @@ define void @flat_atomic_xchg_i64_ret_av_av(ptr %ptr) #0 {
 ; GFX90A-LABEL: flat_atomic_xchg_i64_ret_av_av:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT:    v_add_co_u32_e32 v4, vcc, 0x50, v0
+; GFX90A-NEXT:    v_add_co_u32_e32 v2, vcc, 0x50, v0
 ; GFX90A-NEXT:    s_mov_b64 s[4:5], src_private_base
-; GFX90A-NEXT:    v_addc_co_u32_e32 v5, vcc, 0, v1, vcc
-; GFX90A-NEXT:    v_cmp_ne_u32_e32 vcc, s5, v5
+; GFX90A-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v1, vcc
+; GFX90A-NEXT:    v_cmp_ne_u32_e32 vcc, s5, v3
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def v[2:3]
+; GFX90A-NEXT:    ; def v[4:5]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    ; implicit-def: $vgpr0_vgpr1
 ; GFX90A-NEXT:    s_and_saveexec_b64 s[4:5], vcc
@@ -939,23 +939,23 @@ define void @flat_atomic_xchg_i64_ret_av_av(ptr %ptr) #0 {
 ; GFX90A-NEXT:    s_cbranch_execz .LBB14_2
 ; GFX90A-NEXT:  ; %bb.1: ; %atomicrmw.global
 ; GFX90A-NEXT:    buffer_wbl2
-; GFX90A-NEXT:    flat_atomic_swap_x2 v[0:1], v[4:5], v[2:3] glc
+; GFX90A-NEXT:    flat_atomic_swap_x2 v[0:1], v[2:3], v[4:5] glc
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    buffer_invl2
 ; GFX90A-NEXT:    buffer_wbinvl1_vol
-; GFX90A-NEXT:    ; implicit-def: $vgpr4_vgpr5
 ; GFX90A-NEXT:    ; implicit-def: $vgpr2_vgpr3
+; GFX90A-NEXT:    ; implicit-def: $vgpr4_vgpr5
 ; GFX90A-NEXT:  .LBB14_2: ; %Flow
 ; GFX90A-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
 ; GFX90A-NEXT:    s_cbranch_execz .LBB14_4
 ; GFX90A-NEXT:  ; %bb.3: ; %atomicrmw.private
-; GFX90A-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[4:5]
-; GFX90A-NEXT:    v_cndmask_b32_e32 v4, -1, v4, vcc
-; GFX90A-NEXT:    buffer_load_dword v0, v4, s[0:3], 0 offen
-; GFX90A-NEXT:    buffer_load_dword v1, v4, s[0:3], 0 offen offset:4
+; GFX90A-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[2:3]
+; GFX90A-NEXT:    v_cndmask_b32_e32 v2, -1, v2, vcc
+; GFX90A-NEXT:    buffer_load_dword v0, v2, s[0:3], 0 offen
+; GFX90A-NEXT:    buffer_load_dword v1, v2, s[0:3], 0 offen offset:4
 ; GFX90A-NEXT:    s_nop 0
-; GFX90A-NEXT:    buffer_store_dword v2, v4, s[0:3], 0 offen
-; GFX90A-NEXT:    buffer_store_dword v3, v4, s[0:3], 0 offen offset:4
+; GFX90A-NEXT:    buffer_store_dword v4, v2, s[0:3], 0 offen
+; GFX90A-NEXT:    buffer_store_dword v5, v2, s[0:3], 0 offen offset:4
 ; GFX90A-NEXT:  .LBB14_4: ; %atomicrmw.phi
 ; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX90A-NEXT:    s_waitcnt vmcnt(2)
@@ -1016,12 +1016,12 @@ define void @flat_atomic_xchg_i64_ret_av_v(ptr %ptr) #0 {
 ; GFX90A-LABEL: flat_atomic_xchg_i64_ret_av_v:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT:    v_add_co_u32_e32 v4, vcc, 0x50, v0
+; GFX90A-NEXT:    v_add_co_u32_e32 v2, vcc, 0x50, v0
 ; GFX90A-NEXT:    s_mov_b64 s[4:5], src_private_base
-; GFX90A-NEXT:    v_addc_co_u32_e32 v5, vcc, 0, v1, vcc
-; GFX90A-NEXT:    v_cmp_ne_u32_e32 vcc, s5, v5
+; GFX90A-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v1, vcc
+; GFX90A-NEXT:    v_cmp_ne_u32_e32 vcc, s5, v3
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def v[2:3]
+; GFX90A-NEXT:    ; def v[4:5]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    ; implicit-def: $vgpr0_vgpr1
 ; GFX90A-NEXT:    s_and_saveexec_b64 s[4:5], vcc
@@ -1029,23 +1029,23 @@ define void @flat_atomic_xchg_i64_ret_av_v(ptr %ptr) #0 {
 ; GFX90A-NEXT:    s_cbranch_execz .LBB15_2
 ; GFX90A-NEXT:  ; %bb.1: ; %atomicrmw.global
 ; GFX90A-NEXT:    buffer_wbl2
-; GFX90A-NEXT:    flat_atomic_swap_x2 v[0:1], v[4:5], v[2:3] glc
+; GFX90A-NEXT:    flat_atomic_swap_x2 v[0:1], v[2:3], v[4:5] glc
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    buffer_invl2
 ; GFX90A-NEXT:    buffer_wbinvl1_vol
-; GFX90A-NEXT:    ; implicit-def: $vgpr4_vgpr5
 ; GFX90A-NEXT:    ; implicit-def: $vgpr2_vgpr3
+; GFX90A-NEXT:    ; implicit-def: $vgpr4_vgpr5
 ; GFX90A-NEXT:  .LBB15_2: ; %Flow
 ; GFX90A-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
 ; GFX90A-NEXT:    s_cbranch_execz .LBB15_4
 ; GFX90A-NEXT:  ; %bb.3: ; %atomicrmw.private
-; GFX90A-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[4:5]
-; GFX90A-NEXT:    v_cndmask_b32_e32 v4, -1, v4, vcc
-; GFX90A-NEXT:    buffer_load_dword v0, v4, s[0:3], 0 offen
-; GFX90A-NEXT:    buffer_load_dword v1, v4, s[0:3], 0 offen offset:4
+; GFX90A-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[2:3]
+; GFX90A-NEXT:    v_cndmask_b32_e32 v2, -1, v2, vcc
+; GFX90A-NEXT:    buffer_load_dword v0, v2, s[0:3], 0 offen
+; GFX90A-NEXT:    buffer_load_dword v1, v2, s[0:3], 0 offen offset:4
 ; GFX90A-NEXT:    s_nop 0
-; GFX90A-NEXT:    buffer_store_dword v2, v4, s[0:3], 0 offen
-; GFX90A-NEXT:    buffer_store_dword v3, v4, s[0:3], 0 offen offset:4
+; GFX90A-NEXT:    buffer_store_dword v4, v2, s[0:3], 0 offen
+; GFX90A-NEXT:    buffer_store_dword v5, v2, s[0:3], 0 offen offset:4
 ; GFX90A-NEXT:  .LBB15_4: ; %atomicrmw.phi
 ; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX90A-NEXT:    s_waitcnt vmcnt(2)
@@ -1294,12 +1294,12 @@ define void @flat_atomic_xchg_i64_ret_v_av(ptr %ptr) #0 {
 ; GFX90A-LABEL: flat_atomic_xchg_i64_ret_v_av:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT:    v_add_co_u32_e32 v4, vcc, 0x50, v0
+; GFX90A-NEXT:    v_add_co_u32_e32 v2, vcc, 0x50, v0
 ; GFX90A-NEXT:    s_mov_b64 s[4:5], src_private_base
-; GFX90A-NEXT:    v_addc_co_u32_e32 v5, vcc, 0, v1, vcc
-; GFX90A-NEXT:    v_cmp_ne_u32_e32 vcc, s5, v5
+; GFX90A-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v1, vcc
+; GFX90A-NEXT:    v_cmp_ne_u32_e32 vcc, s5, v3
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def v[2:3]
+; GFX90A-NEXT:    ; def v[4:5]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    ; implicit-def: $vgpr0_vgpr1
 ; GFX90A-NEXT:    s_and_saveexec_b64 s[4:5], vcc
@@ -1307,23 +1307,23 @@ define void @flat_atomic_xchg_i64_ret_v_av(ptr %ptr) #0 {
 ; GFX90A-NEXT:    s_cbranch_execz .LBB18_2
 ; GFX90A-NEXT:  ; %bb.1: ; %atomicrmw.global
 ; GFX90A-NEXT:    buffer_wbl2
-; GFX90A-NEXT:    flat_atomic_swap_x2 v[0:1], v[4:5], v[2:3] glc
+; GFX90A-NEXT:    flat_atomic_swap_x2 v[0:1], v[2:3], v[4:5] glc
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    buffer_invl2
 ; GFX90A-NEXT:    buffer_wbinvl1_vol
-; GFX90A-NEXT:    ; implicit-def: $vgpr4_vgpr5
 ; GFX90A-NEXT:    ; implicit-def: $vgpr2_vgpr3
+; GFX90A-NEXT:    ; implicit-def: $vgpr4_vgpr5
 ; GFX90A-NEXT:  .LBB18_2: ; %Flow
 ; GFX90A-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
 ; GFX90A-NEXT:    s_cbranch_execz .LBB18_4
 ; GFX90A-NEXT:  ; %bb.3: ; %atomicrmw.private
-; GFX90A-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[4:5]
-; GFX90A-NEXT:    v_cndmask_b32_e32 v4, -1, v4, vcc
-; GFX90A-NEXT:    buffer_load_dword v0, v4, s[0:3], 0 offen
-; GFX90A-NEXT:    buffer_load_dword v1, v4, s[0:3], 0 offen offset:4
+; GFX90A-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[2:3]
+; GFX90A-NEXT:    v_cndmask_b32_e32 v2, -1, v2, vcc
+; GFX90A-NEXT:    buffer_load_dword v0, v2, s[0:3], 0 offen
+; GFX90A-NEXT:    buffer_load_dword v1, v2, s[0:3], 0 offen offset:4
 ; GFX90A-NEXT:    s_nop 0
-; GFX90A-NEXT:    buffer_store_dword v2, v4, s[0:3], 0 offen
-; GFX90A-NEXT:    buffer_store_dword v3, v4, s[0:3], 0 offen offset:4
+; GFX90A-NEXT:    buffer_store_dword v4, v2, s[0:3], 0 offen
+; GFX90A-NEXT:    buffer_store_dword v5, v2, s[0:3], 0 offen offset:4
 ; GFX90A-NEXT:  .LBB18_4: ; %atomicrmw.phi
 ; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX90A-NEXT:    s_waitcnt vmcnt(2)
@@ -6406,35 +6406,35 @@ define void @flat_atomic_add_i64_ret_av_av(ptr %ptr) #0 {
 ; GFX90A-LABEL: flat_atomic_add_i64_ret_av_av:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT:    v_add_co_u32_e32 v4, vcc, 0x50, v0
+; GFX90A-NEXT:    v_add_co_u32_e32 v2, vcc, 0x50, v0
 ; GFX90A-NEXT:    s_mov_b64 s[4:5], src_private_base
-; GFX90A-NEXT:    v_addc_co_u32_e32 v5, vcc, 0, v1, vcc
-; GFX90A-NEXT:    v_cmp_ne_u32_e32 vcc, s5, v5
+; GFX90A-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v1, vcc
+; GFX90A-NEXT:    v_cmp_ne_u32_e32 vcc, s5, v3
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def v[2:3]
+; GFX90A-NEXT:    ; def v[4:5]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    ; implicit-def: $vgpr0_vgpr1
 ; GFX90A-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; GFX90A-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
 ; GFX90A-NEXT:    s_cbranch_execz .LBB90_2
 ; GFX90A-NEXT:  ; %bb.1: ; %atomicrmw.global
-; GFX90A-NEXT:    flat_atomic_add_x2 v[0:1], v[4:5], v[2:3] glc
+; GFX90A-NEXT:    flat_atomic_add_x2 v[0:1], v[2:3], v[4:5] glc
 ; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT:    ; implicit-def: $vgpr4_vgpr5
 ; GFX90A-NEXT:    ; implicit-def: $vgpr2_vgpr3
+; GFX90A-NEXT:    ; implicit-def: $vgpr4_vgpr5
 ; GFX90A-NEXT:  .LBB90_2: ; %Flow
 ; GFX90A-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
 ; GFX90A-NEXT:    s_cbranch_execz .LBB90_4
 ; GFX90A-NEXT:  ; %bb.3: ; %atomicrmw.private
-; GFX90A-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[4:5]
-; GFX90A-NEXT:    v_cndmask_b32_e32 v4, -1, v4, vcc
-; GFX90A-NEXT:    buffer_load_dword v0, v4, s[0:3], 0 offen
-; GFX90A-NEXT:    buffer_load_dword v1, v4, s[0:3], 0 offen offset:4
+; GFX90A-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[2:3]
+; GFX90A-NEXT:    v_cndmask_b32_e32 v2, -1, v2, vcc
+; GFX90A-NEXT:    buffer_load_dword v0, v2, s[0:3], 0 offen
+; GFX90A-NEXT:    buffer_load_dword v1, v2, s[0:3], 0 offen offset:4
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-NEXT:    v_add_co_u32_e32 v2, vcc, v0, v2
-; GFX90A-NEXT:    v_addc_co_u32_e32 v3, vcc, v1, v3, vcc
-; GFX90A-NEXT:    buffer_store_dword v2, v4, s[0:3], 0 offen
-; GFX90A-NEXT:    buffer_store_dword v3, v4, s[0:3], 0 offen offset:4
+; GFX90A-NEXT:    v_add_co_u32_e32 v3, vcc, v0, v4
+; GFX90A-NEXT:    v_addc_co_u32_e32 v4, vcc, v1, v5, vcc
+; GFX90A-NEXT:    buffer_store_dword v3, v2, s[0:3], 0 offen
+; GFX90A-NEXT:    buffer_store_dword v4, v2, s[0:3], 0 offen offset:4
 ; GFX90A-NEXT:  .LBB90_4: ; %atomicrmw.phi
 ; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
@@ -6591,35 +6591,35 @@ define void @flat_atomic_sub_i64_ret_av_av(ptr %ptr) #0 {
 ; GFX90A-LABEL: flat_atomic_sub_i64_ret_av_av:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT:    v_add_co_u32_e32 v4, vcc, 0x50, v0
+; GFX90A-NEXT:    v_add_co_u32_e32 v2, vcc, 0x50, v0
 ; GFX90A-NEXT:    s_mov_b64 s[4:5], src_private_base
-; GFX90A-NEXT:    v_addc_co_u32_e32 v5, vcc, 0, v1, vcc
-; GFX90A-NEXT:    v_cmp_ne_u32_e32 vcc, s5, v5
+; GFX90A-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v1, vcc
+; GFX90A-NEXT:    v_cmp_ne_u32_e32 vcc, s5, v3
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def v[2:3]
+; GFX90A-NEXT:    ; def v[4:5]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    ; implicit-def: $vgpr0_vgpr1
 ; GFX90A-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; GFX90A-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
 ; GFX90A-NEXT:    s_cbranch_execz .LBB92_2
 ; GFX90A-NEXT:  ; %bb.1: ; %atomicrmw.global
-; GFX90A-NEXT:    flat_atomic_sub_x2 v[0:1], v[4:5], v[2:3] glc
+; GFX90A-NEXT:    flat_atomic_sub_x2 v[0:1], v[2:3], v[4:5] glc
 ; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT:    ; implicit-def: $vgpr4_vgpr5
 ; GFX90A-NEXT:    ; implicit-def: $vgpr2_vgpr3
+; GFX90A-NEXT:    ; implicit-def: $vgpr4_vgpr5
 ; GFX90A-NEXT:  .LBB92_2: ; %Flow
 ; GFX90A-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
 ; GFX90A-NEXT:    s_cbranch_execz .LBB92_4
 ; GFX90A-NEXT:  ; %bb.3: ; %atomicrmw.private
-; GFX90A-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[4:5]
-; GFX90A-NEXT:    v_cndmask_b32_e32 v4, -1, v4, vcc
-; GFX90A-NEXT:    buffer_load_dword v0, v4, s[0:3], 0 offen
-; GFX90A-NEXT:    buffer_load_dword v1, v4, s[0:3], 0 offen offset:4
+; GFX90A-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[2:3]
+; GFX90A-NEXT:    v_cndmask_b32_e32 v2, -1, v2, vcc
+; GFX90A-NEXT:    buffer_load_dword v0, v2, s[0:3], 0 offen
+; GFX90A-NEXT:    buffer_load_dword v1, v2, s[0:3], 0 offen offset:4
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-NEXT:    v_sub_co_u32_e32 v2, vcc, v0, v2
-; GFX90A-NEXT:    v_subb_co_u32_e32 v3, vcc, v1, v3, vcc
-; GFX90A-NEXT:    buffer_store_dword v2, v4, s[0:3], 0 offen
-; GFX90A-NEXT:    buffer_store_dword v3, v4, s[0:3], 0 offen offset:4
+; GFX90A-NEXT:    v_sub_co_u32_e32 v3, vcc, v0, v4
+; GFX90A-NEXT:    v_subb_co_u32_e32 v4, vcc, v1, v5, vcc
+; GFX90A-NEXT:    buffer_store_dword v3, v2, s[0:3], 0 offen
+; GFX90A-NEXT:    buffer_store_dword v4, v2, s[0:3], 0 offen offset:4
 ; GFX90A-NEXT:  .LBB92_4: ; %atomicrmw.phi
 ; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
@@ -8881,28 +8881,28 @@ define void @flat_atomic_usub_sat_i64_ret_av_av(ptr %ptr) #0 {
 ; GFX90A-LABEL: flat_atomic_usub_sat_i64_ret_av_av:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT:    v_add_co_u32_e32 v6, vcc, 0x50, v0
+; GFX90A-NEXT:    v_add_co_u32_e32 v4, vcc, 0x50, v0
 ; GFX90A-NEXT:    s_mov_b64 s[4:5], src_private_base
-; GFX90A-NEXT:    v_addc_co_u32_e32 v7, vcc, 0, v1, vcc
-; GFX90A-NEXT:    v_cmp_ne_u32_e32 vcc, s5, v7
+; GFX90A-NEXT:    v_addc_co_u32_e32 v5, vcc, 0, v1, vcc
+; GFX90A-NEXT:    v_cmp_ne_u32_e32 vcc, s5, v5
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def v[4:5]
+; GFX90A-NEXT:    ; def v[6:7]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    ; implicit-def: $vgpr0_vgpr1
 ; GFX90A-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; GFX90A-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
 ; GFX90A-NEXT:    s_cbranch_execz .LBB114_4
 ; GFX90A-NEXT:  ; %bb.1: ; %atomicrmw.global
-; GFX90A-NEXT:    flat_load_dwordx2 v[2:3], v[6:7]
+; GFX90A-NEXT:    flat_load_dwordx2 v[2:3], v[4:5]
 ; GFX90A-NEXT:    s_mov_b64 s[6:7], 0
 ; GFX90A-NEXT:  .LBB114_2: ; %atomicrmw.start
 ; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-NEXT:    v_sub_co_u32_e32 v0, vcc, v2, v4
-; GFX90A-NEXT:    v_subb_co_u32_e32 v1, vcc, v3, v5, vcc
+; GFX90A-NEXT:    v_sub_co_u32_e32 v0, vcc, v2, v6
+; GFX90A-NEXT:    v_subb_co_u32_e32 v1, vcc, v3, v7, vcc
 ; GFX90A-NEXT:    v_cndmask_b32_e64 v0, v0, 0, vcc
 ; GFX90A-NEXT:    v_cndmask_b32_e64 v1, v1, 0, vcc
-; GFX90A-NEXT:    flat_atomic_cmpswap_x2 v[0:1], v[6:7], v[0:3] glc
+; GFX90A-NEXT:    flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
 ; GFX90A-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
@@ -8911,20 +8911,20 @@ define void @flat_atomic_usub_sat_i64_ret_av_av(ptr %ptr) #0 {
 ; GFX90A-NEXT:    s_cbranch_execnz .LBB114_2
 ; GFX90A-NEXT:  ; %bb.3: ; %Flow
 ; GFX90A-NEXT:    s_or_b64 exec, exec, s[6:7]
-; GFX90A-NEXT:    ; implicit-def: $vgpr6_vgpr7
 ; GFX90A-NEXT:    ; implicit-def: $vgpr4_vgpr5
+; GFX90A-NEXT:    ; implicit-def: $vgpr6_vgpr7
 ; GFX90A-NEXT:  .LBB114_4: ; %Flow3
 ; GFX90A-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
 ; GFX90A-NEXT:    s_cbranch_execz .LBB114_6
 ; GFX90A-NEXT:  ; %bb.5: ; %atomicrmw.private
-; GFX90A-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[6:7]
-; GFX90A-NEXT:    v_cndmask_b32_e32 v2, -1, v6, vcc
+; GFX90A-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[4:5]
+; GFX90A-NEXT:    v_cndmask_b32_e32 v2, -1, v4, vcc
 ; GFX90A-NEXT:    buffer_load_dword v0, v2, s[0:3], 0 offen
 ; GFX90A-NEXT:    buffer_load_dword v1, v2, s[0:3], 0 offen offset:4
 ; GFX90A-NEXT:    s_waitcnt vmcnt(1)
-; GFX90A-NEXT:    v_sub_co_u32_e32 v3, vcc, v0, v4
+; GFX90A-NEXT:    v_sub_co_u32_e32 v3, vcc, v0, v6
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-NEXT:    v_subb_co_u32_e32 v4, vcc, v1, v5, vcc
+; GFX90A-NEXT:    v_subb_co_u32_e32 v4, vcc, v1, v7, vcc
 ; GFX90A-NEXT:    v_cndmask_b32_e64 v3, v3, 0, vcc
 ; GFX90A-NEXT:    v_cndmask_b32_e64 v4, v4, 0, vcc
 ; GFX90A-NEXT:    buffer_store_dword v3, v2, s[0:3], 0 offen
@@ -9027,29 +9027,28 @@ define void @flat_atomic_fadd_f32_ret_a_a(ptr %ptr) #0 {
 ; GFX90A-NEXT:  ; %bb.1: ; %atomicrmw.check.private
 ; GFX90A-NEXT:    s_mov_b64 s[6:7], src_private_base
 ; GFX90A-NEXT:    v_cmp_ne_u32_e32 vcc, s7, v1
-; GFX90A-NEXT:    ; implicit-def: $agpr0
+; GFX90A-NEXT:    ; implicit-def: $vgpr3
 ; GFX90A-NEXT:    s_and_saveexec_b64 s[6:7], vcc
 ; GFX90A-NEXT:    s_xor_b64 s[6:7], exec, s[6:7]
 ; GFX90A-NEXT:    s_cbranch_execz .LBB115_3
 ; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.global
-; GFX90A-NEXT:    global_atomic_add_f32 v0, v[0:1], v2, off glc
-; GFX90A-NEXT:    ; implicit-def: $vgpr2
-; GFX90A-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-NEXT:    v_accvgpr_write_b32 a0, v0
+; GFX90A-NEXT:    global_atomic_add_f32 v3, v[0:1], v2, off glc
 ; GFX90A-NEXT:    ; implicit-def: $vgpr0_vgpr1
+; GFX90A-NEXT:    ; implicit-def: $vgpr2
 ; GFX90A-NEXT:  .LBB115_3: ; %Flow
 ; GFX90A-NEXT:    s_andn2_saveexec_b64 s[6:7], s[6:7]
 ; GFX90A-NEXT:    s_cbranch_execz .LBB115_5
 ; GFX90A-NEXT:  ; %bb.4: ; %atomicrmw.private
 ; GFX90A-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[0:1]
 ; GFX90A-NEXT:    v_cndmask_b32_e32 v0, -1, v0, vcc
-; GFX90A-NEXT:    buffer_load_dword v1, v0, s[0:3], 0 offen
+; GFX90A-NEXT:    buffer_load_dword v3, v0, s[0:3], 0 offen
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-NEXT:    v_add_f32_e32 v2, v1, v2
-; GFX90A-NEXT:    v_accvgpr_write_b32 a0, v1
-; GFX90A-NEXT:    buffer_store_dword v2, v0, s[0:3], 0 offen
+; GFX90A-NEXT:    v_add_f32_e32 v1, v3, v2
+; GFX90A-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen
 ; GFX90A-NEXT:  .LBB115_5: ; %Flow1
 ; GFX90A-NEXT:    s_or_b64 exec, exec, s[6:7]
+; GFX90A-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NEXT:    v_accvgpr_write_b32 a0, v3
 ; GFX90A-NEXT:    ; implicit-def: $vgpr0_vgpr1
 ; GFX90A-NEXT:    ; implicit-def: $vgpr2
 ; GFX90A-NEXT:  .LBB115_6: ; %Flow2
@@ -9066,7 +9065,6 @@ define void @flat_atomic_fadd_f32_ret_a_a(ptr %ptr) #0 {
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use a0
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX950-LABEL: flat_atomic_fadd_f32_ret_a_a:
@@ -9829,33 +9827,31 @@ define void @flat_atomic_fadd_f64_ret_a_a(ptr %ptr) #0 {
 ; GFX90A-NEXT:  ; %bb.1: ; %atomicrmw.check.private
 ; GFX90A-NEXT:    s_mov_b64 s[6:7], src_private_base
 ; GFX90A-NEXT:    v_cmp_ne_u32_e32 vcc, s7, v1
-; GFX90A-NEXT:    ; implicit-def: $agpr0_agpr1
+; GFX90A-NEXT:    ; implicit-def: $vgpr4_vgpr5
 ; GFX90A-NEXT:    s_and_saveexec_b64 s[6:7], vcc
 ; GFX90A-NEXT:    s_xor_b64 s[6:7], exec, s[6:7]
 ; GFX90A-NEXT:    s_cbranch_execz .LBB127_3
 ; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.global
-; GFX90A-NEXT:    global_atomic_add_f64 v[0:1], v[0:1], v[2:3], off glc
-; GFX90A-NEXT:    ; implicit-def: $vgpr2_vgpr3
-; GFX90A-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-NEXT:    v_accvgpr_write_b32 a0, v0
-; GFX90A-NEXT:    v_accvgpr_write_b32 a1, v1
+; GFX90A-NEXT:    global_atomic_add_f64 v[4:5], v[0:1], v[2:3], off glc
 ; GFX90A-NEXT:    ; implicit-def: $vgpr0_vgpr1
+; GFX90A-NEXT:    ; implicit-def: $vgpr2_vgpr3
 ; GFX90A-NEXT:  .LBB127_3: ; %Flow
 ; GFX90A-NEXT:    s_andn2_saveexec_b64 s[6:7], s[6:7]
 ; GFX90A-NEXT:    s_cbranch_execz .LBB127_5
 ; GFX90A-NEXT:  ; %bb.4: ; %atomicrmw.private
 ; GFX90A-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[0:1]
-; GFX90A-NEXT:    v_cndmask_b32_e32 v4, -1, v0, vcc
-; GFX90A-NEXT:    buffer_load_dword v0, v4, s[0:3], 0 offen
-; GFX90A-NEXT:    buffer_load_dword v1, v4, s[0:3], 0 offen offset:4
+; GFX90A-NEXT:    v_cndmask_b32_e32 v6, -1, v0, vcc
+; GFX90A-NEXT:    buffer_load_dword v4, v6, s[0:3], 0 offen
+; GFX90A-NEXT:    buffer_load_dword v5, v6, s[0:3], 0 offen offset:4
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-NEXT:    v_accvgpr_write_b32 a0, v0
-; GFX90A-NEXT:    v_add_f64 v[2:3], v[0:1], v[2:3]
-; GFX90A-NEXT:    v_accvgpr_write_b32 a1, v1
-; GFX90A-NEXT:    buffer_store_dword v2, v4, s[0:3], 0 offen
-; GFX90A-NEXT:    buffer_store_dword v3, v4, s[0:3], 0 offen offset:4
+; GFX90A-NEXT:    v_add_f64 v[0:1], v[4:5], v[2:3]
+; GFX90A-NEXT:    buffer_store_dword v0, v6, s[0:3], 0 offen
+; GFX90A-NEXT:    buffer_store_dword v1, v6, s[0:3], 0 offen offset:4
 ; GFX90A-NEXT:  .LBB127_5: ; %Flow1
 ; GFX90A-NEXT:    s_or_b64 exec, exec, s[6:7]
+; GFX90A-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NEXT:    v_accvgpr_write_b32 a0, v4
+; GFX90A-NEXT:    v_accvgpr_write_b32 a1, v5
 ; GFX90A-NEXT:    ; implicit-def: $vgpr0_vgpr1
 ; GFX90A-NEXT:    ; implicit-def: $vgpr2_vgpr3
 ; GFX90A-NEXT:  .LBB127_6: ; %Flow2
@@ -9873,7 +9869,6 @@ define void @flat_atomic_fadd_f64_ret_a_a(ptr %ptr) #0 {
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use a[0:1]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX950-LABEL: flat_atomic_fadd_f64_ret_a_a:
@@ -9895,32 +9890,30 @@ define void @flat_atomic_fadd_f64_ret_a_a(ptr %ptr) #0 {
 ; GFX950-NEXT:  ; %bb.1: ; %atomicrmw.check.private
 ; GFX950-NEXT:    s_mov_b64 s[2:3], src_private_base
 ; GFX950-NEXT:    v_cmp_ne_u32_e32 vcc, s3, v1
-; GFX950-NEXT:    ; implicit-def: $agpr0_agpr1
+; GFX950-NEXT:    ; implicit-def: $vgpr4_vgpr5
 ; GFX950-NEXT:    s_and_saveexec_b64 s[2:3], vcc
 ; GFX950-NEXT:    s_xor_b64 s[2:3], exec, s[2:3]
 ; GFX950-NEXT:    s_cbranch_execz .LBB127_3
 ; GFX950-NEXT:  ; %bb.2: ; %atomicrmw.global
-; GFX950-NEXT:    global_atomic_add_f64 v[0:1], v[0:1], v[2:3], off sc0
-; GFX950-NEXT:    ; implicit-def: $vgpr2_vgpr3
-; GFX950-NEXT:    s_waitcnt vmcnt(0)
-; GFX950-NEXT:    v_accvgpr_write_b32 a0, v0
-; GFX950-NEXT:    v_accvgpr_write_b32 a1, v1
+; GFX950-NEXT:    global_atomic_add_f64 v[4:5], v[0:1], v[2:3], off sc0
 ; GFX950-NEXT:    ; implicit-def: $vgpr0_vgpr1
+; GFX950-NEXT:    ; implicit-def: $vgpr2_vgpr3
 ; GFX950-NEXT:  .LBB127_3: ; %Flow
 ; GFX950-NEXT:    s_andn2_saveexec_b64 s[2:3], s[2:3]
 ; GFX950-NEXT:    s_cbranch_execz .LBB127_5
 ; GFX950-NEXT:  ; %bb.4: ; %atomicrmw.private
 ; GFX950-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[0:1]
 ; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v4, -1, v0, vcc
-; GFX950-NEXT:    scratch_load_dwordx2 v[0:1], v4, off
+; GFX950-NEXT:    v_cndmask_b32_e32 v6, -1, v0, vcc
+; GFX950-NEXT:    scratch_load_dwordx2 v[4:5], v6, off
 ; GFX950-NEXT:    s_waitcnt vmcnt(0)
-; GFX950-NEXT:    v_accvgpr_write_b32 a0, v0
-; GFX950-NEXT:    v_add_f64 v[2:3], v[0:1], v[2:3]
-; GFX950-NEXT:    v_accvgpr_write_b32 a1, v1
-; GFX950-NEXT:    scratch_store_dwordx2 v4, v[2:3], off
+; GFX950-NEXT:    v_add_f64 v[0:1], v[4:5], v[2:3]
+; GFX950-NEXT:    scratch_store_dwordx2 v6, v[0:1], off
 ; GFX950-NEXT:  .LBB127_5: ; %Flow1
 ; GFX950-NEXT:    s_or_b64 exec, exec, s[2:3]
+; GFX950-NEXT:    s_waitcnt vmcnt(0)
+; GFX950-NEXT:    v_accvgpr_write_b32 a0, v4
+; GFX950-NEXT:    v_accvgpr_write_b32 a1, v5
 ; GFX950-NEXT:    ; implicit-def: $vgpr0_vgpr1
 ; GFX950-NEXT:    ; implicit-def: $vgpr2_vgpr3
 ; GFX950-NEXT:  .LBB127_6: ; %Flow2
@@ -9939,7 +9932,6 @@ define void @flat_atomic_fadd_f64_ret_a_a(ptr %ptr) #0 {
 ; GFX950-NEXT:    ;;#ASMSTART
 ; GFX950-NEXT:    ; use a[0:1]
 ; GFX950-NEXT:    ;;#ASMEND
-; GFX950-NEXT:    s_waitcnt vmcnt(0)
 ; GFX950-NEXT:    s_setpc_b64 s[30:31]
   %gep.0 = getelementptr inbounds [512 x double], ptr %ptr, i64 0, i64 10
   %data = call double asm "; def $0", "=a"()
@@ -10407,31 +10399,31 @@ define void @flat_atomic_fmax_f64_ret_av_av(ptr %ptr) #0 {
 ; GFX90A-LABEL: flat_atomic_fmax_f64_ret_av_av:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT:    v_add_co_u32_e32 v4, vcc, 0x50, v0
+; GFX90A-NEXT:    v_add_co_u32_e32 v2, vcc, 0x50, v0
 ; GFX90A-NEXT:    s_mov_b64 s[4:5], src_private_base
-; GFX90A-NEXT:    v_addc_co_u32_e32 v5, vcc, 0, v1, vcc
-; GFX90A-NEXT:    v_cmp_ne_u32_e32 vcc, s5, v5
+; GFX90A-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v1, vcc
+; GFX90A-NEXT:    v_cmp_ne_u32_e32 vcc, s5, v3
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def v[2:3]
+; GFX90A-NEXT:    ; def v[4:5]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    ; implicit-def: $vgpr0_vgpr1
 ; GFX90A-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; GFX90A-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
 ; GFX90A-NEXT:    s_cbranch_execz .LBB132_2
 ; GFX90A-NEXT:  ; %bb.1: ; %atomicrmw.global
-; GFX90A-NEXT:    flat_atomic_max_f64 v[0:1], v[4:5], v[2:3] glc
+; GFX90A-NEXT:    flat_atomic_max_f64 v[0:1], v[2:3], v[4:5] glc
 ; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT:    ; implicit-def: $vgpr4_vgpr5
 ; GFX90A-NEXT:    ; implicit-def: $vgpr2_vgpr3
+; GFX90A-NEXT:    ; implicit-def: $vgpr4_vgpr5
 ; GFX90A-NEXT:  .LBB132_2: ; %Flow
 ; GFX90A-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
 ; GFX90A-NEXT:    s_cbranch_execz .LBB132_4
 ; GFX90A-NEXT:  ; %bb.3: ; %atomicrmw.private
-; GFX90A-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[4:5]
-; GFX90A-NEXT:    v_cndmask_b32_e32 v6, -1, v4, vcc
+; GFX90A-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[2:3]
+; GFX90A-NEXT:    v_cndmask_b32_e32 v6, -1, v2, vcc
 ; GFX90A-NEXT:    buffer_load_dword v0, v6, s[0:3], 0 offen
 ; GFX90A-NEXT:    buffer_load_dword v1, v6, s[0:3], 0 offen offset:4
-; GFX90A-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
+; GFX90A-NEXT:    v_max_f64 v[2:3], v[4:5], v[4:5]
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    v_max_f64 v[4:5], v[0:1], v[0:1]
 ; GFX90A-NEXT:    v_max_f64 v[2:3], v[4:5], v[2:3]
@@ -10595,31 +10587,31 @@ define void @flat_atomic_fmin_f64_ret_av_av(ptr %ptr) #0 {
 ; GFX90A-LABEL: flat_atomic_fmin_f64_ret_av_av:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT:    v_add_co_u32_e32 v4, vcc, 0x50, v0
+; GFX90A-NEXT:    v_add_co_u32_e32 v2, vcc, 0x50, v0
 ; GFX90A-NEXT:    s_mov_b64 s[4:5], src_private_base
-; GFX90A-NEXT:    v_addc_co_u32_e32 v5, vcc, 0, v1, vcc
-; GFX90A-NEXT:    v_cmp_ne_u32_e32 vcc, s5, v5
+; GFX90A-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v1, vcc
+; GFX90A-NEXT:    v_cmp_ne_u32_e32 vcc, s5, v3
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def v[2:3]
+; GFX90A-NEXT:    ; def v[4:5]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    ; implicit-def: $vgpr0_vgpr1
 ; GFX90A-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; GFX90A-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
 ; GFX90A-NEXT:    s_cbranch_execz .LBB134_2
 ; GFX90A-NEXT:  ; %bb.1: ; %atomicrmw.global
-; GFX90A-NEXT:    flat_atomic_min_f64 v[0:1], v[4:5], v[2:3] glc
+; GFX90A-NEXT:    flat_atomic_min_f64 v[0:1], v[2:3], v[4:5] glc
 ; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT:    ; implicit-def: $vgpr4_vgpr5
 ; GFX90A-NEXT:    ; implicit-def: $vgpr2_vgpr3
+; GFX90A-NEXT:    ; implicit-def: $vgpr4_vgpr5
 ; GFX90A-NEXT:  .LBB134_2: ; %Flow
 ; GFX90A-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
 ; GFX90A-NEXT:    s_cbranch_execz .LBB134_4
 ; GFX90A-NEXT:  ; %bb.3: ; %atomicrmw.private
-; GFX90A-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[4:5]
-; GFX90A-NEXT:    v_cndmask_b32_e32 v6, -1, v4, vcc
+; GFX90A-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[2:3]
+; GFX90A-NEXT:    v_cndmask_b32_e32 v6, -1, v2, vcc
 ; GFX90A-NEXT:    buffer_load_dword v0, v6, s[0:3], 0 offen
 ; GFX90A-NEXT:    buffer_load_dword v1, v6, s[0:3], 0 offen offset:4
-; GFX90A-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
+; GFX90A-NEXT:    v_max_f64 v[2:3], v[4:5], v[4:5]
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    v_max_f64 v[4:5], v[0:1], v[0:1]
 ; GFX90A-NEXT:    v_min_f64 v[2:3], v[4:5], v[2:3]
@@ -14438,30 +14430,30 @@ define void @flat_atomic_xchg_i64_saddr_ret_av_av(ptr inreg %ptr) #0 {
 ; GFX90A-NEXT:    s_cselect_b64 s[6:7], -1, 0
 ; GFX90A-NEXT:    s_andn2_b64 vcc, exec, s[6:7]
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def v[0:1]
+; GFX90A-NEXT:    ; def v[2:3]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_cbranch_vccz .LBB194_2
 ; GFX90A-NEXT:  ; %bb.1: ; %atomicrmw.global
-; GFX90A-NEXT:    v_pk_mov_b32 v[2:3], s[4:5], s[4:5] op_sel:[0,1]
-; GFX90A-NEXT:    flat_atomic_swap_x2 v[2:3], v[2:3], v[0:1] glc
+; GFX90A-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-NEXT:    flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] glc
 ; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-NEXT:    s_cbranch_execz .LBB194_3
 ; GFX90A-NEXT:    s_branch .LBB194_4
 ; GFX90A-NEXT:  .LBB194_2:
-; GFX90A-NEXT:    ; implicit-def: $vgpr2_vgpr3
+; GFX90A-NEXT:    ; implicit-def: $vgpr0_vgpr1
 ; GFX90A-NEXT:  .LBB194_3: ; %atomicrmw.private
 ; GFX90A-NEXT:    s_cmp_lg_u64 s[4:5], 0
 ; GFX90A-NEXT:    s_cselect_b32 s4, s4, -1
 ; GFX90A-NEXT:    v_mov_b32_e32 v4, s4
-; GFX90A-NEXT:    buffer_load_dword v2, v4, s[0:3], 0 offen
-; GFX90A-NEXT:    buffer_load_dword v3, v4, s[0:3], 0 offen offset:4
+; GFX90A-NEXT:    buffer_load_dword v0, v4, s[0:3], 0 offen
+; GFX90A-NEXT:    buffer_load_dword v1, v4, s[0:3], 0 offen offset:4
 ; GFX90A-NEXT:    s_nop 0
-; GFX90A-NEXT:    buffer_store_dword v0, v4, s[0:3], 0 offen
-; GFX90A-NEXT:    buffer_store_dword v1, v4, s[0:3], 0 offen offset:4
+; GFX90A-NEXT:    buffer_store_dword v2, v4, s[0:3], 0 offen
+; GFX90A-NEXT:    buffer_store_dword v3, v4, s[0:3], 0 offen offset:4
 ; GFX90A-NEXT:  .LBB194_4: ; %atomicrmw.end
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; use v[2:3]
+; GFX90A-NEXT:    ; use v[0:1]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -14475,27 +14467,27 @@ define void @flat_atomic_xchg_i64_saddr_ret_av_av(ptr inreg %ptr) #0 {
 ; GFX950-NEXT:    s_cselect_b64 s[2:3], -1, 0
 ; GFX950-NEXT:    s_andn2_b64 vcc, exec, s[2:3]
 ; GFX950-NEXT:    ;;#ASMSTART
-; GFX950-NEXT:    ; def v[0:1]
+; GFX950-NEXT:    ; def v[2:3]
 ; GFX950-NEXT:    ;;#ASMEND
 ; GFX950-NEXT:    s_cbranch_vccz .LBB194_2
 ; GFX950-NEXT:  ; %bb.1: ; %atomicrmw.global
-; GFX950-NEXT:    v_mov_b64_e32 v[2:3], s[0:1]
-; GFX950-NEXT:    flat_atomic_swap_x2 v[2:3], v[2:3], v[0:1] sc0
+; GFX950-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
+; GFX950-NEXT:    flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] sc0
 ; GFX950-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX950-NEXT:    s_cbranch_execz .LBB194_3
 ; GFX950-NEXT:    s_branch .LBB194_4
 ; GFX950-NEXT:  .LBB194_2:
-; GFX950-NEXT:    ; implicit-def: $vgpr2_vgpr3
+; GFX950-NEXT:    ; implicit-def: $vgpr0_vgpr1
 ; GFX950-NEXT:  .LBB194_3: ; %atomicrmw.private
 ; GFX950-NEXT:    s_cmp_lg_u64 s[0:1], 0
 ; GFX950-NEXT:    s_cselect_b32 s0, s0, -1
-; GFX950-NEXT:    scratch_load_dwordx2 v[2:3], off, s0
+; GFX950-NEXT:    scratch_load_dwordx2 v[0:1], off, s0
 ; GFX950-NEXT:    s_nop 0
-; GFX950-NEXT:    scratch_store_dwordx2 off, v[0:1], s0
+; GFX950-NEXT:    scratch_store_dwordx2 off, v[2:3], s0
 ; GFX950-NEXT:  .LBB194_4: ; %atomicrmw.end
 ; GFX950-NEXT:    s_waitcnt vmcnt(0)
 ; GFX950-NEXT:    ;;#ASMSTART
-; GFX950-NEXT:    ; use v[2:3]
+; GFX950-NEXT:    ; use v[0:1]
 ; GFX950-NEXT:    ;;#ASMEND
 ; GFX950-NEXT:    s_setpc_b64 s[30:31]
   %gep.0 = getelementptr inbounds [512 x i64], ptr %ptr, i64 0, i64 10
@@ -14612,32 +14604,32 @@ define void @flat_atomic_add_i64_saddr_ret_av_av(ptr inreg %ptr) #0 {
 ; GFX90A-NEXT:    s_cselect_b64 s[6:7], -1, 0
 ; GFX90A-NEXT:    s_andn2_b64 vcc, exec, s[6:7]
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def v[0:1]
+; GFX90A-NEXT:    ; def v[2:3]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_cbranch_vccz .LBB196_2
 ; GFX90A-NEXT:  ; %bb.1: ; %atomicrmw.global
-; GFX90A-NEXT:    v_pk_mov_b32 v[2:3], s[4:5], s[4:5] op_sel:[0,1]
-; GFX90A-NEXT:    flat_atomic_add_x2 v[2:3], v[2:3], v[0:1] glc
+; GFX90A-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-NEXT:    flat_atomic_add_x2 v[0:1], v[0:1], v[2:3] glc
 ; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-NEXT:    s_cbranch_execz .LBB196_3
 ; GFX90A-NEXT:    s_branch .LBB196_4
 ; GFX90A-NEXT:  .LBB196_2:
-; GFX90A-NEXT:    ; implicit-def: $vgpr2_vgpr3
+; GFX90A-NEXT:    ; implicit-def: $vgpr0_vgpr1
 ; GFX90A-NEXT:  .LBB196_3: ; %atomicrmw.private
 ; GFX90A-NEXT:    s_cmp_lg_u64 s[4:5], 0
 ; GFX90A-NEXT:    s_cselect_b32 s4, s4, -1
 ; GFX90A-NEXT:    v_mov_b32_e32 v4, s4
-; GFX90A-NEXT:    buffer_load_dword v2, v4, s[0:3], 0 offen
-; GFX90A-NEXT:    buffer_load_dword v3, v4, s[0:3], 0 offen offset:4
+; GFX90A-NEXT:    buffer_load_dword v0, v4, s[0:3], 0 offen
+; GFX90A-NEXT:    buffer_load_dword v1, v4, s[0:3], 0 offen offset:4
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-NEXT:    v_add_co_u32_e32 v0, vcc, v2, v0
-; GFX90A-NEXT:    v_addc_co_u32_e32 v1, vcc, v3, v1, vcc
-; GFX90A-NEXT:    buffer_store_dword v0, v4, s[0:3], 0 offen
-; GFX90A-NEXT:    buffer_store_dword v1, v4, s[0:3], 0 offen offset:4
+; GFX90A-NEXT:    v_add_co_u32_e32 v2, vcc, v0, v2
+; GFX90A-NEXT:    v_addc_co_u32_e32 v3, vcc, v1, v3, vcc
+; GFX90A-NEXT:    buffer_store_dword v2, v4, s[0:3], 0 offen
+; GFX90A-NEXT:    buffer_store_dword v3, v4, s[0:3], 0 offen offset:4
 ; GFX90A-NEXT:  .LBB196_4: ; %atomicrmw.end
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; use v[2:3]
+; GFX90A-NEXT:    ; use v[0:1]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -14651,28 +14643,28 @@ define void @flat_atomic_add_i64_saddr_ret_av_av(ptr inreg %ptr) #0 {
 ; GFX950-NEXT:    s_cselect_b64 s[2:3], -1, 0
 ; GFX950-NEXT:    s_andn2_b64 vcc, exec, s[2:3]
 ; GFX950-NEXT:    ;;#ASMSTART
-; GFX950-NEXT:    ; def v[0:1]
+; GFX950-NEXT:    ; def v[2:3]
 ; GFX950-NEXT:    ;;#ASMEND
 ; GFX950-NEXT:    s_cbranch_vccz .LBB196_2
 ; GFX950-NEXT:  ; %bb.1: ; %atomicrmw.global
-; GFX950-NEXT:    v_mov_b64_e32 v[2:3], s[0:1]
-; GFX950-NEXT:    flat_atomic_add_x2 v[2:3], v[2:3], v[0:1] sc0
+; GFX950-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
+; GFX950-NEXT:    flat_atomic_add_x2 v[0:1], v[0:1], v[2:3] sc0
 ; GFX950-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX950-NEXT:    s_cbranch_execz .LBB196_3
 ; GFX950-NEXT:    s_branch .LBB196_4
 ; GFX950-NEXT:  .LBB196_2:
-; GFX950-NEXT:    ; implicit-def: $vgpr2_vgpr3
+; GFX950-NEXT:    ; implicit-def: $vgpr0_vgpr1
 ; GFX950-NEXT:  .LBB196_3: ; %atomicrmw.private
 ; GFX950-NEXT:    s_cmp_lg_u64 s[0:1], 0
 ; GFX950-NEXT:    s_cselect_b32 s0, s0, -1
-; GFX950-NEXT:    scratch_load_dwordx2 v[2:3], off, s0
+; GFX950-NEXT:    scratch_load_dwordx2 v[0:1], off, s0
 ; GFX950-NEXT:    s_waitcnt vmcnt(0)
-; GFX950-NEXT:    v_lshl_add_u64 v[0:1], v[2:3], 0, v[0:1]
-; GFX950-NEXT:    scratch_store_dwordx2 off, v[0:1], s0
+; GFX950-NEXT:    v_lshl_add_u64 v[2:3], v[0:1], 0, v[2:3]
+; GFX950-NEXT:    scratch_store_dwordx2 off, v[2:3], s0
 ; GFX950-NEXT:  .LBB196_4: ; %atomicrmw.end
 ; GFX950-NEXT:    s_waitcnt vmcnt(0)
 ; GFX950-NEXT:    ;;#ASMSTART
-; GFX950-NEXT:    ; use v[2:3]
+; GFX950-NEXT:    ; use v[0:1]
 ; GFX950-NEXT:    ;;#ASMEND
 ; GFX950-NEXT:    s_setpc_b64 s[30:31]
   %gep.0 = getelementptr inbounds [512 x i64], ptr %ptr, i64 0, i64 10
@@ -14791,32 +14783,32 @@ define void @flat_atomic_sub_i64_saddr_ret_av_av(ptr inreg %ptr) #0 {
 ; GFX90A-NEXT:    s_cselect_b64 s[6:7], -1, 0
 ; GFX90A-NEXT:    s_andn2_b64 vcc, exec, s[6:7]
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def v[0:1]
+; GFX90A-NEXT:    ; def v[2:3]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_cbranch_vccz .LBB198_2
 ; GFX90A-NEXT:  ; %bb.1: ; %atomicrmw.global
-; GFX90A-NEXT:    v_pk_mov_b32 v[2:3], s[4:5], s[4:5] op_sel:[0,1]
-; GFX90A-NEXT:    flat_atomic_sub_x2 v[2:3], v[2:3], v[0:1] glc
+; GFX90A-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-NEXT:    flat_atomic_sub_x2 v[0:1], v[0:1], v[2:3] glc
 ; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-NEXT:    s_cbranch_execz .LBB198_3
 ; GFX90A-NEXT:    s_branch .LBB198_4
 ; GFX90A-NEXT:  .LBB198_2:
-; GFX90A-NEXT:    ; implicit-def: $vgpr2_vgpr3
+; GFX90A-NEXT:    ; implicit-def: $vgpr0_vgpr1
 ; GFX90A-NEXT:  .LBB198_3: ; %atomicrmw.private
 ; GFX90A-NEXT:    s_cmp_lg_u64 s[4:5], 0
 ; GFX90A-NEXT:    s_cselect_b32 s4, s4, -1
 ; GFX90A-NEXT:    v_mov_b32_e32 v4, s4
-; GFX90A-NEXT:    buffer_load_dword v2, v4, s[0:3], 0 offen
-; GFX90A-NEXT:    buffer_load_dword v3, v4, s[0:3], 0 offen offset:4
+; GFX90A-NEXT:    buffer_load_dword v0, v4, s[0:3], 0 offen
+; GFX90A-NEXT:    buffer_load_dword v1, v4, s[0:3], 0 offen offset:4
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-NEXT:    v_sub_co_u32_e32 v0, vcc, v2, v0
-; GFX90A-NEXT:    v_subb_co_u32_e32 v1, vcc, v3, v1, vcc
-; GFX90A-NEXT:    buffer_store_dword v0, v4, s[0:3], 0 offen
-; GFX90A-NEXT:    buffer_store_dword v1, v4, s[0:3], 0 offen offset:4
+; GFX90A-NEXT:    v_sub_co_u32_e32 v2, vcc, v0, v2
+; GFX90A-NEXT:    v_subb_co_u32_e32 v3, vcc, v1, v3, vcc
+; GFX90A-NEXT:    buffer_store_dword v2, v4, s[0:3], 0 offen
+; GFX90A-NEXT:    buffer_store_dword v3, v4, s[0:3], 0 offen offset:4
 ; GFX90A-NEXT:  .LBB198_4: ; %atomicrmw.end
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; use v[2:3]
+; GFX90A-NEXT:    ; use v[0:1]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -14971,32 +14963,32 @@ define void @flat_atomic_and_i64_saddr_ret_av_av(ptr inreg %ptr) #0 {
 ; GFX90A-NEXT:    s_cselect_b64 s[6:7], -1, 0
 ; GFX90A-NEXT:    s_andn2_b64 vcc, exec, s[6:7]
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def v[0:1]
+; GFX90A-NEXT:    ; def v[2:3]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_cbranch_vccz .LBB200_2
 ; GFX90A-NEXT:  ; %bb.1: ; %atomicrmw.global
-; GFX90A-NEXT:    v_pk_mov_b32 v[2:3], s[4:5], s[4:5] op_sel:[0,1]
-; GFX90A-NEXT:    flat_atomic_and_x2 v[2:3], v[2:3], v[0:1] glc
+; GFX90A-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-NEXT:    flat_atomic_and_x2 v[0:1], v[0:1], v[2:3] glc
 ; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-NEXT:    s_cbranch_execz .LBB200_3
 ; GFX90A-NEXT:    s_branch .LBB200_4
 ; GFX90A-NEXT:  .LBB200_2:
-; GFX90A-NEXT:    ; implicit-def: $vgpr2_vgpr3
+; GFX90A-NEXT:    ; implicit-def: $vgpr0_vgpr1
 ; GFX90A-NEXT:  .LBB200_3: ; %atomicrmw.private
 ; GFX90A-NEXT:    s_cmp_lg_u64 s[4:5], 0
 ; GFX90A-NEXT:    s_cselect_b32 s4, s4, -1
 ; GFX90A-NEXT:    v_mov_b32_e32 v4, s4
-; GFX90A-NEXT:    buffer_load_dword v3, v4, s[0:3], 0 offen offset:4
-; GFX90A-NEXT:    buffer_load_dword v2, v4, s[0:3], 0 offen
+; GFX90A-NEXT:    buffer_load_dword v1, v4, s[0:3], 0 offen offset:4
+; GFX90A-NEXT:    buffer_load_dword v0, v4, s[0:3], 0 offen
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-NEXT:    v_and_b32_e32 v1, v3, v1
-; GFX90A-NEXT:    v_and_b32_e32 v0, v2, v0
-; GFX90A-NEXT:    buffer_store_dword v0, v4, s[0:3], 0 offen
-; GFX90A-NEXT:    buffer_store_dword v1, v4, s[0:3], 0 offen offset:4
+; GFX90A-NEXT:    v_and_b32_e32 v3, v1, v3
+; GFX90A-NEXT:    v_and_b32_e32 v2, v0, v2
+; GFX90A-NEXT:    buffer_store_dword v2, v4, s[0:3], 0 offen
+; GFX90A-NEXT:    buffer_store_dword v3, v4, s[0:3], 0 offen offset:4
 ; GFX90A-NEXT:  .LBB200_4: ; %atomicrmw.end
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; use v[2:3]
+; GFX90A-NEXT:    ; use v[0:1]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -15400,32 +15392,32 @@ define void @flat_atomic_or_i64_saddr_ret_av_av(ptr inreg %ptr) #0 {
 ; GFX90A-NEXT:    s_cselect_b64 s[6:7], -1, 0
 ; GFX90A-NEXT:    s_andn2_b64 vcc, exec, s[6:7]
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def v[0:1]
+; GFX90A-NEXT:    ; def v[2:3]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_cbranch_vccz .LBB204_2
 ; GFX90A-NEXT:  ; %bb.1: ; %atomicrmw.global
-; GFX90A-NEXT:    v_pk_mov_b32 v[2:3], s[4:5], s[4:5] op_sel:[0,1]
-; GFX90A-NEXT:    flat_atomic_or_x2 v[2:3], v[2:3], v[0:1] glc
+; GFX90A-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-NEXT:    flat_atomic_or_x2 v[0:1], v[0:1], v[2:3] glc
 ; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-NEXT:    s_cbranch_execz .LBB204_3
 ; GFX90A-NEXT:    s_branch .LBB204_4
 ; GFX90A-NEXT:  .LBB204_2:
-; GFX90A-NEXT:    ; implicit-def: $vgpr2_vgpr3
+; GFX90A-NEXT:    ; implicit-def: $vgpr0_vgpr1
 ; GFX90A-NEXT:  .LBB204_3: ; %atomicrmw.private
 ; GFX90A-NEXT:    s_cmp_lg_u64 s[4:5], 0
 ; GFX90A-NEXT:    s_cselect_b32 s4, s4, -1
 ; GFX90A-NEXT:    v_mov_b32_e32 v4, s4
-; GFX90A-NEXT:    buffer_load_dword v3, v4, s[0:3], 0 offen offset:4
-; GFX90A-NEXT:    buffer_load_dword v2, v4, s[0:3], 0 offen
+; GFX90A-NEXT:    buffer_load_dword v1, v4, s[0:3], 0 offen offset:4
+; GFX90A-NEXT:    buffer_load_dword v0, v4, s[0:3], 0 offen
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-NEXT:    v_or_b32_e32 v1, v3, v1
-; GFX90A-NEXT:    v_or_b32_e32 v0, v2, v0
-; GFX90A-NEXT:    buffer_store_dword v0, v4, s[0:3], 0 offen
-; GFX90A-NEXT:    buffer_store_dword v1, v4, s[0:3], 0 offen offset:4
+; GFX90A-NEXT:    v_or_b32_e32 v3, v1, v3
+; GFX90A-NEXT:    v_or_b32_e32 v2, v0, v2
+; GFX90A-NEXT:    buffer_store_dword v2, v4, s[0:3], 0 offen
+; GFX90A-NEXT:    buffer_store_dword v3, v4, s[0:3], 0 offen offset:4
 ; GFX90A-NEXT:  .LBB204_4: ; %atomicrmw.end
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; use v[2:3]
+; GFX90A-NEXT:    ; use v[0:1]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -15579,32 +15571,32 @@ define void @flat_atomic_xor_i64_saddr_ret_av_av(ptr inreg %ptr) #0 {
 ; GFX90A-NEXT:    s_cselect_b64 s[6:7], -1, 0
 ; GFX90A-NEXT:    s_andn2_b64 vcc, exec, s[6:7]
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def v[0:1]
+; GFX90A-NEXT:    ; def v[2:3]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_cbranch_vccz .LBB206_2
 ; GFX90A-NEXT:  ; %bb.1: ; %atomicrmw.global
-; GFX90A-NEXT:    v_pk_mov_b32 v[2:3], s[4:5], s[4:5] op_sel:[0,1]
-; GFX90A-NEXT:    flat_atomic_xor_x2 v[2:3], v[2:3], v[0:1] glc
+; GFX90A-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-NEXT:    flat_atomic_xor_x2 v[0:1], v[0:1], v[2:3] glc
 ; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-NEXT:    s_cbranch_execz .LBB206_3
 ; GFX90A-NEXT:    s_branch .LBB206_4
 ; GFX90A-NEXT:  .LBB206_2:
-; GFX90A-NEXT:    ; implicit-def: $vgpr2_vgpr3
+; GFX90A-NEXT:    ; implicit-def: $vgpr0_vgpr1
 ; GFX90A-NEXT:  .LBB206_3: ; %atomicrmw.private
 ; GFX90A-NEXT:    s_cmp_lg_u64 s[4:5], 0
 ; GFX90A-NEXT:    s_cselect_b32 s4, s4, -1
 ; GFX90A-NEXT:    v_mov_b32_e32 v4, s4
-; GFX90A-NEXT:    buffer_load_dword v3, v4, s[0:3], 0 offen offset:4
-; GFX90A-NEXT:    buffer_load_dword v2, v4, s[0:3], 0 offen
+; GFX90A-NEXT:    buffer_load_dword v1, v4, s[0:3], 0 offen offset:4
+; GFX90A-NEXT:    buffer_load_dword v0, v4, s[0:3], 0 offen
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-NEXT:    v_xor_b32_e32 v1, v3, v1
-; GFX90A-NEXT:    v_xor_b32_e32 v0, v2, v0
-; GFX90A-NEXT:    buffer_store_dword v0, v4, s[0:3], 0 offen
-; GFX90A-NEXT:    buffer_store_dword v1, v4, s[0:3], 0 offen offset:4
+; GFX90A-NEXT:    v_xor_b32_e32 v3, v1, v3
+; GFX90A-NEXT:    v_xor_b32_e32 v2, v0, v2
+; GFX90A-NEXT:    buffer_store_dword v2, v4, s[0:3], 0 offen
+; GFX90A-NEXT:    buffer_store_dword v3, v4, s[0:3], 0 offen offset:4
 ; GFX90A-NEXT:  .LBB206_4: ; %atomicrmw.end
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; use v[2:3]
+; GFX90A-NEXT:    ; use v[0:1]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -15761,33 +15753,33 @@ define void @flat_atomic_max_i64_saddr_ret_av_av(ptr inreg %ptr) #0 {
 ; GFX90A-NEXT:    s_cselect_b64 s[6:7], -1, 0
 ; GFX90A-NEXT:    s_andn2_b64 vcc, exec, s[6:7]
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def v[0:1]
+; GFX90A-NEXT:    ; def v[2:3]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_cbranch_vccz .LBB208_2
 ; GFX90A-NEXT:  ; %bb.1: ; %atomicrmw.global
-; GFX90A-NEXT:    v_pk_mov_b32 v[2:3], s[4:5], s[4:5] op_sel:[0,1]
-; GFX90A-NEXT:    flat_atomic_smax_x2 v[2:3], v[2:3], v[0:1] glc
+; GFX90A-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-NEXT:    flat_atomic_smax_x2 v[0:1], v[0:1], v[2:3] glc
 ; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-NEXT:    s_cbranch_execz .LBB208_3
 ; GFX90A-NEXT:    s_branch .LBB208_4
 ; GFX90A-NEXT:  .LBB208_2:
-; GFX90A-NEXT:    ; implicit-def: $vgpr2_vgpr3
+; GFX90A-NEXT:    ; implicit-def: $vgpr0_vgpr1
 ; GFX90A-NEXT:  .LBB208_3: ; %atomicrmw.private
 ; GFX90A-NEXT:    s_cmp_lg_u64 s[4:5], 0
 ; GFX90A-NEXT:    s_cselect_b32 s4, s4, -1
 ; GFX90A-NEXT:    v_mov_b32_e32 v4, s4
-; GFX90A-NEXT:    buffer_load_dword v2, v4, s[0:3], 0 offen
-; GFX90A-NEXT:    buffer_load_dword v3, v4, s[0:3], 0 offen offset:4
+; GFX90A-NEXT:    buffer_load_dword v0, v4, s[0:3], 0 offen
+; GFX90A-NEXT:    buffer_load_dword v1, v4, s[0:3], 0 offen offset:4
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-NEXT:    v_cmp_gt_i64_e32 vcc, v[2:3], v[0:1]
-; GFX90A-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
-; GFX90A-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
-; GFX90A-NEXT:    buffer_store_dword v0, v4, s[0:3], 0 offen
-; GFX90A-NEXT:    buffer_store_dword v1, v4, s[0:3], 0 offen offset:4
+; GFX90A-NEXT:    v_cmp_gt_i64_e32 vcc, v[0:1], v[2:3]
+; GFX90A-NEXT:    v_cndmask_b32_e32 v2, v2, v0, vcc
+; GFX90A-NEXT:    v_cndmask_b32_e32 v3, v3, v1, vcc
+; GFX90A-NEXT:    buffer_store_dword v2, v4, s[0:3], 0 offen
+; GFX90A-NEXT:    buffer_store_dword v3, v4, s[0:3], 0 offen offset:4
 ; GFX90A-NEXT:  .LBB208_4: ; %atomicrmw.end
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; use v[2:3]
+; GFX90A-NEXT:    ; use v[0:1]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -15946,33 +15938,33 @@ define void @flat_atomic_min_i64_saddr_ret_av_av(ptr inreg %ptr) #0 {
 ; GFX90A-NEXT:    s_cselect_b64 s[6:7], -1, 0
 ; GFX90A-NEXT:    s_andn2_b64 vcc, exec, s[6:7]
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def v[0:1]
+; GFX90A-NEXT:    ; def v[2:3]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_cbranch_vccz .LBB210_2
 ; GFX90A-NEXT:  ; %bb.1: ; %atomicrmw.global
-; GFX90A-NEXT:    v_pk_mov_b32 v[2:3], s[4:5], s[4:5] op_sel:[0,1]
-; GFX90A-NEXT:    flat_atomic_smin_x2 v[2:3], v[2:3], v[0:1] glc
+; GFX90A-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-NEXT:    flat_atomic_smin_x2 v[0:1], v[0:1], v[2:3] glc
 ; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-NEXT:    s_cbranch_execz .LBB210_3
 ; GFX90A-NEXT:    s_branch .LBB210_4
 ; GFX90A-NEXT:  .LBB210_2:
-; GFX90A-NEXT:    ; implicit-def: $vgpr2_vgpr3
+; GFX90A-NEXT:    ; implicit-def: $vgpr0_vgpr1
 ; GFX90A-NEXT:  .LBB210_3: ; %atomicrmw.private
 ; GFX90A-NEXT:    s_cmp_lg_u64 s[4:5], 0
 ; GFX90A-NEXT:    s_cselect_b32 s4, s4, -1
 ; GFX90A-NEXT:    v_mov_b32_e32 v4, s4
-; GFX90A-NEXT:    buffer_load_dword v2, v4, s[0:3], 0 offen
-; GFX90A-NEXT:    buffer_load_dword v3, v4, s[0:3], 0 offen offset:4
+; GFX90A-NEXT:    buffer_load_dword v0, v4, s[0:3], 0 offen
+; GFX90A-NEXT:    buffer_load_dword v1, v4, s[0:3], 0 offen offset:4
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-NEXT:    v_cmp_le_i64_e32 vcc, v[2:3], v[0:1]
-; GFX90A-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
-; GFX90A-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
-; GFX90A-NEXT:    buffer_store_dword v0, v4, s[0:3], 0 offen
-; GFX90A-NEXT:    buffer_store_dword v1, v4, s[0:3], 0 offen offset:4
+; GFX90A-NEXT:    v_cmp_le_i64_e32 vcc, v[0:1], v[2:3]
+; GFX90A-NEXT:    v_cndmask_b32_e32 v2, v2, v0, vcc
+; GFX90A-NEXT:    v_cndmask_b32_e32 v3, v3, v1, vcc
+; GFX90A-NEXT:    buffer_store_dword v2, v4, s[0:3], 0 offen
+; GFX90A-NEXT:    buffer_store_dword v3, v4, s[0:3], 0 offen offset:4
 ; GFX90A-NEXT:  .LBB210_4: ; %atomicrmw.end
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; use v[2:3]
+; GFX90A-NEXT:    ; use v[0:1]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -16131,33 +16123,33 @@ define void @flat_atomic_umax_i64_saddr_ret_av_av(ptr inreg %ptr) #0 {
 ; GFX90A-NEXT:    s_cselect_b64 s[6:7], -1, 0
 ; GFX90A-NEXT:    s_andn2_b64 vcc, exec, s[6:7]
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def v[0:1]
+; GFX90A-NEXT:    ; def v[2:3]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_cbranch_vccz .LBB212_2
 ; GFX90A-NEXT:  ; %bb.1: ; %atomicrmw.global
-; GFX90A-NEXT:    v_pk_mov_b32 v[2:3], s[4:5], s[4:5] op_sel:[0,1]
-; GFX90A-NEXT:    flat_atomic_umax_x2 v[2:3], v[2:3], v[0:1] glc
+; GFX90A-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-NEXT:    flat_atomic_umax_x2 v[0:1], v[0:1], v[2:3] glc
 ; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-NEXT:    s_cbranch_execz .LBB212_3
 ; GFX90A-NEXT:    s_branch .LBB212_4
 ; GFX90A-NEXT:  .LBB212_2:
-; GFX90A-NEXT:    ; implicit-def: $vgpr2_vgpr3
+; GFX90A-NEXT:    ; implicit-def: $vgpr0_vgpr1
 ; GFX90A-NEXT:  .LBB212_3: ; %atomicrmw.private
 ; GFX90A-NEXT:    s_cmp_lg_u64 s[4:5], 0
 ; GFX90A-NEXT:    s_cselect_b32 s4, s4, -1
 ; GFX90A-NEXT:    v_mov_b32_e32 v4, s4
-; GFX90A-NEXT:    buffer_load_dword v2, v4, s[0:3], 0 offen
-; GFX90A-NEXT:    buffer_load_dword v3, v4, s[0:3], 0 offen offset:4
+; GFX90A-NEXT:    buffer_load_dword v0, v4, s[0:3], 0 offen
+; GFX90A-NEXT:    buffer_load_dword v1, v4, s[0:3], 0 offen offset:4
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-NEXT:    v_cmp_gt_u64_e32 vcc, v[2:3], v[0:1]
-; GFX90A-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
-; GFX90A-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
-; GFX90A-NEXT:    buffer_store_dword v0, v4, s[0:3], 0 offen
-; GFX90A-NEXT:    buffer_store_dword v1, v4, s[0:3], 0 offen offset:4
+; GFX90A-NEXT:    v_cmp_gt_u64_e32 vcc, v[0:1], v[2:3]
+; GFX90A-NEXT:    v_cndmask_b32_e32 v2, v2, v0, vcc
+; GFX90A-NEXT:    v_cndmask_b32_e32 v3, v3, v1, vcc
+; GFX90A-NEXT:    buffer_store_dword v2, v4, s[0:3], 0 offen
+; GFX90A-NEXT:    buffer_store_dword v3, v4, s[0:3], 0 offen offset:4
 ; GFX90A-NEXT:  .LBB212_4: ; %atomicrmw.end
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; use v[2:3]
+; GFX90A-NEXT:    ; use v[0:1]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -16316,33 +16308,33 @@ define void @flat_atomic_umin_i64_saddr_ret_av_av(ptr inreg %ptr) #0 {
 ; GFX90A-NEXT:    s_cselect_b64 s[6:7], -1, 0
 ; GFX90A-NEXT:    s_andn2_b64 vcc, exec, s[6:7]
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def v[0:1]
+; GFX90A-NEXT:    ; def v[2:3]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_cbranch_vccz .LBB214_2
 ; GFX90A-NEXT:  ; %bb.1: ; %atomicrmw.global
-; GFX90A-NEXT:    v_pk_mov_b32 v[2:3], s[4:5], s[4:5] op_sel:[0,1]
-; GFX90A-NEXT:    flat_atomic_umin_x2 v[2:3], v[2:3], v[0:1] glc
+; GFX90A-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-NEXT:    flat_atomic_umin_x2 v[0:1], v[0:1], v[2:3] glc
 ; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-NEXT:    s_cbranch_execz .LBB214_3
 ; GFX90A-NEXT:    s_branch .LBB214_4
 ; GFX90A-NEXT:  .LBB214_2:
-; GFX90A-NEXT:    ; implicit-def: $vgpr2_vgpr3
+; GFX90A-NEXT:    ; implicit-def: $vgpr0_vgpr1
 ; GFX90A-NEXT:  .LBB214_3: ; %atomicrmw.private
 ; GFX90A-NEXT:    s_cmp_lg_u64 s[4:5], 0
 ; GFX90A-NEXT:    s_cselect_b32 s4, s4, -1
 ; GFX90A-NEXT:    v_mov_b32_e32 v4, s4
-; GFX90A-NEXT:    buffer_load_dword v2, v4, s[0:3], 0 offen
-; GFX90A-NEXT:    buffer_load_dword v3, v4, s[0:3], 0 offen offset:4
+; GFX90A-NEXT:    buffer_load_dword v0, v4, s[0:3], 0 offen
+; GFX90A-NEXT:    buffer_load_dword v1, v4, s[0:3], 0 offen offset:4
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-NEXT:    v_cmp_le_u64_e32 vcc, v[2:3], v[0:1]
-; GFX90A-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
-; GFX90A-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
-; GFX90A-NEXT:    buffer_store_dword v0, v4, s[0:3], 0 offen
-; GFX90A-NEXT:    buffer_store_dword v1, v4, s[0:3], 0 offen offset:4
+; GFX90A-NEXT:    v_cmp_le_u64_e32 vcc, v[0:1], v[2:3]
+; GFX90A-NEXT:    v_cndmask_b32_e32 v2, v2, v0, vcc
+; GFX90A-NEXT:    v_cndmask_b32_e32 v3, v3, v1, vcc
+; GFX90A-NEXT:    buffer_store_dword v2, v4, s[0:3], 0 offen
+; GFX90A-NEXT:    buffer_store_dword v3, v4, s[0:3], 0 offen offset:4
 ; GFX90A-NEXT:  .LBB214_4: ; %atomicrmw.end
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; use v[2:3]
+; GFX90A-NEXT:    ; use v[0:1]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -16697,37 +16689,37 @@ define void @flat_atomic_udec_wrap_i64_saddr_ret_av_av(ptr inreg %ptr) #0 {
 ; GFX90A-NEXT:    s_cselect_b64 s[6:7], -1, 0
 ; GFX90A-NEXT:    s_andn2_b64 vcc, exec, s[6:7]
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def v[0:1]
+; GFX90A-NEXT:    ; def v[2:3]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_cbranch_vccz .LBB218_2
 ; GFX90A-NEXT:  ; %bb.1: ; %atomicrmw.global
-; GFX90A-NEXT:    v_pk_mov_b32 v[2:3], s[4:5], s[4:5] op_sel:[0,1]
-; GFX90A-NEXT:    flat_atomic_dec_x2 v[2:3], v[2:3], v[0:1] glc
+; GFX90A-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-NEXT:    flat_atomic_dec_x2 v[0:1], v[0:1], v[2:3] glc
 ; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-NEXT:    s_cbranch_execz .LBB218_3
 ; GFX90A-NEXT:    s_branch .LBB218_4
 ; GFX90A-NEXT:  .LBB218_2:
-; GFX90A-NEXT:    ; implicit-def: $vgpr2_vgpr3
+; GFX90A-NEXT:    ; implicit-def: $vgpr0_vgpr1
 ; GFX90A-NEXT:  .LBB218_3: ; %atomicrmw.private
 ; GFX90A-NEXT:    s_cmp_lg_u64 s[4:5], 0
 ; GFX90A-NEXT:    s_cselect_b32 s4, s4, -1
 ; GFX90A-NEXT:    v_mov_b32_e32 v4, s4
-; GFX90A-NEXT:    buffer_load_dword v2, v4, s[0:3], 0 offen
-; GFX90A-NEXT:    buffer_load_dword v3, v4, s[0:3], 0 offen offset:4
+; GFX90A-NEXT:    buffer_load_dword v0, v4, s[0:3], 0 offen
+; GFX90A-NEXT:    buffer_load_dword v1, v4, s[0:3], 0 offen offset:4
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-NEXT:    v_add_co_u32_e32 v5, vcc, -1, v2
-; GFX90A-NEXT:    v_addc_co_u32_e32 v6, vcc, -1, v3, vcc
-; GFX90A-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[2:3]
-; GFX90A-NEXT:    v_cmp_gt_u64_e64 s[4:5], v[2:3], v[0:1]
+; GFX90A-NEXT:    v_add_co_u32_e32 v5, vcc, -1, v0
+; GFX90A-NEXT:    v_addc_co_u32_e32 v6, vcc, -1, v1, vcc
+; GFX90A-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[0:1]
+; GFX90A-NEXT:    v_cmp_gt_u64_e64 s[4:5], v[0:1], v[2:3]
 ; GFX90A-NEXT:    s_or_b64 vcc, vcc, s[4:5]
-; GFX90A-NEXT:    v_cndmask_b32_e32 v0, v5, v0, vcc
-; GFX90A-NEXT:    v_cndmask_b32_e32 v1, v6, v1, vcc
-; GFX90A-NEXT:    buffer_store_dword v0, v4, s[0:3], 0 offen
-; GFX90A-NEXT:    buffer_store_dword v1, v4, s[0:3], 0 offen offset:4
+; GFX90A-NEXT:    v_cndmask_b32_e32 v2, v5, v2, vcc
+; GFX90A-NEXT:    v_cndmask_b32_e32 v3, v6, v3, vcc
+; GFX90A-NEXT:    buffer_store_dword v2, v4, s[0:3], 0 offen
+; GFX90A-NEXT:    buffer_store_dword v3, v4, s[0:3], 0 offen offset:4
 ; GFX90A-NEXT:  .LBB218_4: ; %atomicrmw.end
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; use v[2:3]
+; GFX90A-NEXT:    ; use v[0:1]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -17318,49 +17310,51 @@ define void @flat_atomic_fadd_f32_saddr_ret_a_a(ptr inreg %ptr) #0 {
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_andn2_b64 vcc, exec, s[6:7]
 ; GFX90A-NEXT:    v_accvgpr_read_b32 v0, a0
-; GFX90A-NEXT:    s_cbranch_vccz .LBB223_3
+; GFX90A-NEXT:    s_cbranch_vccz .LBB223_4
 ; GFX90A-NEXT:  ; %bb.1: ; %atomicrmw.check.private
 ; GFX90A-NEXT:    s_mov_b64 s[6:7], src_private_base
 ; GFX90A-NEXT:    s_cmp_eq_u32 s5, s7
 ; GFX90A-NEXT:    s_cselect_b64 s[6:7], -1, 0
 ; GFX90A-NEXT:    s_andn2_b64 vcc, exec, s[6:7]
-; GFX90A-NEXT:    s_cbranch_vccz .LBB223_4
+; GFX90A-NEXT:    s_cbranch_vccz .LBB223_7
 ; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.global
 ; GFX90A-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX90A-NEXT:    global_atomic_add_f32 v1, v1, v0, s[4:5] glc
+; GFX90A-NEXT:    s_cbranch_execz .LBB223_8
+; GFX90A-NEXT:  ; %bb.3:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    v_accvgpr_write_b32 a0, v1
 ; GFX90A-NEXT:    s_cbranch_execz .LBB223_5
 ; GFX90A-NEXT:    s_branch .LBB223_6
-; GFX90A-NEXT:  .LBB223_3:
-; GFX90A-NEXT:    ; implicit-def: $agpr0
-; GFX90A-NEXT:    s_branch .LBB223_7
 ; GFX90A-NEXT:  .LBB223_4:
 ; GFX90A-NEXT:    ; implicit-def: $agpr0
-; GFX90A-NEXT:  .LBB223_5: ; %atomicrmw.private
-; GFX90A-NEXT:    s_cmp_lg_u64 s[4:5], 0
-; GFX90A-NEXT:    s_cselect_b32 s6, s4, -1
-; GFX90A-NEXT:    v_mov_b32_e32 v1, s6
-; GFX90A-NEXT:    buffer_load_dword v2, v1, s[0:3], 0 offen
-; GFX90A-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-NEXT:    v_add_f32_e32 v3, v2, v0
-; GFX90A-NEXT:    v_accvgpr_write_b32 a0, v2
-; GFX90A-NEXT:    buffer_store_dword v3, v1, s[0:3], 0 offen
-; GFX90A-NEXT:  .LBB223_6: ; %Flow1
-; GFX90A-NEXT:    s_cbranch_execnz .LBB223_8
-; GFX90A-NEXT:  .LBB223_7: ; %atomicrmw.shared
+; GFX90A-NEXT:  .LBB223_5: ; %atomicrmw.shared
 ; GFX90A-NEXT:    s_cmp_lg_u64 s[4:5], 0
 ; GFX90A-NEXT:    s_cselect_b32 s4, s4, -1
 ; GFX90A-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX90A-NEXT:    ds_add_rtn_f32 v0, v1, v0
 ; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-NEXT:    v_accvgpr_write_b32 a0, v0
-; GFX90A-NEXT:  .LBB223_8: ; %atomicrmw.end
+; GFX90A-NEXT:  .LBB223_6: ; %atomicrmw.end
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use a0
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+; GFX90A-NEXT:  .LBB223_7:
+; GFX90A-NEXT:    ; implicit-def: $vgpr1
+; GFX90A-NEXT:  .LBB223_8: ; %atomicrmw.private
+; GFX90A-NEXT:    s_cmp_lg_u64 s[4:5], 0
+; GFX90A-NEXT:    s_cselect_b32 s6, s4, -1
+; GFX90A-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NEXT:    buffer_load_dword v2, v1, s[0:3], 0 offen
+; GFX90A-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NEXT:    v_add_f32_e32 v3, v2, v0
+; GFX90A-NEXT:    buffer_store_dword v3, v1, s[0:3], 0 offen
+; GFX90A-NEXT:    v_accvgpr_write_b32 a0, v2
+; GFX90A-NEXT:    s_cbranch_execz .LBB223_5
+; GFX90A-NEXT:    s_branch .LBB223_6
 ;
 ; GFX950-LABEL: flat_atomic_fadd_f32_saddr_ret_a_a:
 ; GFX950:       ; %bb.0:
@@ -18168,16 +18162,13 @@ define void @flat_atomic_fadd_f64_saddr_ret_a_a(ptr inreg %ptr) #0 {
 ; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.global
 ; GFX90A-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX90A-NEXT:    global_atomic_add_f64 v[2:3], v2, v[0:1], s[4:5] glc
-; GFX90A-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-NEXT:    v_accvgpr_write_b32 a0, v2
-; GFX90A-NEXT:    v_accvgpr_write_b32 a1, v3
 ; GFX90A-NEXT:    s_cbranch_execz .LBB235_5
 ; GFX90A-NEXT:    s_branch .LBB235_6
 ; GFX90A-NEXT:  .LBB235_3:
 ; GFX90A-NEXT:    ; implicit-def: $agpr0_agpr1
 ; GFX90A-NEXT:    s_branch .LBB235_7
 ; GFX90A-NEXT:  .LBB235_4:
-; GFX90A-NEXT:    ; implicit-def: $agpr0_agpr1
+; GFX90A-NEXT:    ; implicit-def: $vgpr2_vgpr3
 ; GFX90A-NEXT:  .LBB235_5: ; %atomicrmw.private
 ; GFX90A-NEXT:    s_cmp_lg_u64 s[4:5], 0
 ; GFX90A-NEXT:    s_cselect_b32 s6, s4, -1
@@ -18185,12 +18176,13 @@ define void @flat_atomic_fadd_f64_saddr_ret_a_a(ptr inreg %ptr) #0 {
 ; GFX90A-NEXT:    buffer_load_dword v2, v6, s[0:3], 0 offen
 ; GFX90A-NEXT:    buffer_load_dword v3, v6, s[0:3], 0 offen offset:4
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-NEXT:    v_accvgpr_write_b32 a0, v2
 ; GFX90A-NEXT:    v_add_f64 v[4:5], v[2:3], v[0:1]
-; GFX90A-NEXT:    v_accvgpr_write_b32 a1, v3
 ; GFX90A-NEXT:    buffer_store_dword v4, v6, s[0:3], 0 offen
 ; GFX90A-NEXT:    buffer_store_dword v5, v6, s[0:3], 0 offen offset:4
 ; GFX90A-NEXT:  .LBB235_6: ; %Flow1
+; GFX90A-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NEXT:    v_accvgpr_write_b32 a0, v2
+; GFX90A-NEXT:    v_accvgpr_write_b32 a1, v3
 ; GFX90A-NEXT:    s_cbranch_execnz .LBB235_8
 ; GFX90A-NEXT:  .LBB235_7: ; %atomicrmw.shared
 ; GFX90A-NEXT:    s_cmp_lg_u64 s[4:5], 0
@@ -18204,7 +18196,6 @@ define void @flat_atomic_fadd_f64_saddr_ret_a_a(ptr inreg %ptr) #0 {
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use a[0:1]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX950-LABEL: flat_atomic_fadd_f64_saddr_ret_a_a:
@@ -18231,26 +18222,24 @@ define void @flat_atomic_fadd_f64_saddr_ret_a_a(ptr inreg %ptr) #0 {
 ; GFX950-NEXT:  ; %bb.2: ; %atomicrmw.global
 ; GFX950-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX950-NEXT:    global_atomic_add_f64 v[2:3], v2, v[0:1], s[0:1] sc0
-; GFX950-NEXT:    s_waitcnt vmcnt(0)
-; GFX950-NEXT:    v_accvgpr_write_b32 a0, v2
-; GFX950-NEXT:    v_accvgpr_write_b32 a1, v3
 ; GFX950-NEXT:    s_cbranch_execz .LBB235_5
 ; GFX950-NEXT:    s_branch .LBB235_6
 ; GFX950-NEXT:  .LBB235_3:
 ; GFX950-NEXT:    ; implicit-def: $agpr0_agpr1
 ; GFX950-NEXT:    s_branch .LBB235_7
 ; GFX950-NEXT:  .LBB235_4:
-; GFX950-NEXT:    ; implicit-def: $agpr0_agpr1
+; GFX950-NEXT:    ; implicit-def: $vgpr2_vgpr3
 ; GFX950-NEXT:  .LBB235_5: ; %atomicrmw.private
 ; GFX950-NEXT:    s_cmp_lg_u64 s[0:1], 0
 ; GFX950-NEXT:    s_cselect_b32 s2, s0, -1
 ; GFX950-NEXT:    scratch_load_dwordx2 v[2:3], off, s2
 ; GFX950-NEXT:    s_waitcnt vmcnt(0)
-; GFX950-NEXT:    v_accvgpr_write_b32 a0, v2
 ; GFX950-NEXT:    v_add_f64 v[4:5], v[2:3], v[0:1]
-; GFX950-NEXT:    v_accvgpr_write_b32 a1, v3
 ; GFX950-NEXT:    scratch_store_dwordx2 off, v[4:5], s2
 ; GFX950-NEXT:  .LBB235_6: ; %Flow1
+; GFX950-NEXT:    s_waitcnt vmcnt(0)
+; GFX950-NEXT:    v_accvgpr_write_b32 a0, v2
+; GFX950-NEXT:    v_accvgpr_write_b32 a1, v3
 ; GFX950-NEXT:    s_cbranch_execnz .LBB235_8
 ; GFX950-NEXT:  .LBB235_7: ; %atomicrmw.shared
 ; GFX950-NEXT:    s_cmp_lg_u64 s[0:1], 0
@@ -18264,7 +18253,6 @@ define void @flat_atomic_fadd_f64_saddr_ret_a_a(ptr inreg %ptr) #0 {
 ; GFX950-NEXT:    ;;#ASMSTART
 ; GFX950-NEXT:    ; use a[0:1]
 ; GFX950-NEXT:    ;;#ASMEND
-; GFX950-NEXT:    s_waitcnt vmcnt(0)
 ; GFX950-NEXT:    s_setpc_b64 s[30:31]
   %gep.0 = getelementptr inbounds [512 x double], ptr %ptr, i64 0, i64 10
   %data = call double asm "; def $0", "=a"()
@@ -18760,30 +18748,30 @@ define void @flat_atomic_fmax_f64_saddr_ret_av_av(ptr inreg %ptr) #0 {
 ; GFX950-NEXT:    s_cselect_b64 s[2:3], -1, 0
 ; GFX950-NEXT:    s_andn2_b64 vcc, exec, s[2:3]
 ; GFX950-NEXT:    ;;#ASMSTART
-; GFX950-NEXT:    ; def v[0:1]
+; GFX950-NEXT:    ; def v[2:3]
 ; GFX950-NEXT:    ;;#ASMEND
 ; GFX950-NEXT:    s_cbranch_vccz .LBB240_2
 ; GFX950-NEXT:  ; %bb.1: ; %atomicrmw.global
-; GFX950-NEXT:    v_mov_b64_e32 v[2:3], s[0:1]
-; GFX950-NEXT:    flat_atomic_max_f64 v[2:3], v[2:3], v[0:1] sc0
+; GFX950-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
+; GFX950-NEXT:    flat_atomic_max_f64 v[0:1], v[0:1], v[2:3] sc0
 ; GFX950-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX950-NEXT:    s_cbranch_execz .LBB240_3
 ; GFX950-NEXT:    s_branch .LBB240_4
 ; GFX950-NEXT:  .LBB240_2:
-; GFX950-NEXT:    ; implicit-def: $vgpr2_vgpr3
+; GFX950-NEXT:    ; implicit-def: $vgpr0_vgpr1
 ; GFX950-NEXT:  .LBB240_3: ; %atomicrmw.private
 ; GFX950-NEXT:    s_cmp_lg_u64 s[0:1], 0
 ; GFX950-NEXT:    s_cselect_b32 s0, s0, -1
-; GFX950-NEXT:    scratch_load_dwordx2 v[2:3], off, s0
-; GFX950-NEXT:    v_max_f64 v[0:1], v[0:1], v[0:1]
+; GFX950-NEXT:    scratch_load_dwordx2 v[0:1], off, s0
+; GFX950-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
 ; GFX950-NEXT:    s_waitcnt vmcnt(0)
-; GFX950-NEXT:    v_max_f64 v[4:5], v[2:3], v[2:3]
-; GFX950-NEXT:    v_max_f64 v[0:1], v[4:5], v[0:1]
-; GFX950-NEXT:    scratch_store_dwordx2 off, v[0:1], s0
+; GFX950-NEXT:    v_max_f64 v[4:5], v[0:1], v[0:1]
+; GFX950-NEXT:    v_max_f64 v[2:3], v[4:5], v[2:3]
+; GFX950-NEXT:    scratch_store_dwordx2 off, v[2:3], s0
 ; GFX950-NEXT:  .LBB240_4: ; %atomicrmw.end
 ; GFX950-NEXT:    s_waitcnt vmcnt(0)
 ; GFX950-NEXT:    ;;#ASMSTART
-; GFX950-NEXT:    ; use v[2:3]
+; GFX950-NEXT:    ; use v[0:1]
 ; GFX950-NEXT:    ;;#ASMEND
 ; GFX950-NEXT:    s_setpc_b64 s[30:31]
   %gep.0 = getelementptr inbounds [512 x double], ptr %ptr, i64 0, i64 10
@@ -18942,30 +18930,30 @@ define void @flat_atomic_fmin_f64_saddr_ret_av_av(ptr inreg %ptr) #0 {
 ; GFX950-NEXT:    s_cselect_b64 s[2:3], -1, 0
 ; GFX950-NEXT:    s_andn2_b64 vcc, exec, s[2:3]
 ; GFX950-NEXT:    ;;#ASMSTART
-; GFX950-NEXT:    ; def v[0:1]
+; GFX950-NEXT:    ; def v[2:3]
 ; GFX950-NEXT:    ;;#ASMEND
 ; GFX950-NEXT:    s_cbranch_vccz .LBB242_2
 ; GFX950-NEXT:  ; %bb.1: ; %atomicrmw.global
-; GFX950-NEXT:    v_mov_b64_e32 v[2:3], s[0:1]
-; GFX950-NEXT:    flat_atomic_min_f64 v[2:3], v[2:3], v[0:1] sc0
+; GFX950-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
+; GFX950-NEXT:    flat_atomic_min_f64 v[0:1], v[0:1], v[2:3] sc0
 ; GFX950-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX950-NEXT:    s_cbranch_execz .LBB242_3
 ; GFX950-NEXT:    s_branch .LBB242_4
 ; GFX950-NEXT:  .LBB242_2:
-; GFX950-NEXT:    ; implicit-def: $vgpr2_vgpr3
+; GFX950-NEXT:    ; implicit-def: $vgpr0_vgpr1
 ; GFX950-NEXT:  .LBB242_3: ; %atomicrmw.private
 ; GFX950-NEXT:    s_cmp_lg_u64 s[0:1], 0
 ; GFX950-NEXT:    s_cselect_b32 s0, s0, -1
-; GFX950-NEXT:    scratch_load_dwordx2 v[2:3], off, s0
-; GFX950-NEXT:    v_max_f64 v[0:1], v[0:1], v[0:1]
+; GFX950-NEXT:    scratch_load_dwordx2 v[0:1], off, s0
+; GFX950-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
 ; GFX950-NEXT:    s_waitcnt vmcnt(0)
-; GFX950-NEXT:    v_max_f64 v[4:5], v[2:3], v[2:3]
-; GFX950-NEXT:    v_min_f64 v[0:1], v[4:5], v[0:1]
-; GFX950-NEXT:    scratch_store_dwordx2 off, v[0:1], s0
+; GFX950-NEXT:    v_max_f64 v[4:5], v[0:1], v[0:1]
+; GFX950-NEXT:    v_min_f64 v[2:3], v[4:5], v[2:3]
+; GFX950-NEXT:    scratch_store_dwordx2 off, v[2:3], s0
 ; GFX950-NEXT:  .LBB242_4: ; %atomicrmw.end
 ; GFX950-NEXT:    s_waitcnt vmcnt(0)
 ; GFX950-NEXT:    ;;#ASMSTART
-; GFX950-NEXT:    ; use v[2:3]
+; GFX950-NEXT:    ; use v[0:1]
 ; GFX950-NEXT:    ;;#ASMEND
 ; GFX950-NEXT:    s_setpc_b64 s[30:31]
   %gep.0 = getelementptr inbounds [512 x double], ptr %ptr, i64 0, i64 10

diff  --git a/llvm/test/CodeGen/AMDGPU/av-split-dead-valno-crash.ll b/llvm/test/CodeGen/AMDGPU/av-split-dead-valno-crash.ll
index 42f76c4a10d2a..4bc6220b4d9a0 100644
--- a/llvm/test/CodeGen/AMDGPU/av-split-dead-valno-crash.ll
+++ b/llvm/test/CodeGen/AMDGPU/av-split-dead-valno-crash.ll
@@ -48,16 +48,17 @@ define amdgpu_kernel void @vgpr_mfma_pass_av_split_crash(double %arg1, i1 %arg2,
 ; CHECK-NEXT:  .LBB0_1: ; %Flow9
 ; CHECK-NEXT:    ; in Loop: Header=BB0_2 Depth=1
 ; CHECK-NEXT:    s_andn2_b64 vcc, exec, s[24:25]
-; CHECK-NEXT:    s_cbranch_vccz .LBB0_17
+; CHECK-NEXT:    v_mov_b64_e32 v[30:31], v[24:25]
+; CHECK-NEXT:    s_cbranch_vccz .LBB0_18
 ; CHECK-NEXT:  .LBB0_2: ; %._crit_edge1942.i.i.i3548
 ; CHECK-NEXT:    ; =>This Loop Header: Depth=1
-; CHECK-NEXT:    ; Child Loop BB0_6 Depth 2
+; CHECK-NEXT:    ; Child Loop BB0_7 Depth 2
 ; CHECK-NEXT:    s_and_b64 vcc, exec, s[0:1]
-; CHECK-NEXT:    s_cbranch_vccnz .LBB0_9
+; CHECK-NEXT:    s_cbranch_vccnz .LBB0_11
 ; CHECK-NEXT:  ; %bb.3: ; %.preheader1868.i.i.i3244
 ; CHECK-NEXT:    ; in Loop: Header=BB0_2 Depth=1
 ; CHECK-NEXT:    s_mov_b64 vcc, s[4:5]
-; CHECK-NEXT:    s_cbranch_vccz .LBB0_10
+; CHECK-NEXT:    s_cbranch_vccz .LBB0_12
 ; CHECK-NEXT:  ; %bb.4: ; %.preheader1855.i.i.i3329.preheader
 ; CHECK-NEXT:    ; in Loop: Header=BB0_2 Depth=1
 ; CHECK-NEXT:    v_mov_b64_e32 v[24:25], s[14:15]
@@ -85,49 +86,54 @@ define amdgpu_kernel void @vgpr_mfma_pass_av_split_crash(double %arg1, i1 %arg2,
 ; CHECK-NEXT:    v_fmac_f64_e32 v[26:27], 0, v[28:29]
 ; CHECK-NEXT:    v_mov_b64_e32 v[28:29], v[18:19]
 ; CHECK-NEXT:    v_fmac_f64_e32 v[28:29], 0, v[26:27]
-; CHECK-NEXT:    s_branch .LBB0_6
-; CHECK-NEXT:  .LBB0_5: ; %Flow
-; CHECK-NEXT:    ; in Loop: Header=BB0_6 Depth=2
+; CHECK-NEXT:    s_branch .LBB0_7
+; CHECK-NEXT:  .LBB0_5: ; in Loop: Header=BB0_7 Depth=2
+; CHECK-NEXT:    s_mov_b64 s[24:25], -1
+; CHECK-NEXT:    ; implicit-def: $agpr0_agpr1
+; CHECK-NEXT:    s_mov_b64 s[8:9], -1
+; CHECK-NEXT:  .LBB0_6: ; %Flow
+; CHECK-NEXT:    ; in Loop: Header=BB0_7 Depth=2
 ; CHECK-NEXT:    s_and_b64 vcc, exec, s[8:9]
-; CHECK-NEXT:    s_cbranch_vccnz .LBB0_11
-; CHECK-NEXT:  .LBB0_6: ; %.preheader1855.i.i.i3329
+; CHECK-NEXT:    s_cbranch_vccnz .LBB0_13
+; CHECK-NEXT:  .LBB0_7: ; %.preheader1855.i.i.i3329
 ; CHECK-NEXT:    ; Parent Loop BB0_2 Depth=1
 ; CHECK-NEXT:    ; => This Inner Loop Header: Depth=2
 ; CHECK-NEXT:    v_accvgpr_read_b32 v27, a1
 ; CHECK-NEXT:    v_accvgpr_read_b32 v26, a0
-; CHECK-NEXT:    s_mov_b64 s[24:25], -1
-; CHECK-NEXT:    s_mov_b64 s[8:9], -1
 ; CHECK-NEXT:    s_mov_b64 vcc, s[2:3]
-; CHECK-NEXT:    ; implicit-def: $agpr0_agpr1
 ; CHECK-NEXT:    s_cbranch_vccz .LBB0_5
-; CHECK-NEXT:  ; %bb.7: ; %.lr.ph2070.i.i.i3291
-; CHECK-NEXT:    ; in Loop: Header=BB0_6 Depth=2
-; CHECK-NEXT:    v_accvgpr_write_b32 a0, v30
-; CHECK-NEXT:    v_accvgpr_write_b32 a1, v31
-; CHECK-NEXT:    s_mov_b64 s[8:9], s[18:19]
+; CHECK-NEXT:  ; %bb.8: ; %.lr.ph2070.i.i.i3291
+; CHECK-NEXT:    ; in Loop: Header=BB0_7 Depth=2
 ; CHECK-NEXT:    s_mov_b64 vcc, s[6:7]
-; CHECK-NEXT:    s_cbranch_vccz .LBB0_5
-; CHECK-NEXT:  ; %bb.8: ; %.preheader1856.preheader.i.i.i3325
-; CHECK-NEXT:    ; in Loop: Header=BB0_6 Depth=2
+; CHECK-NEXT:    s_cbranch_vccz .LBB0_10
+; CHECK-NEXT:  ; %bb.9: ; %.preheader1856.preheader.i.i.i3325
+; CHECK-NEXT:    ; in Loop: Header=BB0_7 Depth=2
 ; CHECK-NEXT:    v_accvgpr_write_b32 a0, v28
 ; CHECK-NEXT:    s_mov_b64 s[24:25], 0
 ; CHECK-NEXT:    v_accvgpr_write_b32 a1, v29
 ; CHECK-NEXT:    s_mov_b64 s[8:9], 0
-; CHECK-NEXT:    s_branch .LBB0_5
-; CHECK-NEXT:  .LBB0_9: ; in Loop: Header=BB0_2 Depth=1
+; CHECK-NEXT:    s_branch .LBB0_6
+; CHECK-NEXT:  .LBB0_10: ; in Loop: Header=BB0_7 Depth=2
+; CHECK-NEXT:    v_accvgpr_write_b32 a0, v30
+; CHECK-NEXT:    s_mov_b64 s[24:25], -1
+; CHECK-NEXT:    v_accvgpr_write_b32 a1, v31
+; CHECK-NEXT:    s_mov_b64 s[8:9], s[18:19]
+; CHECK-NEXT:    s_branch .LBB0_6
+; CHECK-NEXT:  .LBB0_11: ; in Loop: Header=BB0_2 Depth=1
+; CHECK-NEXT:    v_mov_b64_e32 v[24:25], s[10:11]
 ; CHECK-NEXT:    s_mov_b64 s[22:23], 0
-; CHECK-NEXT:    v_mov_b64_e32 v[30:31], s[10:11]
 ; CHECK-NEXT:    s_mov_b64 s[8:9], s[20:21]
-; CHECK-NEXT:    s_branch .LBB0_15
-; CHECK-NEXT:  .LBB0_10: ; in Loop: Header=BB0_2 Depth=1
+; CHECK-NEXT:    s_branch .LBB0_16
+; CHECK-NEXT:  .LBB0_12: ; in Loop: Header=BB0_2 Depth=1
 ; CHECK-NEXT:    s_mov_b64 s[8:9], -1
 ; CHECK-NEXT:    v_mov_b64_e32 v[22:23], 0
-; CHECK-NEXT:    s_branch .LBB0_15
-; CHECK-NEXT:  .LBB0_11: ; %loop.exit.guard
+; CHECK-NEXT:    v_mov_b64_e32 v[24:25], v[30:31]
+; CHECK-NEXT:    s_branch .LBB0_16
+; CHECK-NEXT:  .LBB0_13: ; %loop.exit.guard
 ; CHECK-NEXT:    ; in Loop: Header=BB0_2 Depth=1
 ; CHECK-NEXT:    s_and_b64 vcc, exec, s[24:25]
-; CHECK-NEXT:    s_cbranch_vccz .LBB0_13
-; CHECK-NEXT:  ; %bb.12: ; %._crit_edge2105.i.i.i2330.loopexit
+; CHECK-NEXT:    s_cbranch_vccz .LBB0_15
+; CHECK-NEXT:  ; %bb.14: ; %._crit_edge2105.i.i.i2330.loopexit
 ; CHECK-NEXT:    ; in Loop: Header=BB0_2 Depth=1
 ; CHECK-NEXT:    v_cmp_nlg_f64_e64 s[8:9], 0, v[26:27]
 ; CHECK-NEXT:    v_cndmask_b32_e64 v23, v23, 0, s[16:17]
@@ -139,24 +145,21 @@ define amdgpu_kernel void @vgpr_mfma_pass_av_split_crash(double %arg1, i1 %arg2,
 ; CHECK-NEXT:    s_cselect_b32 s23, s23, 0
 ; CHECK-NEXT:    s_cselect_b32 s22, s22, 0
 ; CHECK-NEXT:    s_mov_b64 s[8:9], -1
-; CHECK-NEXT:    s_branch .LBB0_14
-; CHECK-NEXT:  .LBB0_13: ; in Loop: Header=BB0_2 Depth=1
+; CHECK-NEXT:    s_branch .LBB0_16
+; CHECK-NEXT:  .LBB0_15: ; in Loop: Header=BB0_2 Depth=1
 ; CHECK-NEXT:    s_mov_b64 s[8:9], 0
 ; CHECK-NEXT:    v_mov_b64_e32 v[22:23], 0
-; CHECK-NEXT:  .LBB0_14: ; %Flow6
-; CHECK-NEXT:    ; in Loop: Header=BB0_2 Depth=1
-; CHECK-NEXT:    v_mov_b64_e32 v[30:31], v[24:25]
-; CHECK-NEXT:  .LBB0_15: ; %Flow6
+; CHECK-NEXT:  .LBB0_16: ; %Flow6
 ; CHECK-NEXT:    ; in Loop: Header=BB0_2 Depth=1
 ; CHECK-NEXT:    s_mov_b64 s[24:25], -1
 ; CHECK-NEXT:    s_and_b64 vcc, exec, s[8:9]
 ; CHECK-NEXT:    s_cbranch_vccz .LBB0_1
-; CHECK-NEXT:  ; %bb.16: ; %._crit_edge2105.i.i.i2330
+; CHECK-NEXT:  ; %bb.17: ; %._crit_edge2105.i.i.i2330
 ; CHECK-NEXT:    ; in Loop: Header=BB0_2 Depth=1
 ; CHECK-NEXT:    s_mov_b64 s[24:25], 0
 ; CHECK-NEXT:    global_store_dwordx2 v20, v[20:21], s[12:13]
 ; CHECK-NEXT:    s_branch .LBB0_1
-; CHECK-NEXT:  .LBB0_17: ; %DummyReturnBlock
+; CHECK-NEXT:  .LBB0_18: ; %DummyReturnBlock
 ; CHECK-NEXT:    s_endpgm
 entry:
   br label %._crit_edge1942.i.i.i3548

diff  --git a/llvm/test/CodeGen/AMDGPU/branch-folding-implicit-def-subreg.ll b/llvm/test/CodeGen/AMDGPU/branch-folding-implicit-def-subreg.ll
index 36924175956cb..905c8e36dd692 100644
--- a/llvm/test/CodeGen/AMDGPU/branch-folding-implicit-def-subreg.ll
+++ b/llvm/test/CodeGen/AMDGPU/branch-folding-implicit-def-subreg.ll
@@ -467,7 +467,6 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64
   ; GFX90A-NEXT:   renamable $sgpr58_sgpr59 = S_MOV_B64 0
   ; GFX90A-NEXT:   renamable $vgpr57, dead renamable $vcc = V_ADDC_U32_e64 0, $vgpr41, killed $vcc, 0, implicit $exec
   ; GFX90A-NEXT:   renamable $vcc = V_CMP_EQ_U16_e64 0, killed $vgpr0, implicit $exec
-  ; GFX90A-NEXT:   renamable $sgpr68_sgpr69 = S_MOV_B64 0
   ; GFX90A-NEXT:   renamable $vgpr12_vgpr13 = IMPLICIT_DEF
   ; GFX90A-NEXT:   renamable $vgpr10_vgpr11 = IMPLICIT_DEF
   ; GFX90A-NEXT:   renamable $vgpr8_vgpr9 = IMPLICIT_DEF
@@ -489,12 +488,12 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT: bb.40.Flow23:
   ; GFX90A-NEXT:   successors: %bb.38(0x80000000)
-  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr7, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19:0x0000000000000003, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr54_sgpr55, $sgpr58_sgpr59, $sgpr60_sgpr61, $sgpr62_sgpr63, $sgpr64_sgpr65, $sgpr68_sgpr69, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x0000000000000003, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x0000000000000003, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr18_vgpr19:0x0000000000000003, $vgpr20_vgpr21:0x000000000000000F, $vgpr22_vgpr23:0x0000000000000003, $vgpr24_vgpr25:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
+  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr7, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19:0x0000000000000003, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr58_sgpr59, $sgpr60_sgpr61, $sgpr62_sgpr63, $sgpr64_sgpr65, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x0000000000000003, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x0000000000000003, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr18_vgpr19:0x0000000000000003, $vgpr20_vgpr21:0x000000000000000F, $vgpr22_vgpr23:0x0000000000000003, $vgpr24_vgpr25:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT:   $exec = S_OR_B64 $exec, killed renamable $sgpr40_sgpr41, implicit-def $scc
   ; GFX90A-NEXT:   renamable $sgpr42_sgpr43 = S_XOR_B64 $exec, -1, implicit-def dead $scc
   ; GFX90A-NEXT:   renamable $sgpr66_sgpr67 = S_AND_B64 killed renamable $sgpr44_sgpr45, $exec, implicit-def dead $scc
-  ; GFX90A-NEXT:   renamable $sgpr52_sgpr53 = S_AND_B64 killed renamable $sgpr68_sgpr69, $exec, implicit-def dead $scc
+  ; GFX90A-NEXT:   renamable $sgpr52_sgpr53 = S_AND_B64 killed renamable $sgpr52_sgpr53, $exec, implicit-def dead $scc
   ; GFX90A-NEXT:   renamable $sgpr54_sgpr55 = S_AND_B64 killed renamable $sgpr54_sgpr55, $exec, implicit-def dead $scc
   ; GFX90A-NEXT:   renamable $sgpr64_sgpr65 = S_AND_B64 killed renamable $sgpr64_sgpr65, $exec, implicit-def dead $scc
   ; GFX90A-NEXT:   renamable $sgpr62_sgpr63 = S_AND_B64 killed renamable $sgpr62_sgpr63, $exec, implicit-def dead $scc
@@ -509,7 +508,7 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT: bb.41.bb41:
   ; GFX90A-NEXT:   successors: %bb.46(0x40000000), %bb.42(0x40000000)
-  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr56_sgpr57:0x000000000000000F, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x0000000000000003, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr48_sgpr49, $sgpr52_sgpr53, $sgpr58_sgpr59, $sgpr60_sgpr61, $sgpr62_sgpr63, $sgpr64_sgpr65, $sgpr54_sgpr55, $sgpr66_sgpr67, $sgpr68_sgpr69
+  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr56_sgpr57:0x000000000000000F, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x0000000000000003, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr48_sgpr49, $sgpr66_sgpr67, $sgpr58_sgpr59, $sgpr60_sgpr61, $sgpr62_sgpr63, $sgpr64_sgpr65, $sgpr54_sgpr55
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT:   renamable $vgpr58 = V_ADD_CO_U32_e32 4096, $vgpr40, implicit-def $vcc, implicit $exec
   ; GFX90A-NEXT:   renamable $vgpr1, dead renamable $sgpr18_sgpr19 = V_ADDC_U32_e64 0, $vgpr41, $vcc, 0, implicit $exec
@@ -539,17 +538,17 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT: bb.42.Flow24:
   ; GFX90A-NEXT:   successors: %bb.40(0x80000000)
-  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr7, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19:0x0000000000000003, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr58_sgpr59, $sgpr60_sgpr61, $sgpr62_sgpr63, $sgpr64_sgpr65, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x0000000000000003, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x0000000000000003, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr18_vgpr19:0x0000000000000003, $vgpr20_vgpr21:0x000000000000000F, $vgpr22_vgpr23:0x0000000000000003, $vgpr24_vgpr25:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
+  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr7, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19:0x0000000000000003, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr54_sgpr55, $sgpr58_sgpr59, $sgpr60_sgpr61, $sgpr62_sgpr63, $sgpr64_sgpr65, $sgpr66_sgpr67, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x0000000000000003, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x0000000000000003, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr18_vgpr19:0x0000000000000003, $vgpr20_vgpr21:0x000000000000000F, $vgpr22_vgpr23:0x0000000000000003, $vgpr24_vgpr25:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT:   $exec = S_OR_B64 $exec, killed renamable $sgpr42_sgpr43, implicit-def $scc
   ; GFX90A-NEXT:   renamable $sgpr44_sgpr45 = S_XOR_B64 $exec, -1, implicit-def dead $scc
-  ; GFX90A-NEXT:   renamable $sgpr68_sgpr69 = S_AND_B64 killed renamable $sgpr46_sgpr47, $exec, implicit-def dead $scc
+  ; GFX90A-NEXT:   renamable $sgpr52_sgpr53 = S_AND_B64 killed renamable $sgpr46_sgpr47, $exec, implicit-def dead $scc
   ; GFX90A-NEXT:   renamable $sgpr54_sgpr55 = S_AND_B64 killed renamable $sgpr54_sgpr55, $exec, implicit-def dead $scc
   ; GFX90A-NEXT:   renamable $sgpr64_sgpr65 = S_AND_B64 killed renamable $sgpr64_sgpr65, $exec, implicit-def dead $scc
   ; GFX90A-NEXT:   renamable $sgpr62_sgpr63 = S_AND_B64 killed renamable $sgpr62_sgpr63, $exec, implicit-def dead $scc
   ; GFX90A-NEXT:   renamable $sgpr60_sgpr61 = S_AND_B64 killed renamable $sgpr60_sgpr61, $exec, implicit-def dead $scc
   ; GFX90A-NEXT:   renamable $sgpr58_sgpr59 = S_AND_B64 killed renamable $sgpr58_sgpr59, $exec, implicit-def dead $scc
-  ; GFX90A-NEXT:   renamable $sgpr46_sgpr47 = S_AND_B64 killed renamable $sgpr52_sgpr53, $exec, implicit-def dead $scc
+  ; GFX90A-NEXT:   renamable $sgpr46_sgpr47 = S_AND_B64 killed renamable $sgpr66_sgpr67, $exec, implicit-def dead $scc
   ; GFX90A-NEXT:   renamable $sgpr48_sgpr49 = S_AND_B64 killed renamable $sgpr48_sgpr49, $exec, implicit-def dead $scc
   ; GFX90A-NEXT:   renamable $sgpr42_sgpr43 = S_ANDN2_B64 renamable $sgpr36_sgpr37, $exec, implicit-def dead $scc
   ; GFX90A-NEXT:   renamable $sgpr50_sgpr51 = S_AND_B64 killed renamable $sgpr50_sgpr51, $exec, implicit-def dead $scc
@@ -561,8 +560,8 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64
   ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr56_sgpr57:0x000000000000000F, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x0000000000000003, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr46_sgpr47, $sgpr54_sgpr55, $sgpr60_sgpr61, $sgpr58_sgpr59, $sgpr48_sgpr49
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT:   S_BITCMP1_B32 renamable $sgpr20, 16, implicit-def $scc
-  ; GFX90A-NEXT:   renamable $sgpr64_sgpr65 = S_CSELECT_B64 -1, 0, implicit killed $scc
-  ; GFX90A-NEXT:   renamable $sgpr18_sgpr19 = S_XOR_B64 renamable $sgpr64_sgpr65, -1, implicit-def dead $scc
+  ; GFX90A-NEXT:   renamable $sgpr66_sgpr67 = S_CSELECT_B64 -1, 0, implicit killed $scc
+  ; GFX90A-NEXT:   renamable $sgpr18_sgpr19 = S_XOR_B64 renamable $sgpr66_sgpr67, -1, implicit-def dead $scc
   ; GFX90A-NEXT:   renamable $vgpr62 = V_ADD_CO_U32_e32 6144, $vgpr40, implicit-def $vcc, implicit $exec
   ; GFX90A-NEXT:   renamable $vgpr63, dead renamable $vcc = V_ADDC_U32_e64 0, $vgpr41, killed $vcc, 0, implicit $exec
   ; GFX90A-NEXT:   renamable $vcc = S_AND_B64 $exec, renamable $sgpr18_sgpr19, implicit-def dead $scc
@@ -606,7 +605,7 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT: bb.46.bb48:
   ; GFX90A-NEXT:   successors: %bb.43(0x40000000), %bb.47(0x40000000)
-  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr56_sgpr57:0x000000000000000F, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x0000000000000003, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr48_sgpr49, $sgpr66_sgpr67, $sgpr58_sgpr59, $sgpr68_sgpr69, $sgpr64_sgpr65, $sgpr46_sgpr47, $sgpr54_sgpr55, $sgpr60_sgpr61
+  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr56_sgpr57:0x000000000000000F, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x0000000000000003, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr48_sgpr49, $sgpr66_sgpr67, $sgpr58_sgpr59, $sgpr64_sgpr65, $sgpr46_sgpr47, $sgpr54_sgpr55, $sgpr60_sgpr61
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT:   renamable $vgpr60 = V_ADD_CO_U32_e32 5120, $vgpr40, implicit-def $vcc, implicit $exec
   ; GFX90A-NEXT:   renamable $sgpr18_sgpr19 = COPY $vcc
@@ -615,6 +614,7 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64
   ; GFX90A-NEXT:   renamable $vgpr0 = FLAT_LOAD_UBYTE killed renamable $vgpr0_vgpr1, 1024, 0, implicit $exec, implicit $flat_scr :: (load (s8) from %ir.i51)
   ; GFX90A-NEXT:   renamable $sgpr62_sgpr63 = S_MOV_B64 -1
   ; GFX90A-NEXT:   renamable $sgpr50_sgpr51 = COPY renamable $sgpr36_sgpr37
+  ; GFX90A-NEXT:   renamable $sgpr68_sgpr69 = S_MOV_B64 0
   ; GFX90A-NEXT:   renamable $sgpr70_sgpr71 = S_MOV_B64 0
   ; GFX90A-NEXT:   renamable $vgpr61, dead renamable $vcc = V_ADDC_U32_e64 0, $vgpr41, killed $sgpr18_sgpr19, 0, implicit $exec
   ; GFX90A-NEXT:   renamable $vcc = V_CMP_EQ_U16_e64 0, killed $vgpr0, implicit $exec
@@ -646,7 +646,7 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64
   ; GFX90A-NEXT:   renamable $sgpr62_sgpr63 = S_AND_B64 killed renamable $sgpr70_sgpr71, $exec, implicit-def dead $scc
   ; GFX90A-NEXT:   renamable $sgpr60_sgpr61 = S_AND_B64 killed renamable $sgpr68_sgpr69, $exec, implicit-def dead $scc
   ; GFX90A-NEXT:   renamable $sgpr58_sgpr59 = S_AND_B64 killed renamable $sgpr58_sgpr59, $exec, implicit-def dead $scc
-  ; GFX90A-NEXT:   renamable $sgpr52_sgpr53 = S_AND_B64 killed renamable $sgpr66_sgpr67, $exec, implicit-def dead $scc
+  ; GFX90A-NEXT:   renamable $sgpr66_sgpr67 = S_AND_B64 killed renamable $sgpr66_sgpr67, $exec, implicit-def dead $scc
   ; GFX90A-NEXT:   renamable $sgpr48_sgpr49 = S_AND_B64 killed renamable $sgpr48_sgpr49, $exec, implicit-def dead $scc
   ; GFX90A-NEXT:   renamable $sgpr44_sgpr45 = S_ANDN2_B64 renamable $sgpr36_sgpr37, $exec, implicit-def dead $scc
   ; GFX90A-NEXT:   renamable $sgpr50_sgpr51 = S_AND_B64 killed renamable $sgpr50_sgpr51, $exec, implicit-def dead $scc
@@ -655,7 +655,7 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT: bb.48.bb63:
   ; GFX90A-NEXT:   successors: %bb.50(0x40000000), %bb.49(0x40000000)
-  ; GFX90A-NEXT:   liveins: $vcc, $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr56_sgpr57:0x000000000000000F, $sgpr64_sgpr65, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x0000000000000003, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr54_sgpr55, $sgpr60_sgpr61, $sgpr58_sgpr59, $sgpr48_sgpr49
+  ; GFX90A-NEXT:   liveins: $vcc, $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr56_sgpr57:0x000000000000000F, $sgpr66_sgpr67, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x0000000000000003, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr54_sgpr55, $sgpr60_sgpr61, $sgpr58_sgpr59, $sgpr48_sgpr49
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT:   renamable $sgpr46_sgpr47 = S_MOV_B64 0
   ; GFX90A-NEXT:   S_CBRANCH_VCCNZ %bb.50, implicit $vcc
@@ -669,7 +669,7 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT: bb.50.bb68:
   ; GFX90A-NEXT:   successors: %bb.54(0x40000000), %bb.51(0x40000000)
-  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr56_sgpr57:0x000000000000000F, $sgpr64_sgpr65, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x0000000000000003, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr48_sgpr49, $sgpr54_sgpr55, $sgpr60_sgpr61, $sgpr58_sgpr59
+  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr56_sgpr57:0x000000000000000F, $sgpr66_sgpr67, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x0000000000000003, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr48_sgpr49, $sgpr54_sgpr55, $sgpr60_sgpr61, $sgpr58_sgpr59
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT:   renamable $vgpr0 = nuw nsw V_LSHLREV_B32_e32 3, $vgpr6, implicit $exec
   ; GFX90A-NEXT:   renamable $vgpr1 = AV_MOV_B32_IMM_PSEUDO 0, implicit $exec
@@ -698,7 +698,7 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT: bb.52.bb80:
   ; GFX90A-NEXT:   successors: %bb.59(0x40000000), %bb.53(0x40000000)
-  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr56_sgpr57:0x000000000000000F, $sgpr62_sgpr63, $sgpr64_sgpr65, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x0000000000000003, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x0000000000000003, $vgpr8_vgpr9:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
+  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr56_sgpr57:0x000000000000000F, $sgpr62_sgpr63, $sgpr66_sgpr67, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x0000000000000003, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x0000000000000003, $vgpr8_vgpr9:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT:   renamable $sgpr17 = S_BFE_U32 renamable $sgpr20, 65560, implicit-def dead $scc
   ; GFX90A-NEXT:   S_CMP_EQ_U32 killed renamable $sgpr17, 0, implicit-def $scc
@@ -712,7 +712,7 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT:   renamable $sgpr52_sgpr53 = S_MOV_B64 0
   ; GFX90A-NEXT:   renamable $sgpr54_sgpr55 = S_MOV_B64 -1
-  ; GFX90A-NEXT:   renamable $sgpr66_sgpr67 = COPY renamable $sgpr36_sgpr37
+  ; GFX90A-NEXT:   renamable $sgpr64_sgpr65 = COPY renamable $sgpr36_sgpr37
   ; GFX90A-NEXT:   renamable $vgpr12_vgpr13 = IMPLICIT_DEF
   ; GFX90A-NEXT:   renamable $vgpr7 = IMPLICIT_DEF
   ; GFX90A-NEXT:   renamable $vgpr14 = IMPLICIT_DEF
@@ -727,7 +727,7 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT: bb.54.bb73:
   ; GFX90A-NEXT:   successors: %bb.52(0x40000000), %bb.55(0x40000000)
-  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr56_sgpr57:0x000000000000000F, $sgpr64_sgpr65, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x0000000000000003, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr54_sgpr55, $sgpr60_sgpr61
+  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr56_sgpr57:0x000000000000000F, $sgpr66_sgpr67, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x0000000000000003, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr54_sgpr55, $sgpr60_sgpr61
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT:   renamable $vgpr3 = FLAT_LOAD_UBYTE renamable $vgpr0_vgpr1, 2048, 0, implicit $exec, implicit $flat_scr :: (load (s8) from %ir.i76)
   ; GFX90A-NEXT:   renamable $vgpr8 = V_ADD_CO_U32_e32 2048, $vgpr0, implicit-def $vcc, implicit $exec
@@ -759,9 +759,9 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT: bb.56.bb90:
   ; GFX90A-NEXT:   successors: %bb.60(0x80000000)
-  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr54_sgpr55, $sgpr56_sgpr57:0x000000000000000F, $sgpr62_sgpr63, $sgpr64_sgpr65, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x0000000000000003, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x0000000000000003, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
+  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr54_sgpr55, $sgpr56_sgpr57:0x000000000000000F, $sgpr62_sgpr63, $sgpr66_sgpr67, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x0000000000000003, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x0000000000000003, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
   ; GFX90A-NEXT: {{  $}}
-  ; GFX90A-NEXT:   renamable $vgpr30 = V_CNDMASK_B32_e64 0, 0, 0, 1, killed $sgpr64_sgpr65, implicit $exec
+  ; GFX90A-NEXT:   renamable $vgpr30 = V_CNDMASK_B32_e64 0, 0, 0, 1, killed $sgpr66_sgpr67, implicit $exec
   ; GFX90A-NEXT:   renamable $vgpr3 = AV_MOV_B32_IMM_PSEUDO 0, implicit $exec
   ; GFX90A-NEXT:   renamable $vgpr7 = COPY renamable $sgpr21, implicit $exec
   ; GFX90A-NEXT:   renamable $vgpr24_vgpr25 = DS_READ_B64_gfx9 killed renamable $vgpr7, 0, 0, implicit $exec :: (load (s64) from %ir.4, addrspace 3)
@@ -773,7 +773,7 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64
   ; GFX90A-NEXT:   renamable $vgpr7 = V_CNDMASK_B32_e64 0, 0, 0, 1, $sgpr12_sgpr13, implicit $exec
   ; GFX90A-NEXT:   renamable $vgpr14_vgpr15 = V_LSHRREV_B64_e64 1, $vgpr22_vgpr23, implicit $exec
   ; GFX90A-NEXT:   renamable $sgpr52_sgpr53 = S_XOR_B64 $exec, -1, implicit-def dead $scc
-  ; GFX90A-NEXT:   renamable $sgpr66_sgpr67 = S_OR_B64 renamable $sgpr36_sgpr37, $exec, implicit-def dead $scc
+  ; GFX90A-NEXT:   renamable $sgpr64_sgpr65 = S_OR_B64 renamable $sgpr36_sgpr37, $exec, implicit-def dead $scc
   ; GFX90A-NEXT:   renamable $vgpr16 = COPY renamable $vgpr22, implicit $exec
   ; GFX90A-NEXT:   S_BRANCH %bb.60
   ; GFX90A-NEXT: {{  $}}
@@ -833,14 +833,14 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT: bb.59.bb85:
   ; GFX90A-NEXT:   successors: %bb.56(0x40000000), %bb.60(0x40000000)
-  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr56_sgpr57:0x000000000000000F, $sgpr62_sgpr63, $sgpr64_sgpr65, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x0000000000000003, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x0000000000000003, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
+  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr56_sgpr57:0x000000000000000F, $sgpr62_sgpr63, $sgpr66_sgpr67, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x0000000000000003, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x0000000000000003, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT:   renamable $vgpr12 = V_OR_B32_e32 1, $vgpr10, implicit $exec
   ; GFX90A-NEXT:   renamable $vgpr13 = COPY renamable $vgpr11, implicit $exec
   ; GFX90A-NEXT:   renamable $vgpr3 = FLAT_LOAD_UBYTE renamable $vgpr12_vgpr13, 0, 0, implicit $exec, implicit $flat_scr :: (load (s8) from %ir.i86)
   ; GFX90A-NEXT:   renamable $sgpr52_sgpr53 = S_MOV_B64 -1
   ; GFX90A-NEXT:   renamable $vcc = V_CMP_EQ_U16_e64 0, killed $vgpr3, implicit $exec
-  ; GFX90A-NEXT:   renamable $sgpr66_sgpr67 = COPY renamable $sgpr36_sgpr37
+  ; GFX90A-NEXT:   renamable $sgpr64_sgpr65 = COPY renamable $sgpr36_sgpr37
   ; GFX90A-NEXT:   renamable $vgpr7 = IMPLICIT_DEF
   ; GFX90A-NEXT:   renamable $vgpr14 = IMPLICIT_DEF
   ; GFX90A-NEXT:   renamable $vgpr22 = IMPLICIT_DEF
@@ -855,20 +855,20 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT: bb.60.Flow31:
   ; GFX90A-NEXT:   successors: %bb.61(0x80000000)
-  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr7, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19:0x0000000000000003, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr62_sgpr63, $sgpr66_sgpr67, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x0000000000000003, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x0000000000000003, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr18_vgpr19:0x0000000000000003, $vgpr20_vgpr21:0x000000000000000F, $vgpr22_vgpr23:0x0000000000000003, $vgpr24_vgpr25:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
+  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr7, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19:0x0000000000000003, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr62_sgpr63, $sgpr64_sgpr65, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x0000000000000003, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x0000000000000003, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr18_vgpr19:0x0000000000000003, $vgpr20_vgpr21:0x000000000000000F, $vgpr22_vgpr23:0x0000000000000003, $vgpr24_vgpr25:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT:   $exec = S_OR_B64 $exec, killed renamable $sgpr54_sgpr55, implicit-def $scc
   ; GFX90A-NEXT:   renamable $sgpr54_sgpr55 = S_MOV_B64 0
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT: bb.61.Flow30:
   ; GFX90A-NEXT:   successors: %bb.55(0x80000000)
-  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr7, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19:0x0000000000000003, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr62_sgpr63, $sgpr66_sgpr67, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x0000000000000003, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x0000000000000003, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr18_vgpr19:0x0000000000000003, $vgpr20_vgpr21:0x000000000000000F, $vgpr22_vgpr23:0x0000000000000003, $vgpr24_vgpr25:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
+  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr7, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19:0x0000000000000003, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr62_sgpr63, $sgpr64_sgpr65, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x0000000000000003, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x0000000000000003, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr18_vgpr19:0x0000000000000003, $vgpr20_vgpr21:0x000000000000000F, $vgpr22_vgpr23:0x0000000000000003, $vgpr24_vgpr25:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT:   renamable $sgpr58_sgpr59 = S_XOR_B64 $exec, -1, implicit-def dead $scc
   ; GFX90A-NEXT:   renamable $sgpr60_sgpr61 = S_AND_B64 killed renamable $sgpr54_sgpr55, $exec, implicit-def dead $scc
   ; GFX90A-NEXT:   renamable $sgpr54_sgpr55 = S_AND_B64 killed renamable $sgpr52_sgpr53, $exec, implicit-def dead $scc
   ; GFX90A-NEXT:   renamable $sgpr52_sgpr53 = S_ANDN2_B64 renamable $sgpr36_sgpr37, $exec, implicit-def dead $scc
-  ; GFX90A-NEXT:   renamable $sgpr56_sgpr57 = S_AND_B64 killed renamable $sgpr66_sgpr67, $exec, implicit-def dead $scc
+  ; GFX90A-NEXT:   renamable $sgpr56_sgpr57 = S_AND_B64 killed renamable $sgpr64_sgpr65, $exec, implicit-def dead $scc
   ; GFX90A-NEXT:   renamable $sgpr52_sgpr53 = S_OR_B64 killed renamable $sgpr52_sgpr53, killed renamable $sgpr56_sgpr57, implicit-def dead $scc
   ; GFX90A-NEXT:   S_BRANCH %bb.55
   ; GFX90A-NEXT: {{  $}}

diff  --git a/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-phi-regression-av-classes.ll b/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-phi-regression-av-classes.ll
new file mode 100644
index 0000000000000..b07294c71f608
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-phi-regression-av-classes.ll
@@ -0,0 +1,37 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
+; RUN: llc -mtriple=amdgcn -mcpu=gfx950 < %s | FileCheck %s
+
+; Check that the copy from s[2:3] to v[0:1] occurs inside the loop, not after it.
+
+define i64 @test_temporal_divergence(i32 %arg) #0 {
+; CHECK-LABEL: test_temporal_divergence:
+; CHECK:       ; %bb.0: ; %entry
+; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    v_add_u32_e32 v2, 1, v0
+; CHECK-NEXT:    s_mov_b64 s[2:3], 0
+; CHECK-NEXT:    s_mov_b64 s[0:1], 0
+; CHECK-NEXT:  .LBB0_1: ; %loop
+; CHECK-NEXT:    ; =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    v_add_u32_e32 v2, -1, v2
+; CHECK-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
+; CHECK-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
+; CHECK-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
+; CHECK-NEXT:    s_mov_b64 s[2:3], 1
+; CHECK-NEXT:    s_andn2_b64 exec, exec, s[0:1]
+; CHECK-NEXT:    s_cbranch_execnz .LBB0_1
+; CHECK-NEXT:  ; %bb.2: ; %end
+; CHECK-NEXT:    s_or_b64 exec, exec, s[0:1]
+; CHECK-NEXT:    s_setpc_b64 s[30:31]
+entry:
+  br label %loop
+
+loop:
+  %i = phi i64 [ 1, %loop ], [ 0, %entry ]
+  %count = phi i32 [ %inc, %loop ], [ 0, %entry ]
+  %inc = add i32 %count, 1
+  %cond = icmp eq i32 %count, %arg
+  br i1 %cond, label %end, label %loop
+
+end:
+  ret i64 %i
+}

diff  --git a/llvm/test/CodeGen/AMDGPU/masked-load-vectortypes.ll b/llvm/test/CodeGen/AMDGPU/masked-load-vectortypes.ll
index 053cf0e1c6906..789eb8e480214 100644
--- a/llvm/test/CodeGen/AMDGPU/masked-load-vectortypes.ll
+++ b/llvm/test/CodeGen/AMDGPU/masked-load-vectortypes.ll
@@ -217,6 +217,7 @@ define <16 x i8> @uniform_masked_load_ptr1_mask_v16i8(ptr addrspace(1) inreg noc
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX942-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
+; GFX942-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX942-NEXT:    v_mov_b32_e32 v16, 0
 ; GFX942-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX942-NEXT:    v_mov_b32_e32 v2, 0
@@ -236,7 +237,7 @@ define <16 x i8> @uniform_masked_load_ptr1_mask_v16i8(ptr addrspace(1) inreg noc
 ; GFX942-NEXT:    s_and_saveexec_b64 s[2:3], vcc
 ; GFX942-NEXT:    s_cbranch_execz .LBB8_2
 ; GFX942-NEXT:  ; %bb.1: ; %cond.load
-; GFX942-NEXT:    global_load_dwordx4 v[16:19], v16, s[0:1]
+; GFX942-NEXT:    global_load_dwordx4 v[16:19], v0, s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    v_lshrrev_b32_e32 v15, 24, v19
 ; GFX942-NEXT:    v_lshrrev_b32_e32 v14, 16, v19

diff  --git a/llvm/test/CodeGen/AMDGPU/mfma-loop.ll b/llvm/test/CodeGen/AMDGPU/mfma-loop.ll
index 85bf05f39c684..8b6bb9b8c5fcd 100644
--- a/llvm/test/CodeGen/AMDGPU/mfma-loop.ll
+++ b/llvm/test/CodeGen/AMDGPU/mfma-loop.ll
@@ -101,54 +101,117 @@ define amdgpu_kernel void @test_mfma_loop_zeroinit(ptr addrspace(1) %arg) #0 {
 ;
 ; GFX90A-LABEL: test_mfma_loop_zeroinit:
 ; GFX90A:       ; %bb.0: ; %entry
-; GFX90A-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX90A-NEXT:    s_mov_b32 s0, 16
-; GFX90A-NEXT:    v_mov_b32_e32 v0, 2.0
-; GFX90A-NEXT:    v_mov_b32_e32 v1, 1.0
-; GFX90A-NEXT:    v_accvgpr_write_b32 a0, v2
-; GFX90A-NEXT:    v_accvgpr_write_b32 a1, v2
-; GFX90A-NEXT:    v_accvgpr_write_b32 a2, v2
-; GFX90A-NEXT:    v_accvgpr_write_b32 a3, v2
-; GFX90A-NEXT:    v_accvgpr_write_b32 a4, v2
-; GFX90A-NEXT:    v_accvgpr_write_b32 a5, v2
-; GFX90A-NEXT:    v_accvgpr_write_b32 a6, v2
-; GFX90A-NEXT:    v_accvgpr_write_b32 a7, v2
-; GFX90A-NEXT:    v_accvgpr_write_b32 a8, v2
-; GFX90A-NEXT:    v_accvgpr_write_b32 a9, v2
-; GFX90A-NEXT:    v_accvgpr_write_b32 a10, v2
-; GFX90A-NEXT:    v_accvgpr_write_b32 a11, v2
-; GFX90A-NEXT:    v_accvgpr_write_b32 a12, v2
-; GFX90A-NEXT:    v_accvgpr_write_b32 a13, v2
-; GFX90A-NEXT:    v_accvgpr_write_b32 a14, v2
-; GFX90A-NEXT:    v_accvgpr_write_b32 a15, v2
-; GFX90A-NEXT:    v_accvgpr_write_b32 a16, v2
-; GFX90A-NEXT:    v_accvgpr_write_b32 a17, v2
-; GFX90A-NEXT:    v_accvgpr_write_b32 a18, v2
-; GFX90A-NEXT:    v_accvgpr_write_b32 a19, v2
-; GFX90A-NEXT:    v_accvgpr_write_b32 a20, v2
-; GFX90A-NEXT:    v_accvgpr_write_b32 a21, v2
-; GFX90A-NEXT:    v_accvgpr_write_b32 a22, v2
-; GFX90A-NEXT:    v_accvgpr_write_b32 a23, v2
-; GFX90A-NEXT:    v_accvgpr_write_b32 a24, v2
-; GFX90A-NEXT:    v_accvgpr_write_b32 a25, v2
-; GFX90A-NEXT:    v_accvgpr_write_b32 a26, v2
-; GFX90A-NEXT:    v_accvgpr_write_b32 a27, v2
-; GFX90A-NEXT:    v_accvgpr_write_b32 a28, v2
-; GFX90A-NEXT:    v_accvgpr_write_b32 a29, v2
-; GFX90A-NEXT:    v_accvgpr_write_b32 a30, v2
-; GFX90A-NEXT:    v_accvgpr_write_b32 a31, v2
+; GFX90A-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NEXT:    v_mov_b32_e32 v1, 0
+; GFX90A-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NEXT:    v_mov_b32_e32 v3, 0
+; GFX90A-NEXT:    v_mov_b32_e32 v4, 0
+; GFX90A-NEXT:    v_mov_b32_e32 v5, 0
+; GFX90A-NEXT:    v_mov_b32_e32 v6, 0
+; GFX90A-NEXT:    v_mov_b32_e32 v7, 0
+; GFX90A-NEXT:    v_mov_b32_e32 v8, 0
+; GFX90A-NEXT:    v_mov_b32_e32 v9, 0
+; GFX90A-NEXT:    v_mov_b32_e32 v10, 0
+; GFX90A-NEXT:    v_mov_b32_e32 v11, 0
+; GFX90A-NEXT:    v_mov_b32_e32 v12, 0
+; GFX90A-NEXT:    v_mov_b32_e32 v13, 0
+; GFX90A-NEXT:    v_mov_b32_e32 v14, 0
+; GFX90A-NEXT:    v_mov_b32_e32 v15, 0
+; GFX90A-NEXT:    v_mov_b32_e32 v16, 0
+; GFX90A-NEXT:    v_mov_b32_e32 v17, 0
+; GFX90A-NEXT:    v_mov_b32_e32 v18, 0
+; GFX90A-NEXT:    v_mov_b32_e32 v19, 0
+; GFX90A-NEXT:    v_mov_b32_e32 v20, 0
+; GFX90A-NEXT:    v_mov_b32_e32 v21, 0
+; GFX90A-NEXT:    v_mov_b32_e32 v22, 0
+; GFX90A-NEXT:    v_mov_b32_e32 v23, 0
+; GFX90A-NEXT:    v_mov_b32_e32 v24, 0
+; GFX90A-NEXT:    v_mov_b32_e32 v25, 0
+; GFX90A-NEXT:    v_mov_b32_e32 v26, 0
+; GFX90A-NEXT:    v_mov_b32_e32 v27, 0
+; GFX90A-NEXT:    v_mov_b32_e32 v28, 0
+; GFX90A-NEXT:    v_mov_b32_e32 v29, 0
+; GFX90A-NEXT:    v_mov_b32_e32 v30, 0
+; GFX90A-NEXT:    v_mov_b32_e32 v31, 0
+; GFX90A-NEXT:    v_mov_b32_e32 v32, 1.0
 ; GFX90A-NEXT:  .LBB0_1: ; %for.cond.preheader
 ; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX90A-NEXT:    s_nop 1
-; GFX90A-NEXT:    v_mfma_f32_32x32x1f32 a[0:31], v1, v0, a[0:31]
+; GFX90A-NEXT:    v_accvgpr_write_b32 a0, v0
+; GFX90A-NEXT:    v_accvgpr_write_b32 a1, v1
+; GFX90A-NEXT:    v_accvgpr_write_b32 a2, v2
+; GFX90A-NEXT:    v_accvgpr_write_b32 a3, v3
+; GFX90A-NEXT:    v_accvgpr_write_b32 a4, v4
+; GFX90A-NEXT:    v_accvgpr_write_b32 a5, v5
+; GFX90A-NEXT:    v_accvgpr_write_b32 a6, v6
+; GFX90A-NEXT:    v_accvgpr_write_b32 a7, v7
+; GFX90A-NEXT:    v_accvgpr_write_b32 a8, v8
+; GFX90A-NEXT:    v_accvgpr_write_b32 a9, v9
+; GFX90A-NEXT:    v_accvgpr_write_b32 a10, v10
+; GFX90A-NEXT:    v_accvgpr_write_b32 a11, v11
+; GFX90A-NEXT:    v_accvgpr_write_b32 a12, v12
+; GFX90A-NEXT:    v_accvgpr_write_b32 a13, v13
+; GFX90A-NEXT:    v_accvgpr_write_b32 a14, v14
+; GFX90A-NEXT:    v_accvgpr_write_b32 a15, v15
+; GFX90A-NEXT:    v_accvgpr_write_b32 a16, v16
+; GFX90A-NEXT:    v_accvgpr_write_b32 a17, v17
+; GFX90A-NEXT:    v_accvgpr_write_b32 a18, v18
+; GFX90A-NEXT:    v_accvgpr_write_b32 a19, v19
+; GFX90A-NEXT:    v_accvgpr_write_b32 a20, v20
+; GFX90A-NEXT:    v_accvgpr_write_b32 a21, v21
+; GFX90A-NEXT:    v_accvgpr_write_b32 a22, v22
+; GFX90A-NEXT:    v_accvgpr_write_b32 a23, v23
+; GFX90A-NEXT:    v_accvgpr_write_b32 a24, v24
+; GFX90A-NEXT:    v_accvgpr_write_b32 a25, v25
+; GFX90A-NEXT:    v_accvgpr_write_b32 a26, v26
+; GFX90A-NEXT:    v_accvgpr_write_b32 a27, v27
+; GFX90A-NEXT:    v_accvgpr_write_b32 a28, v28
+; GFX90A-NEXT:    v_accvgpr_write_b32 a29, v29
+; GFX90A-NEXT:    v_accvgpr_write_b32 a30, v30
+; GFX90A-NEXT:    v_accvgpr_write_b32 a31, v31
+; GFX90A-NEXT:    v_mov_b32_e32 v0, 2.0
 ; GFX90A-NEXT:    s_add_i32 s0, s0, -1
 ; GFX90A-NEXT:    s_cmp_lg_u32 s0, 0
+; GFX90A-NEXT:    v_mfma_f32_32x32x1f32 a[0:31], v32, v0, a[0:31]
+; GFX90A-NEXT:    s_nop 15
+; GFX90A-NEXT:    s_nop 2
+; GFX90A-NEXT:    v_accvgpr_read_b32 v0, a0
+; GFX90A-NEXT:    v_accvgpr_read_b32 v1, a1
+; GFX90A-NEXT:    v_accvgpr_read_b32 v2, a2
+; GFX90A-NEXT:    v_accvgpr_read_b32 v3, a3
+; GFX90A-NEXT:    v_accvgpr_read_b32 v4, a4
+; GFX90A-NEXT:    v_accvgpr_read_b32 v5, a5
+; GFX90A-NEXT:    v_accvgpr_read_b32 v6, a6
+; GFX90A-NEXT:    v_accvgpr_read_b32 v7, a7
+; GFX90A-NEXT:    v_accvgpr_read_b32 v8, a8
+; GFX90A-NEXT:    v_accvgpr_read_b32 v9, a9
+; GFX90A-NEXT:    v_accvgpr_read_b32 v10, a10
+; GFX90A-NEXT:    v_accvgpr_read_b32 v11, a11
+; GFX90A-NEXT:    v_accvgpr_read_b32 v12, a12
+; GFX90A-NEXT:    v_accvgpr_read_b32 v13, a13
+; GFX90A-NEXT:    v_accvgpr_read_b32 v14, a14
+; GFX90A-NEXT:    v_accvgpr_read_b32 v15, a15
+; GFX90A-NEXT:    v_accvgpr_read_b32 v16, a16
+; GFX90A-NEXT:    v_accvgpr_read_b32 v17, a17
+; GFX90A-NEXT:    v_accvgpr_read_b32 v18, a18
+; GFX90A-NEXT:    v_accvgpr_read_b32 v19, a19
+; GFX90A-NEXT:    v_accvgpr_read_b32 v20, a20
+; GFX90A-NEXT:    v_accvgpr_read_b32 v21, a21
+; GFX90A-NEXT:    v_accvgpr_read_b32 v22, a22
+; GFX90A-NEXT:    v_accvgpr_read_b32 v23, a23
+; GFX90A-NEXT:    v_accvgpr_read_b32 v24, a24
+; GFX90A-NEXT:    v_accvgpr_read_b32 v25, a25
+; GFX90A-NEXT:    v_accvgpr_read_b32 v26, a26
+; GFX90A-NEXT:    v_accvgpr_read_b32 v27, a27
+; GFX90A-NEXT:    v_accvgpr_read_b32 v28, a28
+; GFX90A-NEXT:    v_accvgpr_read_b32 v29, a29
+; GFX90A-NEXT:    v_accvgpr_read_b32 v30, a30
+; GFX90A-NEXT:    v_accvgpr_read_b32 v31, a31
 ; GFX90A-NEXT:    s_cbranch_scc1 .LBB0_1
 ; GFX90A-NEXT:  ; %bb.2: ; %exit
 ; GFX90A-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
 ; GFX90A-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT:    s_nop 12
 ; GFX90A-NEXT:    global_store_dwordx4 v0, a[28:31], s[0:1] offset:112
 ; GFX90A-NEXT:    global_store_dwordx4 v0, a[24:27], s[0:1] offset:96
 ; GFX90A-NEXT:    global_store_dwordx4 v0, a[20:23], s[0:1] offset:80
@@ -161,54 +224,117 @@ define amdgpu_kernel void @test_mfma_loop_zeroinit(ptr addrspace(1) %arg) #0 {
 ;
 ; GFX942-LABEL: test_mfma_loop_zeroinit:
 ; GFX942:       ; %bb.0: ; %entry
-; GFX942-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX942-NEXT:    s_mov_b32 s0, 16
-; GFX942-NEXT:    v_mov_b32_e32 v0, 2.0
-; GFX942-NEXT:    v_mov_b32_e32 v1, 1.0
-; GFX942-NEXT:    v_accvgpr_write_b32 a0, v2
-; GFX942-NEXT:    v_accvgpr_write_b32 a1, v2
-; GFX942-NEXT:    v_accvgpr_write_b32 a2, v2
-; GFX942-NEXT:    v_accvgpr_write_b32 a3, v2
-; GFX942-NEXT:    v_accvgpr_write_b32 a4, v2
-; GFX942-NEXT:    v_accvgpr_write_b32 a5, v2
-; GFX942-NEXT:    v_accvgpr_write_b32 a6, v2
-; GFX942-NEXT:    v_accvgpr_write_b32 a7, v2
-; GFX942-NEXT:    v_accvgpr_write_b32 a8, v2
-; GFX942-NEXT:    v_accvgpr_write_b32 a9, v2
-; GFX942-NEXT:    v_accvgpr_write_b32 a10, v2
-; GFX942-NEXT:    v_accvgpr_write_b32 a11, v2
-; GFX942-NEXT:    v_accvgpr_write_b32 a12, v2
-; GFX942-NEXT:    v_accvgpr_write_b32 a13, v2
-; GFX942-NEXT:    v_accvgpr_write_b32 a14, v2
-; GFX942-NEXT:    v_accvgpr_write_b32 a15, v2
-; GFX942-NEXT:    v_accvgpr_write_b32 a16, v2
-; GFX942-NEXT:    v_accvgpr_write_b32 a17, v2
-; GFX942-NEXT:    v_accvgpr_write_b32 a18, v2
-; GFX942-NEXT:    v_accvgpr_write_b32 a19, v2
-; GFX942-NEXT:    v_accvgpr_write_b32 a20, v2
-; GFX942-NEXT:    v_accvgpr_write_b32 a21, v2
-; GFX942-NEXT:    v_accvgpr_write_b32 a22, v2
-; GFX942-NEXT:    v_accvgpr_write_b32 a23, v2
-; GFX942-NEXT:    v_accvgpr_write_b32 a24, v2
-; GFX942-NEXT:    v_accvgpr_write_b32 a25, v2
-; GFX942-NEXT:    v_accvgpr_write_b32 a26, v2
-; GFX942-NEXT:    v_accvgpr_write_b32 a27, v2
-; GFX942-NEXT:    v_accvgpr_write_b32 a28, v2
-; GFX942-NEXT:    v_accvgpr_write_b32 a29, v2
-; GFX942-NEXT:    v_accvgpr_write_b32 a30, v2
-; GFX942-NEXT:    v_accvgpr_write_b32 a31, v2
+; GFX942-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-NEXT:    v_mov_b32_e32 v1, 0
+; GFX942-NEXT:    v_mov_b32_e32 v2, 0
+; GFX942-NEXT:    v_mov_b32_e32 v3, 0
+; GFX942-NEXT:    v_mov_b32_e32 v4, 0
+; GFX942-NEXT:    v_mov_b32_e32 v5, 0
+; GFX942-NEXT:    v_mov_b32_e32 v6, 0
+; GFX942-NEXT:    v_mov_b32_e32 v7, 0
+; GFX942-NEXT:    v_mov_b32_e32 v8, 0
+; GFX942-NEXT:    v_mov_b32_e32 v9, 0
+; GFX942-NEXT:    v_mov_b32_e32 v10, 0
+; GFX942-NEXT:    v_mov_b32_e32 v11, 0
+; GFX942-NEXT:    v_mov_b32_e32 v12, 0
+; GFX942-NEXT:    v_mov_b32_e32 v13, 0
+; GFX942-NEXT:    v_mov_b32_e32 v14, 0
+; GFX942-NEXT:    v_mov_b32_e32 v15, 0
+; GFX942-NEXT:    v_mov_b32_e32 v16, 0
+; GFX942-NEXT:    v_mov_b32_e32 v17, 0
+; GFX942-NEXT:    v_mov_b32_e32 v18, 0
+; GFX942-NEXT:    v_mov_b32_e32 v19, 0
+; GFX942-NEXT:    v_mov_b32_e32 v20, 0
+; GFX942-NEXT:    v_mov_b32_e32 v21, 0
+; GFX942-NEXT:    v_mov_b32_e32 v22, 0
+; GFX942-NEXT:    v_mov_b32_e32 v23, 0
+; GFX942-NEXT:    v_mov_b32_e32 v24, 0
+; GFX942-NEXT:    v_mov_b32_e32 v25, 0
+; GFX942-NEXT:    v_mov_b32_e32 v26, 0
+; GFX942-NEXT:    v_mov_b32_e32 v27, 0
+; GFX942-NEXT:    v_mov_b32_e32 v28, 0
+; GFX942-NEXT:    v_mov_b32_e32 v29, 0
+; GFX942-NEXT:    v_mov_b32_e32 v30, 0
+; GFX942-NEXT:    v_mov_b32_e32 v31, 0
+; GFX942-NEXT:    v_mov_b32_e32 v32, 1.0
 ; GFX942-NEXT:  .LBB0_1: ; %for.cond.preheader
 ; GFX942-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX942-NEXT:    s_nop 1
-; GFX942-NEXT:    v_mfma_f32_32x32x1_2b_f32 a[0:31], v1, v0, a[0:31]
+; GFX942-NEXT:    v_accvgpr_write_b32 a0, v0
+; GFX942-NEXT:    v_accvgpr_write_b32 a1, v1
+; GFX942-NEXT:    v_accvgpr_write_b32 a2, v2
+; GFX942-NEXT:    v_accvgpr_write_b32 a3, v3
+; GFX942-NEXT:    v_accvgpr_write_b32 a4, v4
+; GFX942-NEXT:    v_accvgpr_write_b32 a5, v5
+; GFX942-NEXT:    v_accvgpr_write_b32 a6, v6
+; GFX942-NEXT:    v_accvgpr_write_b32 a7, v7
+; GFX942-NEXT:    v_accvgpr_write_b32 a8, v8
+; GFX942-NEXT:    v_accvgpr_write_b32 a9, v9
+; GFX942-NEXT:    v_accvgpr_write_b32 a10, v10
+; GFX942-NEXT:    v_accvgpr_write_b32 a11, v11
+; GFX942-NEXT:    v_accvgpr_write_b32 a12, v12
+; GFX942-NEXT:    v_accvgpr_write_b32 a13, v13
+; GFX942-NEXT:    v_accvgpr_write_b32 a14, v14
+; GFX942-NEXT:    v_accvgpr_write_b32 a15, v15
+; GFX942-NEXT:    v_accvgpr_write_b32 a16, v16
+; GFX942-NEXT:    v_accvgpr_write_b32 a17, v17
+; GFX942-NEXT:    v_accvgpr_write_b32 a18, v18
+; GFX942-NEXT:    v_accvgpr_write_b32 a19, v19
+; GFX942-NEXT:    v_accvgpr_write_b32 a20, v20
+; GFX942-NEXT:    v_accvgpr_write_b32 a21, v21
+; GFX942-NEXT:    v_accvgpr_write_b32 a22, v22
+; GFX942-NEXT:    v_accvgpr_write_b32 a23, v23
+; GFX942-NEXT:    v_accvgpr_write_b32 a24, v24
+; GFX942-NEXT:    v_accvgpr_write_b32 a25, v25
+; GFX942-NEXT:    v_accvgpr_write_b32 a26, v26
+; GFX942-NEXT:    v_accvgpr_write_b32 a27, v27
+; GFX942-NEXT:    v_accvgpr_write_b32 a28, v28
+; GFX942-NEXT:    v_accvgpr_write_b32 a29, v29
+; GFX942-NEXT:    v_accvgpr_write_b32 a30, v30
+; GFX942-NEXT:    v_accvgpr_write_b32 a31, v31
+; GFX942-NEXT:    v_mov_b32_e32 v0, 2.0
 ; GFX942-NEXT:    s_add_i32 s0, s0, -1
 ; GFX942-NEXT:    s_cmp_lg_u32 s0, 0
+; GFX942-NEXT:    v_mfma_f32_32x32x1_2b_f32 a[0:31], v32, v0, a[0:31]
+; GFX942-NEXT:    s_nop 15
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_accvgpr_read_b32 v0, a0
+; GFX942-NEXT:    v_accvgpr_read_b32 v1, a1
+; GFX942-NEXT:    v_accvgpr_read_b32 v2, a2
+; GFX942-NEXT:    v_accvgpr_read_b32 v3, a3
+; GFX942-NEXT:    v_accvgpr_read_b32 v4, a4
+; GFX942-NEXT:    v_accvgpr_read_b32 v5, a5
+; GFX942-NEXT:    v_accvgpr_read_b32 v6, a6
+; GFX942-NEXT:    v_accvgpr_read_b32 v7, a7
+; GFX942-NEXT:    v_accvgpr_read_b32 v8, a8
+; GFX942-NEXT:    v_accvgpr_read_b32 v9, a9
+; GFX942-NEXT:    v_accvgpr_read_b32 v10, a10
+; GFX942-NEXT:    v_accvgpr_read_b32 v11, a11
+; GFX942-NEXT:    v_accvgpr_read_b32 v12, a12
+; GFX942-NEXT:    v_accvgpr_read_b32 v13, a13
+; GFX942-NEXT:    v_accvgpr_read_b32 v14, a14
+; GFX942-NEXT:    v_accvgpr_read_b32 v15, a15
+; GFX942-NEXT:    v_accvgpr_read_b32 v16, a16
+; GFX942-NEXT:    v_accvgpr_read_b32 v17, a17
+; GFX942-NEXT:    v_accvgpr_read_b32 v18, a18
+; GFX942-NEXT:    v_accvgpr_read_b32 v19, a19
+; GFX942-NEXT:    v_accvgpr_read_b32 v20, a20
+; GFX942-NEXT:    v_accvgpr_read_b32 v21, a21
+; GFX942-NEXT:    v_accvgpr_read_b32 v22, a22
+; GFX942-NEXT:    v_accvgpr_read_b32 v23, a23
+; GFX942-NEXT:    v_accvgpr_read_b32 v24, a24
+; GFX942-NEXT:    v_accvgpr_read_b32 v25, a25
+; GFX942-NEXT:    v_accvgpr_read_b32 v26, a26
+; GFX942-NEXT:    v_accvgpr_read_b32 v27, a27
+; GFX942-NEXT:    v_accvgpr_read_b32 v28, a28
+; GFX942-NEXT:    v_accvgpr_read_b32 v29, a29
+; GFX942-NEXT:    v_accvgpr_read_b32 v30, a30
+; GFX942-NEXT:    v_accvgpr_read_b32 v31, a31
 ; GFX942-NEXT:    s_cbranch_scc1 .LBB0_1
 ; GFX942-NEXT:  ; %bb.2: ; %exit
 ; GFX942-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
 ; GFX942-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX942-NEXT:    s_nop 11
 ; GFX942-NEXT:    global_store_dwordx4 v0, a[28:31], s[0:1] offset:112
 ; GFX942-NEXT:    global_store_dwordx4 v0, a[24:27], s[0:1] offset:96
 ; GFX942-NEXT:    global_store_dwordx4 v0, a[20:23], s[0:1] offset:80
@@ -334,54 +460,117 @@ define amdgpu_kernel void @test_mfma_loop_unfoldable_splat(ptr addrspace(1) %arg
 ;
 ; GFX90A-LABEL: test_mfma_loop_unfoldable_splat:
 ; GFX90A:       ; %bb.0: ; %entry
-; GFX90A-NEXT:    v_mov_b32_e32 v2, 0x42f60000
+; GFX90A-NEXT:    v_mov_b32_e32 v31, 0x42f60000
 ; GFX90A-NEXT:    s_mov_b32 s0, 16
-; GFX90A-NEXT:    v_mov_b32_e32 v0, 2.0
-; GFX90A-NEXT:    v_mov_b32_e32 v1, 1.0
-; GFX90A-NEXT:    v_accvgpr_write_b32 a0, v2
-; GFX90A-NEXT:    v_accvgpr_write_b32 a1, v2
-; GFX90A-NEXT:    v_accvgpr_write_b32 a2, v2
-; GFX90A-NEXT:    v_accvgpr_write_b32 a3, v2
-; GFX90A-NEXT:    v_accvgpr_write_b32 a4, v2
-; GFX90A-NEXT:    v_accvgpr_write_b32 a5, v2
-; GFX90A-NEXT:    v_accvgpr_write_b32 a6, v2
-; GFX90A-NEXT:    v_accvgpr_write_b32 a7, v2
-; GFX90A-NEXT:    v_accvgpr_write_b32 a8, v2
-; GFX90A-NEXT:    v_accvgpr_write_b32 a9, v2
-; GFX90A-NEXT:    v_accvgpr_write_b32 a10, v2
-; GFX90A-NEXT:    v_accvgpr_write_b32 a11, v2
-; GFX90A-NEXT:    v_accvgpr_write_b32 a12, v2
-; GFX90A-NEXT:    v_accvgpr_write_b32 a13, v2
-; GFX90A-NEXT:    v_accvgpr_write_b32 a14, v2
-; GFX90A-NEXT:    v_accvgpr_write_b32 a15, v2
-; GFX90A-NEXT:    v_accvgpr_write_b32 a16, v2
-; GFX90A-NEXT:    v_accvgpr_write_b32 a17, v2
-; GFX90A-NEXT:    v_accvgpr_write_b32 a18, v2
-; GFX90A-NEXT:    v_accvgpr_write_b32 a19, v2
-; GFX90A-NEXT:    v_accvgpr_write_b32 a20, v2
-; GFX90A-NEXT:    v_accvgpr_write_b32 a21, v2
-; GFX90A-NEXT:    v_accvgpr_write_b32 a22, v2
-; GFX90A-NEXT:    v_accvgpr_write_b32 a23, v2
-; GFX90A-NEXT:    v_accvgpr_write_b32 a24, v2
-; GFX90A-NEXT:    v_accvgpr_write_b32 a25, v2
-; GFX90A-NEXT:    v_accvgpr_write_b32 a26, v2
-; GFX90A-NEXT:    v_accvgpr_write_b32 a27, v2
-; GFX90A-NEXT:    v_accvgpr_write_b32 a28, v2
-; GFX90A-NEXT:    v_accvgpr_write_b32 a29, v2
-; GFX90A-NEXT:    v_accvgpr_write_b32 a30, v2
-; GFX90A-NEXT:    v_accvgpr_write_b32 a31, v2
+; GFX90A-NEXT:    v_mov_b32_e32 v0, v31
+; GFX90A-NEXT:    v_mov_b32_e32 v1, v31
+; GFX90A-NEXT:    v_mov_b32_e32 v2, v31
+; GFX90A-NEXT:    v_mov_b32_e32 v3, v31
+; GFX90A-NEXT:    v_mov_b32_e32 v4, v31
+; GFX90A-NEXT:    v_mov_b32_e32 v5, v31
+; GFX90A-NEXT:    v_mov_b32_e32 v6, v31
+; GFX90A-NEXT:    v_mov_b32_e32 v7, v31
+; GFX90A-NEXT:    v_mov_b32_e32 v8, v31
+; GFX90A-NEXT:    v_mov_b32_e32 v9, v31
+; GFX90A-NEXT:    v_mov_b32_e32 v10, v31
+; GFX90A-NEXT:    v_mov_b32_e32 v11, v31
+; GFX90A-NEXT:    v_mov_b32_e32 v12, v31
+; GFX90A-NEXT:    v_mov_b32_e32 v13, v31
+; GFX90A-NEXT:    v_mov_b32_e32 v14, v31
+; GFX90A-NEXT:    v_mov_b32_e32 v15, v31
+; GFX90A-NEXT:    v_mov_b32_e32 v16, v31
+; GFX90A-NEXT:    v_mov_b32_e32 v17, v31
+; GFX90A-NEXT:    v_mov_b32_e32 v18, v31
+; GFX90A-NEXT:    v_mov_b32_e32 v19, v31
+; GFX90A-NEXT:    v_mov_b32_e32 v20, v31
+; GFX90A-NEXT:    v_mov_b32_e32 v21, v31
+; GFX90A-NEXT:    v_mov_b32_e32 v22, v31
+; GFX90A-NEXT:    v_mov_b32_e32 v23, v31
+; GFX90A-NEXT:    v_mov_b32_e32 v24, v31
+; GFX90A-NEXT:    v_mov_b32_e32 v25, v31
+; GFX90A-NEXT:    v_mov_b32_e32 v26, v31
+; GFX90A-NEXT:    v_mov_b32_e32 v27, v31
+; GFX90A-NEXT:    v_mov_b32_e32 v28, v31
+; GFX90A-NEXT:    v_mov_b32_e32 v29, v31
+; GFX90A-NEXT:    v_mov_b32_e32 v30, v31
+; GFX90A-NEXT:    v_mov_b32_e32 v32, 1.0
 ; GFX90A-NEXT:  .LBB1_1: ; %for.cond.preheader
 ; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX90A-NEXT:    s_nop 1
-; GFX90A-NEXT:    v_mfma_f32_32x32x1f32 a[0:31], v1, v0, a[0:31]
+; GFX90A-NEXT:    v_accvgpr_write_b32 a0, v0
+; GFX90A-NEXT:    v_accvgpr_write_b32 a1, v1
+; GFX90A-NEXT:    v_accvgpr_write_b32 a2, v2
+; GFX90A-NEXT:    v_accvgpr_write_b32 a3, v3
+; GFX90A-NEXT:    v_accvgpr_write_b32 a4, v4
+; GFX90A-NEXT:    v_accvgpr_write_b32 a5, v5
+; GFX90A-NEXT:    v_accvgpr_write_b32 a6, v6
+; GFX90A-NEXT:    v_accvgpr_write_b32 a7, v7
+; GFX90A-NEXT:    v_accvgpr_write_b32 a8, v8
+; GFX90A-NEXT:    v_accvgpr_write_b32 a9, v9
+; GFX90A-NEXT:    v_accvgpr_write_b32 a10, v10
+; GFX90A-NEXT:    v_accvgpr_write_b32 a11, v11
+; GFX90A-NEXT:    v_accvgpr_write_b32 a12, v12
+; GFX90A-NEXT:    v_accvgpr_write_b32 a13, v13
+; GFX90A-NEXT:    v_accvgpr_write_b32 a14, v14
+; GFX90A-NEXT:    v_accvgpr_write_b32 a15, v15
+; GFX90A-NEXT:    v_accvgpr_write_b32 a16, v16
+; GFX90A-NEXT:    v_accvgpr_write_b32 a17, v17
+; GFX90A-NEXT:    v_accvgpr_write_b32 a18, v18
+; GFX90A-NEXT:    v_accvgpr_write_b32 a19, v19
+; GFX90A-NEXT:    v_accvgpr_write_b32 a20, v20
+; GFX90A-NEXT:    v_accvgpr_write_b32 a21, v21
+; GFX90A-NEXT:    v_accvgpr_write_b32 a22, v22
+; GFX90A-NEXT:    v_accvgpr_write_b32 a23, v23
+; GFX90A-NEXT:    v_accvgpr_write_b32 a24, v24
+; GFX90A-NEXT:    v_accvgpr_write_b32 a25, v25
+; GFX90A-NEXT:    v_accvgpr_write_b32 a26, v26
+; GFX90A-NEXT:    v_accvgpr_write_b32 a27, v27
+; GFX90A-NEXT:    v_accvgpr_write_b32 a28, v28
+; GFX90A-NEXT:    v_accvgpr_write_b32 a29, v29
+; GFX90A-NEXT:    v_accvgpr_write_b32 a30, v30
+; GFX90A-NEXT:    v_accvgpr_write_b32 a31, v31
+; GFX90A-NEXT:    v_mov_b32_e32 v0, 2.0
 ; GFX90A-NEXT:    s_add_i32 s0, s0, -1
 ; GFX90A-NEXT:    s_cmp_lg_u32 s0, 0
+; GFX90A-NEXT:    v_mfma_f32_32x32x1f32 a[0:31], v32, v0, a[0:31]
+; GFX90A-NEXT:    s_nop 15
+; GFX90A-NEXT:    s_nop 2
+; GFX90A-NEXT:    v_accvgpr_read_b32 v0, a0
+; GFX90A-NEXT:    v_accvgpr_read_b32 v1, a1
+; GFX90A-NEXT:    v_accvgpr_read_b32 v2, a2
+; GFX90A-NEXT:    v_accvgpr_read_b32 v3, a3
+; GFX90A-NEXT:    v_accvgpr_read_b32 v4, a4
+; GFX90A-NEXT:    v_accvgpr_read_b32 v5, a5
+; GFX90A-NEXT:    v_accvgpr_read_b32 v6, a6
+; GFX90A-NEXT:    v_accvgpr_read_b32 v7, a7
+; GFX90A-NEXT:    v_accvgpr_read_b32 v8, a8
+; GFX90A-NEXT:    v_accvgpr_read_b32 v9, a9
+; GFX90A-NEXT:    v_accvgpr_read_b32 v10, a10
+; GFX90A-NEXT:    v_accvgpr_read_b32 v11, a11
+; GFX90A-NEXT:    v_accvgpr_read_b32 v12, a12
+; GFX90A-NEXT:    v_accvgpr_read_b32 v13, a13
+; GFX90A-NEXT:    v_accvgpr_read_b32 v14, a14
+; GFX90A-NEXT:    v_accvgpr_read_b32 v15, a15
+; GFX90A-NEXT:    v_accvgpr_read_b32 v16, a16
+; GFX90A-NEXT:    v_accvgpr_read_b32 v17, a17
+; GFX90A-NEXT:    v_accvgpr_read_b32 v18, a18
+; GFX90A-NEXT:    v_accvgpr_read_b32 v19, a19
+; GFX90A-NEXT:    v_accvgpr_read_b32 v20, a20
+; GFX90A-NEXT:    v_accvgpr_read_b32 v21, a21
+; GFX90A-NEXT:    v_accvgpr_read_b32 v22, a22
+; GFX90A-NEXT:    v_accvgpr_read_b32 v23, a23
+; GFX90A-NEXT:    v_accvgpr_read_b32 v24, a24
+; GFX90A-NEXT:    v_accvgpr_read_b32 v25, a25
+; GFX90A-NEXT:    v_accvgpr_read_b32 v26, a26
+; GFX90A-NEXT:    v_accvgpr_read_b32 v27, a27
+; GFX90A-NEXT:    v_accvgpr_read_b32 v28, a28
+; GFX90A-NEXT:    v_accvgpr_read_b32 v29, a29
+; GFX90A-NEXT:    v_accvgpr_read_b32 v30, a30
+; GFX90A-NEXT:    v_accvgpr_read_b32 v31, a31
 ; GFX90A-NEXT:    s_cbranch_scc1 .LBB1_1
 ; GFX90A-NEXT:  ; %bb.2: ; %exit
 ; GFX90A-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
 ; GFX90A-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT:    s_nop 12
 ; GFX90A-NEXT:    global_store_dwordx4 v0, a[28:31], s[0:1] offset:112
 ; GFX90A-NEXT:    global_store_dwordx4 v0, a[24:27], s[0:1] offset:96
 ; GFX90A-NEXT:    global_store_dwordx4 v0, a[20:23], s[0:1] offset:80
@@ -394,54 +583,117 @@ define amdgpu_kernel void @test_mfma_loop_unfoldable_splat(ptr addrspace(1) %arg
 ;
 ; GFX942-LABEL: test_mfma_loop_unfoldable_splat:
 ; GFX942:       ; %bb.0: ; %entry
-; GFX942-NEXT:    v_mov_b32_e32 v2, 0x42f60000
+; GFX942-NEXT:    v_mov_b32_e32 v31, 0x42f60000
 ; GFX942-NEXT:    s_mov_b32 s0, 16
-; GFX942-NEXT:    v_mov_b32_e32 v0, 2.0
-; GFX942-NEXT:    v_mov_b32_e32 v1, 1.0
-; GFX942-NEXT:    v_accvgpr_write_b32 a0, v2
-; GFX942-NEXT:    v_accvgpr_write_b32 a1, v2
-; GFX942-NEXT:    v_accvgpr_write_b32 a2, v2
-; GFX942-NEXT:    v_accvgpr_write_b32 a3, v2
-; GFX942-NEXT:    v_accvgpr_write_b32 a4, v2
-; GFX942-NEXT:    v_accvgpr_write_b32 a5, v2
-; GFX942-NEXT:    v_accvgpr_write_b32 a6, v2
-; GFX942-NEXT:    v_accvgpr_write_b32 a7, v2
-; GFX942-NEXT:    v_accvgpr_write_b32 a8, v2
-; GFX942-NEXT:    v_accvgpr_write_b32 a9, v2
-; GFX942-NEXT:    v_accvgpr_write_b32 a10, v2
-; GFX942-NEXT:    v_accvgpr_write_b32 a11, v2
-; GFX942-NEXT:    v_accvgpr_write_b32 a12, v2
-; GFX942-NEXT:    v_accvgpr_write_b32 a13, v2
-; GFX942-NEXT:    v_accvgpr_write_b32 a14, v2
-; GFX942-NEXT:    v_accvgpr_write_b32 a15, v2
-; GFX942-NEXT:    v_accvgpr_write_b32 a16, v2
-; GFX942-NEXT:    v_accvgpr_write_b32 a17, v2
-; GFX942-NEXT:    v_accvgpr_write_b32 a18, v2
-; GFX942-NEXT:    v_accvgpr_write_b32 a19, v2
-; GFX942-NEXT:    v_accvgpr_write_b32 a20, v2
-; GFX942-NEXT:    v_accvgpr_write_b32 a21, v2
-; GFX942-NEXT:    v_accvgpr_write_b32 a22, v2
-; GFX942-NEXT:    v_accvgpr_write_b32 a23, v2
-; GFX942-NEXT:    v_accvgpr_write_b32 a24, v2
-; GFX942-NEXT:    v_accvgpr_write_b32 a25, v2
-; GFX942-NEXT:    v_accvgpr_write_b32 a26, v2
-; GFX942-NEXT:    v_accvgpr_write_b32 a27, v2
-; GFX942-NEXT:    v_accvgpr_write_b32 a28, v2
-; GFX942-NEXT:    v_accvgpr_write_b32 a29, v2
-; GFX942-NEXT:    v_accvgpr_write_b32 a30, v2
-; GFX942-NEXT:    v_accvgpr_write_b32 a31, v2
+; GFX942-NEXT:    v_mov_b32_e32 v0, v31
+; GFX942-NEXT:    v_mov_b32_e32 v1, v31
+; GFX942-NEXT:    v_mov_b32_e32 v2, v31
+; GFX942-NEXT:    v_mov_b32_e32 v3, v31
+; GFX942-NEXT:    v_mov_b32_e32 v4, v31
+; GFX942-NEXT:    v_mov_b32_e32 v5, v31
+; GFX942-NEXT:    v_mov_b32_e32 v6, v31
+; GFX942-NEXT:    v_mov_b32_e32 v7, v31
+; GFX942-NEXT:    v_mov_b32_e32 v8, v31
+; GFX942-NEXT:    v_mov_b32_e32 v9, v31
+; GFX942-NEXT:    v_mov_b32_e32 v10, v31
+; GFX942-NEXT:    v_mov_b32_e32 v11, v31
+; GFX942-NEXT:    v_mov_b32_e32 v12, v31
+; GFX942-NEXT:    v_mov_b32_e32 v13, v31
+; GFX942-NEXT:    v_mov_b32_e32 v14, v31
+; GFX942-NEXT:    v_mov_b32_e32 v15, v31
+; GFX942-NEXT:    v_mov_b32_e32 v16, v31
+; GFX942-NEXT:    v_mov_b32_e32 v17, v31
+; GFX942-NEXT:    v_mov_b32_e32 v18, v31
+; GFX942-NEXT:    v_mov_b32_e32 v19, v31
+; GFX942-NEXT:    v_mov_b32_e32 v20, v31
+; GFX942-NEXT:    v_mov_b32_e32 v21, v31
+; GFX942-NEXT:    v_mov_b32_e32 v22, v31
+; GFX942-NEXT:    v_mov_b32_e32 v23, v31
+; GFX942-NEXT:    v_mov_b32_e32 v24, v31
+; GFX942-NEXT:    v_mov_b32_e32 v25, v31
+; GFX942-NEXT:    v_mov_b32_e32 v26, v31
+; GFX942-NEXT:    v_mov_b32_e32 v27, v31
+; GFX942-NEXT:    v_mov_b32_e32 v28, v31
+; GFX942-NEXT:    v_mov_b32_e32 v29, v31
+; GFX942-NEXT:    v_mov_b32_e32 v30, v31
+; GFX942-NEXT:    v_mov_b32_e32 v32, 1.0
 ; GFX942-NEXT:  .LBB1_1: ; %for.cond.preheader
 ; GFX942-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX942-NEXT:    s_nop 1
-; GFX942-NEXT:    v_mfma_f32_32x32x1_2b_f32 a[0:31], v1, v0, a[0:31]
+; GFX942-NEXT:    v_accvgpr_write_b32 a0, v0
+; GFX942-NEXT:    v_accvgpr_write_b32 a1, v1
+; GFX942-NEXT:    v_accvgpr_write_b32 a2, v2
+; GFX942-NEXT:    v_accvgpr_write_b32 a3, v3
+; GFX942-NEXT:    v_accvgpr_write_b32 a4, v4
+; GFX942-NEXT:    v_accvgpr_write_b32 a5, v5
+; GFX942-NEXT:    v_accvgpr_write_b32 a6, v6
+; GFX942-NEXT:    v_accvgpr_write_b32 a7, v7
+; GFX942-NEXT:    v_accvgpr_write_b32 a8, v8
+; GFX942-NEXT:    v_accvgpr_write_b32 a9, v9
+; GFX942-NEXT:    v_accvgpr_write_b32 a10, v10
+; GFX942-NEXT:    v_accvgpr_write_b32 a11, v11
+; GFX942-NEXT:    v_accvgpr_write_b32 a12, v12
+; GFX942-NEXT:    v_accvgpr_write_b32 a13, v13
+; GFX942-NEXT:    v_accvgpr_write_b32 a14, v14
+; GFX942-NEXT:    v_accvgpr_write_b32 a15, v15
+; GFX942-NEXT:    v_accvgpr_write_b32 a16, v16
+; GFX942-NEXT:    v_accvgpr_write_b32 a17, v17
+; GFX942-NEXT:    v_accvgpr_write_b32 a18, v18
+; GFX942-NEXT:    v_accvgpr_write_b32 a19, v19
+; GFX942-NEXT:    v_accvgpr_write_b32 a20, v20
+; GFX942-NEXT:    v_accvgpr_write_b32 a21, v21
+; GFX942-NEXT:    v_accvgpr_write_b32 a22, v22
+; GFX942-NEXT:    v_accvgpr_write_b32 a23, v23
+; GFX942-NEXT:    v_accvgpr_write_b32 a24, v24
+; GFX942-NEXT:    v_accvgpr_write_b32 a25, v25
+; GFX942-NEXT:    v_accvgpr_write_b32 a26, v26
+; GFX942-NEXT:    v_accvgpr_write_b32 a27, v27
+; GFX942-NEXT:    v_accvgpr_write_b32 a28, v28
+; GFX942-NEXT:    v_accvgpr_write_b32 a29, v29
+; GFX942-NEXT:    v_accvgpr_write_b32 a30, v30
+; GFX942-NEXT:    v_accvgpr_write_b32 a31, v31
+; GFX942-NEXT:    v_mov_b32_e32 v0, 2.0
 ; GFX942-NEXT:    s_add_i32 s0, s0, -1
 ; GFX942-NEXT:    s_cmp_lg_u32 s0, 0
+; GFX942-NEXT:    v_mfma_f32_32x32x1_2b_f32 a[0:31], v32, v0, a[0:31]
+; GFX942-NEXT:    s_nop 15
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_accvgpr_read_b32 v0, a0
+; GFX942-NEXT:    v_accvgpr_read_b32 v1, a1
+; GFX942-NEXT:    v_accvgpr_read_b32 v2, a2
+; GFX942-NEXT:    v_accvgpr_read_b32 v3, a3
+; GFX942-NEXT:    v_accvgpr_read_b32 v4, a4
+; GFX942-NEXT:    v_accvgpr_read_b32 v5, a5
+; GFX942-NEXT:    v_accvgpr_read_b32 v6, a6
+; GFX942-NEXT:    v_accvgpr_read_b32 v7, a7
+; GFX942-NEXT:    v_accvgpr_read_b32 v8, a8
+; GFX942-NEXT:    v_accvgpr_read_b32 v9, a9
+; GFX942-NEXT:    v_accvgpr_read_b32 v10, a10
+; GFX942-NEXT:    v_accvgpr_read_b32 v11, a11
+; GFX942-NEXT:    v_accvgpr_read_b32 v12, a12
+; GFX942-NEXT:    v_accvgpr_read_b32 v13, a13
+; GFX942-NEXT:    v_accvgpr_read_b32 v14, a14
+; GFX942-NEXT:    v_accvgpr_read_b32 v15, a15
+; GFX942-NEXT:    v_accvgpr_read_b32 v16, a16
+; GFX942-NEXT:    v_accvgpr_read_b32 v17, a17
+; GFX942-NEXT:    v_accvgpr_read_b32 v18, a18
+; GFX942-NEXT:    v_accvgpr_read_b32 v19, a19
+; GFX942-NEXT:    v_accvgpr_read_b32 v20, a20
+; GFX942-NEXT:    v_accvgpr_read_b32 v21, a21
+; GFX942-NEXT:    v_accvgpr_read_b32 v22, a22
+; GFX942-NEXT:    v_accvgpr_read_b32 v23, a23
+; GFX942-NEXT:    v_accvgpr_read_b32 v24, a24
+; GFX942-NEXT:    v_accvgpr_read_b32 v25, a25
+; GFX942-NEXT:    v_accvgpr_read_b32 v26, a26
+; GFX942-NEXT:    v_accvgpr_read_b32 v27, a27
+; GFX942-NEXT:    v_accvgpr_read_b32 v28, a28
+; GFX942-NEXT:    v_accvgpr_read_b32 v29, a29
+; GFX942-NEXT:    v_accvgpr_read_b32 v30, a30
+; GFX942-NEXT:    v_accvgpr_read_b32 v31, a31
 ; GFX942-NEXT:    s_cbranch_scc1 .LBB1_1
 ; GFX942-NEXT:  ; %bb.2: ; %exit
 ; GFX942-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
 ; GFX942-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX942-NEXT:    s_nop 11
 ; GFX942-NEXT:    global_store_dwordx4 v0, a[28:31], s[0:1] offset:112
 ; GFX942-NEXT:    global_store_dwordx4 v0, a[24:27], s[0:1] offset:96
 ; GFX942-NEXT:    global_store_dwordx4 v0, a[20:23], s[0:1] offset:80
@@ -561,54 +813,117 @@ define amdgpu_kernel void @test_mfma_loop_non_splat(ptr addrspace(1) %arg) #0 {
 ;
 ; GFX90A-LABEL: test_mfma_loop_non_splat:
 ; GFX90A:       ; %bb.0: ; %entry
-; GFX90A-NEXT:    v_mov_b32_e32 v0, 1.0
-; GFX90A-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX90A-NEXT:    s_mov_b32 s0, 16
-; GFX90A-NEXT:    v_mov_b32_e32 v1, 2.0
-; GFX90A-NEXT:    v_accvgpr_write_b32 a0, v2
-; GFX90A-NEXT:    v_accvgpr_write_b32 a1, v0
-; GFX90A-NEXT:    v_accvgpr_write_b32 a2, v2
-; GFX90A-NEXT:    v_accvgpr_write_b32 a3, v2
-; GFX90A-NEXT:    v_accvgpr_write_b32 a4, v2
-; GFX90A-NEXT:    v_accvgpr_write_b32 a5, v2
-; GFX90A-NEXT:    v_accvgpr_write_b32 a6, v2
-; GFX90A-NEXT:    v_accvgpr_write_b32 a7, v2
-; GFX90A-NEXT:    v_accvgpr_write_b32 a8, v2
-; GFX90A-NEXT:    v_accvgpr_write_b32 a9, v2
-; GFX90A-NEXT:    v_accvgpr_write_b32 a10, v2
-; GFX90A-NEXT:    v_accvgpr_write_b32 a11, v2
-; GFX90A-NEXT:    v_accvgpr_write_b32 a12, v2
-; GFX90A-NEXT:    v_accvgpr_write_b32 a13, v2
-; GFX90A-NEXT:    v_accvgpr_write_b32 a14, v2
-; GFX90A-NEXT:    v_accvgpr_write_b32 a15, v2
-; GFX90A-NEXT:    v_accvgpr_write_b32 a16, v2
-; GFX90A-NEXT:    v_accvgpr_write_b32 a17, v2
-; GFX90A-NEXT:    v_accvgpr_write_b32 a18, v2
-; GFX90A-NEXT:    v_accvgpr_write_b32 a19, v2
-; GFX90A-NEXT:    v_accvgpr_write_b32 a20, v2
-; GFX90A-NEXT:    v_accvgpr_write_b32 a21, v2
-; GFX90A-NEXT:    v_accvgpr_write_b32 a22, v2
-; GFX90A-NEXT:    v_accvgpr_write_b32 a23, v2
-; GFX90A-NEXT:    v_accvgpr_write_b32 a24, v2
-; GFX90A-NEXT:    v_accvgpr_write_b32 a25, v2
-; GFX90A-NEXT:    v_accvgpr_write_b32 a26, v2
-; GFX90A-NEXT:    v_accvgpr_write_b32 a27, v2
-; GFX90A-NEXT:    v_accvgpr_write_b32 a28, v2
-; GFX90A-NEXT:    v_accvgpr_write_b32 a29, v2
-; GFX90A-NEXT:    v_accvgpr_write_b32 a30, v2
-; GFX90A-NEXT:    v_accvgpr_write_b32 a31, v2
+; GFX90A-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NEXT:    v_mov_b32_e32 v1, 1.0
+; GFX90A-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NEXT:    v_mov_b32_e32 v3, 0
+; GFX90A-NEXT:    v_mov_b32_e32 v4, 0
+; GFX90A-NEXT:    v_mov_b32_e32 v5, 0
+; GFX90A-NEXT:    v_mov_b32_e32 v6, 0
+; GFX90A-NEXT:    v_mov_b32_e32 v7, 0
+; GFX90A-NEXT:    v_mov_b32_e32 v8, 0
+; GFX90A-NEXT:    v_mov_b32_e32 v9, 0
+; GFX90A-NEXT:    v_mov_b32_e32 v10, 0
+; GFX90A-NEXT:    v_mov_b32_e32 v11, 0
+; GFX90A-NEXT:    v_mov_b32_e32 v12, 0
+; GFX90A-NEXT:    v_mov_b32_e32 v13, 0
+; GFX90A-NEXT:    v_mov_b32_e32 v14, 0
+; GFX90A-NEXT:    v_mov_b32_e32 v15, 0
+; GFX90A-NEXT:    v_mov_b32_e32 v16, 0
+; GFX90A-NEXT:    v_mov_b32_e32 v17, 0
+; GFX90A-NEXT:    v_mov_b32_e32 v18, 0
+; GFX90A-NEXT:    v_mov_b32_e32 v19, 0
+; GFX90A-NEXT:    v_mov_b32_e32 v20, 0
+; GFX90A-NEXT:    v_mov_b32_e32 v21, 0
+; GFX90A-NEXT:    v_mov_b32_e32 v22, 0
+; GFX90A-NEXT:    v_mov_b32_e32 v23, 0
+; GFX90A-NEXT:    v_mov_b32_e32 v24, 0
+; GFX90A-NEXT:    v_mov_b32_e32 v25, 0
+; GFX90A-NEXT:    v_mov_b32_e32 v26, 0
+; GFX90A-NEXT:    v_mov_b32_e32 v27, 0
+; GFX90A-NEXT:    v_mov_b32_e32 v28, 0
+; GFX90A-NEXT:    v_mov_b32_e32 v29, 0
+; GFX90A-NEXT:    v_mov_b32_e32 v30, 0
+; GFX90A-NEXT:    v_mov_b32_e32 v31, 0
+; GFX90A-NEXT:    v_mov_b32_e32 v32, 1.0
 ; GFX90A-NEXT:  .LBB2_1: ; %for.cond.preheader
 ; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX90A-NEXT:    s_nop 1
-; GFX90A-NEXT:    v_mfma_f32_32x32x1f32 a[0:31], v0, v1, a[0:31]
+; GFX90A-NEXT:    v_accvgpr_write_b32 a0, v0
+; GFX90A-NEXT:    v_accvgpr_write_b32 a1, v1
+; GFX90A-NEXT:    v_accvgpr_write_b32 a2, v2
+; GFX90A-NEXT:    v_accvgpr_write_b32 a3, v3
+; GFX90A-NEXT:    v_accvgpr_write_b32 a4, v4
+; GFX90A-NEXT:    v_accvgpr_write_b32 a5, v5
+; GFX90A-NEXT:    v_accvgpr_write_b32 a6, v6
+; GFX90A-NEXT:    v_accvgpr_write_b32 a7, v7
+; GFX90A-NEXT:    v_accvgpr_write_b32 a8, v8
+; GFX90A-NEXT:    v_accvgpr_write_b32 a9, v9
+; GFX90A-NEXT:    v_accvgpr_write_b32 a10, v10
+; GFX90A-NEXT:    v_accvgpr_write_b32 a11, v11
+; GFX90A-NEXT:    v_accvgpr_write_b32 a12, v12
+; GFX90A-NEXT:    v_accvgpr_write_b32 a13, v13
+; GFX90A-NEXT:    v_accvgpr_write_b32 a14, v14
+; GFX90A-NEXT:    v_accvgpr_write_b32 a15, v15
+; GFX90A-NEXT:    v_accvgpr_write_b32 a16, v16
+; GFX90A-NEXT:    v_accvgpr_write_b32 a17, v17
+; GFX90A-NEXT:    v_accvgpr_write_b32 a18, v18
+; GFX90A-NEXT:    v_accvgpr_write_b32 a19, v19
+; GFX90A-NEXT:    v_accvgpr_write_b32 a20, v20
+; GFX90A-NEXT:    v_accvgpr_write_b32 a21, v21
+; GFX90A-NEXT:    v_accvgpr_write_b32 a22, v22
+; GFX90A-NEXT:    v_accvgpr_write_b32 a23, v23
+; GFX90A-NEXT:    v_accvgpr_write_b32 a24, v24
+; GFX90A-NEXT:    v_accvgpr_write_b32 a25, v25
+; GFX90A-NEXT:    v_accvgpr_write_b32 a26, v26
+; GFX90A-NEXT:    v_accvgpr_write_b32 a27, v27
+; GFX90A-NEXT:    v_accvgpr_write_b32 a28, v28
+; GFX90A-NEXT:    v_accvgpr_write_b32 a29, v29
+; GFX90A-NEXT:    v_accvgpr_write_b32 a30, v30
+; GFX90A-NEXT:    v_accvgpr_write_b32 a31, v31
+; GFX90A-NEXT:    v_mov_b32_e32 v0, 2.0
 ; GFX90A-NEXT:    s_add_i32 s0, s0, -1
 ; GFX90A-NEXT:    s_cmp_lg_u32 s0, 0
+; GFX90A-NEXT:    v_mfma_f32_32x32x1f32 a[0:31], v32, v0, a[0:31]
+; GFX90A-NEXT:    s_nop 15
+; GFX90A-NEXT:    s_nop 2
+; GFX90A-NEXT:    v_accvgpr_read_b32 v0, a0
+; GFX90A-NEXT:    v_accvgpr_read_b32 v1, a1
+; GFX90A-NEXT:    v_accvgpr_read_b32 v2, a2
+; GFX90A-NEXT:    v_accvgpr_read_b32 v3, a3
+; GFX90A-NEXT:    v_accvgpr_read_b32 v4, a4
+; GFX90A-NEXT:    v_accvgpr_read_b32 v5, a5
+; GFX90A-NEXT:    v_accvgpr_read_b32 v6, a6
+; GFX90A-NEXT:    v_accvgpr_read_b32 v7, a7
+; GFX90A-NEXT:    v_accvgpr_read_b32 v8, a8
+; GFX90A-NEXT:    v_accvgpr_read_b32 v9, a9
+; GFX90A-NEXT:    v_accvgpr_read_b32 v10, a10
+; GFX90A-NEXT:    v_accvgpr_read_b32 v11, a11
+; GFX90A-NEXT:    v_accvgpr_read_b32 v12, a12
+; GFX90A-NEXT:    v_accvgpr_read_b32 v13, a13
+; GFX90A-NEXT:    v_accvgpr_read_b32 v14, a14
+; GFX90A-NEXT:    v_accvgpr_read_b32 v15, a15
+; GFX90A-NEXT:    v_accvgpr_read_b32 v16, a16
+; GFX90A-NEXT:    v_accvgpr_read_b32 v17, a17
+; GFX90A-NEXT:    v_accvgpr_read_b32 v18, a18
+; GFX90A-NEXT:    v_accvgpr_read_b32 v19, a19
+; GFX90A-NEXT:    v_accvgpr_read_b32 v20, a20
+; GFX90A-NEXT:    v_accvgpr_read_b32 v21, a21
+; GFX90A-NEXT:    v_accvgpr_read_b32 v22, a22
+; GFX90A-NEXT:    v_accvgpr_read_b32 v23, a23
+; GFX90A-NEXT:    v_accvgpr_read_b32 v24, a24
+; GFX90A-NEXT:    v_accvgpr_read_b32 v25, a25
+; GFX90A-NEXT:    v_accvgpr_read_b32 v26, a26
+; GFX90A-NEXT:    v_accvgpr_read_b32 v27, a27
+; GFX90A-NEXT:    v_accvgpr_read_b32 v28, a28
+; GFX90A-NEXT:    v_accvgpr_read_b32 v29, a29
+; GFX90A-NEXT:    v_accvgpr_read_b32 v30, a30
+; GFX90A-NEXT:    v_accvgpr_read_b32 v31, a31
 ; GFX90A-NEXT:    s_cbranch_scc1 .LBB2_1
 ; GFX90A-NEXT:  ; %bb.2: ; %exit
 ; GFX90A-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
 ; GFX90A-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT:    s_nop 12
 ; GFX90A-NEXT:    global_store_dwordx4 v0, a[28:31], s[0:1] offset:112
 ; GFX90A-NEXT:    global_store_dwordx4 v0, a[24:27], s[0:1] offset:96
 ; GFX90A-NEXT:    global_store_dwordx4 v0, a[20:23], s[0:1] offset:80
@@ -621,54 +936,117 @@ define amdgpu_kernel void @test_mfma_loop_non_splat(ptr addrspace(1) %arg) #0 {
 ;
 ; GFX942-LABEL: test_mfma_loop_non_splat:
 ; GFX942:       ; %bb.0: ; %entry
-; GFX942-NEXT:    v_mov_b32_e32 v0, 1.0
-; GFX942-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX942-NEXT:    s_mov_b32 s0, 16
-; GFX942-NEXT:    v_mov_b32_e32 v1, 2.0
-; GFX942-NEXT:    v_accvgpr_write_b32 a0, v2
-; GFX942-NEXT:    v_accvgpr_write_b32 a1, v0
-; GFX942-NEXT:    v_accvgpr_write_b32 a2, v2
-; GFX942-NEXT:    v_accvgpr_write_b32 a3, v2
-; GFX942-NEXT:    v_accvgpr_write_b32 a4, v2
-; GFX942-NEXT:    v_accvgpr_write_b32 a5, v2
-; GFX942-NEXT:    v_accvgpr_write_b32 a6, v2
-; GFX942-NEXT:    v_accvgpr_write_b32 a7, v2
-; GFX942-NEXT:    v_accvgpr_write_b32 a8, v2
-; GFX942-NEXT:    v_accvgpr_write_b32 a9, v2
-; GFX942-NEXT:    v_accvgpr_write_b32 a10, v2
-; GFX942-NEXT:    v_accvgpr_write_b32 a11, v2
-; GFX942-NEXT:    v_accvgpr_write_b32 a12, v2
-; GFX942-NEXT:    v_accvgpr_write_b32 a13, v2
-; GFX942-NEXT:    v_accvgpr_write_b32 a14, v2
-; GFX942-NEXT:    v_accvgpr_write_b32 a15, v2
-; GFX942-NEXT:    v_accvgpr_write_b32 a16, v2
-; GFX942-NEXT:    v_accvgpr_write_b32 a17, v2
-; GFX942-NEXT:    v_accvgpr_write_b32 a18, v2
-; GFX942-NEXT:    v_accvgpr_write_b32 a19, v2
-; GFX942-NEXT:    v_accvgpr_write_b32 a20, v2
-; GFX942-NEXT:    v_accvgpr_write_b32 a21, v2
-; GFX942-NEXT:    v_accvgpr_write_b32 a22, v2
-; GFX942-NEXT:    v_accvgpr_write_b32 a23, v2
-; GFX942-NEXT:    v_accvgpr_write_b32 a24, v2
-; GFX942-NEXT:    v_accvgpr_write_b32 a25, v2
-; GFX942-NEXT:    v_accvgpr_write_b32 a26, v2
-; GFX942-NEXT:    v_accvgpr_write_b32 a27, v2
-; GFX942-NEXT:    v_accvgpr_write_b32 a28, v2
-; GFX942-NEXT:    v_accvgpr_write_b32 a29, v2
-; GFX942-NEXT:    v_accvgpr_write_b32 a30, v2
-; GFX942-NEXT:    v_accvgpr_write_b32 a31, v2
+; GFX942-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-NEXT:    v_mov_b32_e32 v1, 1.0
+; GFX942-NEXT:    v_mov_b32_e32 v2, 0
+; GFX942-NEXT:    v_mov_b32_e32 v3, 0
+; GFX942-NEXT:    v_mov_b32_e32 v4, 0
+; GFX942-NEXT:    v_mov_b32_e32 v5, 0
+; GFX942-NEXT:    v_mov_b32_e32 v6, 0
+; GFX942-NEXT:    v_mov_b32_e32 v7, 0
+; GFX942-NEXT:    v_mov_b32_e32 v8, 0
+; GFX942-NEXT:    v_mov_b32_e32 v9, 0
+; GFX942-NEXT:    v_mov_b32_e32 v10, 0
+; GFX942-NEXT:    v_mov_b32_e32 v11, 0
+; GFX942-NEXT:    v_mov_b32_e32 v12, 0
+; GFX942-NEXT:    v_mov_b32_e32 v13, 0
+; GFX942-NEXT:    v_mov_b32_e32 v14, 0
+; GFX942-NEXT:    v_mov_b32_e32 v15, 0
+; GFX942-NEXT:    v_mov_b32_e32 v16, 0
+; GFX942-NEXT:    v_mov_b32_e32 v17, 0
+; GFX942-NEXT:    v_mov_b32_e32 v18, 0
+; GFX942-NEXT:    v_mov_b32_e32 v19, 0
+; GFX942-NEXT:    v_mov_b32_e32 v20, 0
+; GFX942-NEXT:    v_mov_b32_e32 v21, 0
+; GFX942-NEXT:    v_mov_b32_e32 v22, 0
+; GFX942-NEXT:    v_mov_b32_e32 v23, 0
+; GFX942-NEXT:    v_mov_b32_e32 v24, 0
+; GFX942-NEXT:    v_mov_b32_e32 v25, 0
+; GFX942-NEXT:    v_mov_b32_e32 v26, 0
+; GFX942-NEXT:    v_mov_b32_e32 v27, 0
+; GFX942-NEXT:    v_mov_b32_e32 v28, 0
+; GFX942-NEXT:    v_mov_b32_e32 v29, 0
+; GFX942-NEXT:    v_mov_b32_e32 v30, 0
+; GFX942-NEXT:    v_mov_b32_e32 v31, 0
+; GFX942-NEXT:    v_mov_b32_e32 v32, 1.0
 ; GFX942-NEXT:  .LBB2_1: ; %for.cond.preheader
 ; GFX942-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX942-NEXT:    s_nop 1
-; GFX942-NEXT:    v_mfma_f32_32x32x1_2b_f32 a[0:31], v0, v1, a[0:31]
+; GFX942-NEXT:    v_accvgpr_write_b32 a0, v0
+; GFX942-NEXT:    v_accvgpr_write_b32 a1, v1
+; GFX942-NEXT:    v_accvgpr_write_b32 a2, v2
+; GFX942-NEXT:    v_accvgpr_write_b32 a3, v3
+; GFX942-NEXT:    v_accvgpr_write_b32 a4, v4
+; GFX942-NEXT:    v_accvgpr_write_b32 a5, v5
+; GFX942-NEXT:    v_accvgpr_write_b32 a6, v6
+; GFX942-NEXT:    v_accvgpr_write_b32 a7, v7
+; GFX942-NEXT:    v_accvgpr_write_b32 a8, v8
+; GFX942-NEXT:    v_accvgpr_write_b32 a9, v9
+; GFX942-NEXT:    v_accvgpr_write_b32 a10, v10
+; GFX942-NEXT:    v_accvgpr_write_b32 a11, v11
+; GFX942-NEXT:    v_accvgpr_write_b32 a12, v12
+; GFX942-NEXT:    v_accvgpr_write_b32 a13, v13
+; GFX942-NEXT:    v_accvgpr_write_b32 a14, v14
+; GFX942-NEXT:    v_accvgpr_write_b32 a15, v15
+; GFX942-NEXT:    v_accvgpr_write_b32 a16, v16
+; GFX942-NEXT:    v_accvgpr_write_b32 a17, v17
+; GFX942-NEXT:    v_accvgpr_write_b32 a18, v18
+; GFX942-NEXT:    v_accvgpr_write_b32 a19, v19
+; GFX942-NEXT:    v_accvgpr_write_b32 a20, v20
+; GFX942-NEXT:    v_accvgpr_write_b32 a21, v21
+; GFX942-NEXT:    v_accvgpr_write_b32 a22, v22
+; GFX942-NEXT:    v_accvgpr_write_b32 a23, v23
+; GFX942-NEXT:    v_accvgpr_write_b32 a24, v24
+; GFX942-NEXT:    v_accvgpr_write_b32 a25, v25
+; GFX942-NEXT:    v_accvgpr_write_b32 a26, v26
+; GFX942-NEXT:    v_accvgpr_write_b32 a27, v27
+; GFX942-NEXT:    v_accvgpr_write_b32 a28, v28
+; GFX942-NEXT:    v_accvgpr_write_b32 a29, v29
+; GFX942-NEXT:    v_accvgpr_write_b32 a30, v30
+; GFX942-NEXT:    v_accvgpr_write_b32 a31, v31
+; GFX942-NEXT:    v_mov_b32_e32 v0, 2.0
 ; GFX942-NEXT:    s_add_i32 s0, s0, -1
 ; GFX942-NEXT:    s_cmp_lg_u32 s0, 0
+; GFX942-NEXT:    v_mfma_f32_32x32x1_2b_f32 a[0:31], v32, v0, a[0:31]
+; GFX942-NEXT:    s_nop 15
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_accvgpr_read_b32 v0, a0
+; GFX942-NEXT:    v_accvgpr_read_b32 v1, a1
+; GFX942-NEXT:    v_accvgpr_read_b32 v2, a2
+; GFX942-NEXT:    v_accvgpr_read_b32 v3, a3
+; GFX942-NEXT:    v_accvgpr_read_b32 v4, a4
+; GFX942-NEXT:    v_accvgpr_read_b32 v5, a5
+; GFX942-NEXT:    v_accvgpr_read_b32 v6, a6
+; GFX942-NEXT:    v_accvgpr_read_b32 v7, a7
+; GFX942-NEXT:    v_accvgpr_read_b32 v8, a8
+; GFX942-NEXT:    v_accvgpr_read_b32 v9, a9
+; GFX942-NEXT:    v_accvgpr_read_b32 v10, a10
+; GFX942-NEXT:    v_accvgpr_read_b32 v11, a11
+; GFX942-NEXT:    v_accvgpr_read_b32 v12, a12
+; GFX942-NEXT:    v_accvgpr_read_b32 v13, a13
+; GFX942-NEXT:    v_accvgpr_read_b32 v14, a14
+; GFX942-NEXT:    v_accvgpr_read_b32 v15, a15
+; GFX942-NEXT:    v_accvgpr_read_b32 v16, a16
+; GFX942-NEXT:    v_accvgpr_read_b32 v17, a17
+; GFX942-NEXT:    v_accvgpr_read_b32 v18, a18
+; GFX942-NEXT:    v_accvgpr_read_b32 v19, a19
+; GFX942-NEXT:    v_accvgpr_read_b32 v20, a20
+; GFX942-NEXT:    v_accvgpr_read_b32 v21, a21
+; GFX942-NEXT:    v_accvgpr_read_b32 v22, a22
+; GFX942-NEXT:    v_accvgpr_read_b32 v23, a23
+; GFX942-NEXT:    v_accvgpr_read_b32 v24, a24
+; GFX942-NEXT:    v_accvgpr_read_b32 v25, a25
+; GFX942-NEXT:    v_accvgpr_read_b32 v26, a26
+; GFX942-NEXT:    v_accvgpr_read_b32 v27, a27
+; GFX942-NEXT:    v_accvgpr_read_b32 v28, a28
+; GFX942-NEXT:    v_accvgpr_read_b32 v29, a29
+; GFX942-NEXT:    v_accvgpr_read_b32 v30, a30
+; GFX942-NEXT:    v_accvgpr_read_b32 v31, a31
 ; GFX942-NEXT:    s_cbranch_scc1 .LBB2_1
 ; GFX942-NEXT:  ; %bb.2: ; %exit
 ; GFX942-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
 ; GFX942-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX942-NEXT:    s_nop 11
 ; GFX942-NEXT:    global_store_dwordx4 v0, a[28:31], s[0:1] offset:112
 ; GFX942-NEXT:    global_store_dwordx4 v0, a[24:27], s[0:1] offset:96
 ; GFX942-NEXT:    global_store_dwordx4 v0, a[20:23], s[0:1] offset:80
@@ -825,85 +1203,117 @@ define amdgpu_kernel void @test_mfma_loop_unfoldable_seq(ptr addrspace(1) %arg)
 ;
 ; GFX90A-LABEL: test_mfma_loop_unfoldable_seq:
 ; GFX90A:       ; %bb.0: ; %entry
-; GFX90A-NEXT:    v_mov_b32_e32 v0, 0x431a0000
-; GFX90A-NEXT:    v_mov_b32_e32 v1, 0x43190000
-; GFX90A-NEXT:    v_mov_b32_e32 v2, 0x43180000
-; GFX90A-NEXT:    v_mov_b32_e32 v3, 0x43170000
-; GFX90A-NEXT:    v_mov_b32_e32 v4, 0x43160000
-; GFX90A-NEXT:    v_mov_b32_e32 v5, 0x43150000
-; GFX90A-NEXT:    v_mov_b32_e32 v6, 0x43140000
-; GFX90A-NEXT:    v_mov_b32_e32 v7, 0x43130000
-; GFX90A-NEXT:    v_mov_b32_e32 v8, 0x43120000
-; GFX90A-NEXT:    v_mov_b32_e32 v9, 0x43110000
-; GFX90A-NEXT:    v_mov_b32_e32 v10, 0x43100000
-; GFX90A-NEXT:    v_mov_b32_e32 v11, 0x430f0000
-; GFX90A-NEXT:    v_mov_b32_e32 v12, 0x430e0000
-; GFX90A-NEXT:    v_mov_b32_e32 v13, 0x430d0000
-; GFX90A-NEXT:    v_mov_b32_e32 v14, 0x430c0000
-; GFX90A-NEXT:    v_mov_b32_e32 v15, 0x430b0000
-; GFX90A-NEXT:    v_mov_b32_e32 v16, 0x430a0000
-; GFX90A-NEXT:    v_mov_b32_e32 v17, 0x43090000
-; GFX90A-NEXT:    v_mov_b32_e32 v18, 0x43080000
-; GFX90A-NEXT:    v_mov_b32_e32 v19, 0x43070000
-; GFX90A-NEXT:    v_mov_b32_e32 v20, 0x43060000
-; GFX90A-NEXT:    v_mov_b32_e32 v21, 0x43050000
-; GFX90A-NEXT:    v_mov_b32_e32 v22, 0x43040000
-; GFX90A-NEXT:    v_mov_b32_e32 v23, 0x43030000
-; GFX90A-NEXT:    v_mov_b32_e32 v24, 0x43020000
-; GFX90A-NEXT:    v_mov_b32_e32 v25, 0x43010000
-; GFX90A-NEXT:    v_mov_b32_e32 v26, 0x43000000
-; GFX90A-NEXT:    v_mov_b32_e32 v27, 0x42fe0000
-; GFX90A-NEXT:    v_mov_b32_e32 v28, 0x42fc0000
-; GFX90A-NEXT:    v_mov_b32_e32 v29, 0x42fa0000
-; GFX90A-NEXT:    v_mov_b32_e32 v30, 0x42f80000
-; GFX90A-NEXT:    v_mov_b32_e32 v31, 0x42f60000
 ; GFX90A-NEXT:    s_mov_b32 s0, 16
-; GFX90A-NEXT:    v_accvgpr_write_b32 a0, v31
-; GFX90A-NEXT:    v_accvgpr_write_b32 a1, v30
-; GFX90A-NEXT:    v_accvgpr_write_b32 a2, v29
-; GFX90A-NEXT:    v_accvgpr_write_b32 a3, v28
-; GFX90A-NEXT:    v_accvgpr_write_b32 a4, v27
-; GFX90A-NEXT:    v_accvgpr_write_b32 a5, v26
-; GFX90A-NEXT:    v_accvgpr_write_b32 a6, v25
-; GFX90A-NEXT:    v_accvgpr_write_b32 a7, v24
-; GFX90A-NEXT:    v_accvgpr_write_b32 a8, v23
-; GFX90A-NEXT:    v_accvgpr_write_b32 a9, v22
-; GFX90A-NEXT:    v_accvgpr_write_b32 a10, v21
-; GFX90A-NEXT:    v_accvgpr_write_b32 a11, v20
-; GFX90A-NEXT:    v_accvgpr_write_b32 a12, v19
-; GFX90A-NEXT:    v_accvgpr_write_b32 a13, v18
-; GFX90A-NEXT:    v_accvgpr_write_b32 a14, v17
-; GFX90A-NEXT:    v_accvgpr_write_b32 a15, v16
-; GFX90A-NEXT:    v_accvgpr_write_b32 a16, v15
-; GFX90A-NEXT:    v_accvgpr_write_b32 a17, v14
-; GFX90A-NEXT:    v_accvgpr_write_b32 a18, v13
-; GFX90A-NEXT:    v_accvgpr_write_b32 a19, v12
-; GFX90A-NEXT:    v_accvgpr_write_b32 a20, v11
-; GFX90A-NEXT:    v_accvgpr_write_b32 a21, v10
-; GFX90A-NEXT:    v_accvgpr_write_b32 a22, v9
-; GFX90A-NEXT:    v_accvgpr_write_b32 a23, v8
-; GFX90A-NEXT:    v_accvgpr_write_b32 a24, v7
-; GFX90A-NEXT:    v_accvgpr_write_b32 a25, v6
-; GFX90A-NEXT:    v_accvgpr_write_b32 a26, v5
-; GFX90A-NEXT:    v_accvgpr_write_b32 a27, v4
-; GFX90A-NEXT:    v_accvgpr_write_b32 a28, v3
-; GFX90A-NEXT:    v_accvgpr_write_b32 a29, v2
-; GFX90A-NEXT:    v_accvgpr_write_b32 a30, v1
-; GFX90A-NEXT:    v_accvgpr_write_b32 a31, v0
-; GFX90A-NEXT:    v_mov_b32_e32 v0, 1.0
-; GFX90A-NEXT:    v_mov_b32_e32 v1, 2.0
+; GFX90A-NEXT:    v_mov_b32_e32 v0, 0x42f60000
+; GFX90A-NEXT:    v_mov_b32_e32 v1, 0x42f80000
+; GFX90A-NEXT:    v_mov_b32_e32 v2, 0x42fa0000
+; GFX90A-NEXT:    v_mov_b32_e32 v3, 0x42fc0000
+; GFX90A-NEXT:    v_mov_b32_e32 v4, 0x42fe0000
+; GFX90A-NEXT:    v_mov_b32_e32 v5, 0x43000000
+; GFX90A-NEXT:    v_mov_b32_e32 v6, 0x43010000
+; GFX90A-NEXT:    v_mov_b32_e32 v7, 0x43020000
+; GFX90A-NEXT:    v_mov_b32_e32 v8, 0x43030000
+; GFX90A-NEXT:    v_mov_b32_e32 v9, 0x43040000
+; GFX90A-NEXT:    v_mov_b32_e32 v10, 0x43050000
+; GFX90A-NEXT:    v_mov_b32_e32 v11, 0x43060000
+; GFX90A-NEXT:    v_mov_b32_e32 v12, 0x43070000
+; GFX90A-NEXT:    v_mov_b32_e32 v13, 0x43080000
+; GFX90A-NEXT:    v_mov_b32_e32 v14, 0x43090000
+; GFX90A-NEXT:    v_mov_b32_e32 v15, 0x430a0000
+; GFX90A-NEXT:    v_mov_b32_e32 v16, 0x430b0000
+; GFX90A-NEXT:    v_mov_b32_e32 v17, 0x430c0000
+; GFX90A-NEXT:    v_mov_b32_e32 v18, 0x430d0000
+; GFX90A-NEXT:    v_mov_b32_e32 v19, 0x430e0000
+; GFX90A-NEXT:    v_mov_b32_e32 v20, 0x430f0000
+; GFX90A-NEXT:    v_mov_b32_e32 v21, 0x43100000
+; GFX90A-NEXT:    v_mov_b32_e32 v22, 0x43110000
+; GFX90A-NEXT:    v_mov_b32_e32 v23, 0x43120000
+; GFX90A-NEXT:    v_mov_b32_e32 v24, 0x43130000
+; GFX90A-NEXT:    v_mov_b32_e32 v25, 0x43140000
+; GFX90A-NEXT:    v_mov_b32_e32 v26, 0x43150000
+; GFX90A-NEXT:    v_mov_b32_e32 v27, 0x43160000
+; GFX90A-NEXT:    v_mov_b32_e32 v28, 0x43170000
+; GFX90A-NEXT:    v_mov_b32_e32 v29, 0x43180000
+; GFX90A-NEXT:    v_mov_b32_e32 v30, 0x43190000
+; GFX90A-NEXT:    v_mov_b32_e32 v31, 0x431a0000
+; GFX90A-NEXT:    v_mov_b32_e32 v32, 1.0
 ; GFX90A-NEXT:  .LBB3_1: ; %for.cond.preheader
 ; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX90A-NEXT:    s_nop 1
-; GFX90A-NEXT:    v_mfma_f32_32x32x1f32 a[0:31], v0, v1, a[0:31]
+; GFX90A-NEXT:    v_accvgpr_write_b32 a0, v0
+; GFX90A-NEXT:    v_accvgpr_write_b32 a1, v1
+; GFX90A-NEXT:    v_accvgpr_write_b32 a2, v2
+; GFX90A-NEXT:    v_accvgpr_write_b32 a3, v3
+; GFX90A-NEXT:    v_accvgpr_write_b32 a4, v4
+; GFX90A-NEXT:    v_accvgpr_write_b32 a5, v5
+; GFX90A-NEXT:    v_accvgpr_write_b32 a6, v6
+; GFX90A-NEXT:    v_accvgpr_write_b32 a7, v7
+; GFX90A-NEXT:    v_accvgpr_write_b32 a8, v8
+; GFX90A-NEXT:    v_accvgpr_write_b32 a9, v9
+; GFX90A-NEXT:    v_accvgpr_write_b32 a10, v10
+; GFX90A-NEXT:    v_accvgpr_write_b32 a11, v11
+; GFX90A-NEXT:    v_accvgpr_write_b32 a12, v12
+; GFX90A-NEXT:    v_accvgpr_write_b32 a13, v13
+; GFX90A-NEXT:    v_accvgpr_write_b32 a14, v14
+; GFX90A-NEXT:    v_accvgpr_write_b32 a15, v15
+; GFX90A-NEXT:    v_accvgpr_write_b32 a16, v16
+; GFX90A-NEXT:    v_accvgpr_write_b32 a17, v17
+; GFX90A-NEXT:    v_accvgpr_write_b32 a18, v18
+; GFX90A-NEXT:    v_accvgpr_write_b32 a19, v19
+; GFX90A-NEXT:    v_accvgpr_write_b32 a20, v20
+; GFX90A-NEXT:    v_accvgpr_write_b32 a21, v21
+; GFX90A-NEXT:    v_accvgpr_write_b32 a22, v22
+; GFX90A-NEXT:    v_accvgpr_write_b32 a23, v23
+; GFX90A-NEXT:    v_accvgpr_write_b32 a24, v24
+; GFX90A-NEXT:    v_accvgpr_write_b32 a25, v25
+; GFX90A-NEXT:    v_accvgpr_write_b32 a26, v26
+; GFX90A-NEXT:    v_accvgpr_write_b32 a27, v27
+; GFX90A-NEXT:    v_accvgpr_write_b32 a28, v28
+; GFX90A-NEXT:    v_accvgpr_write_b32 a29, v29
+; GFX90A-NEXT:    v_accvgpr_write_b32 a30, v30
+; GFX90A-NEXT:    v_accvgpr_write_b32 a31, v31
+; GFX90A-NEXT:    v_mov_b32_e32 v0, 2.0
 ; GFX90A-NEXT:    s_add_i32 s0, s0, -1
 ; GFX90A-NEXT:    s_cmp_lg_u32 s0, 0
+; GFX90A-NEXT:    v_mfma_f32_32x32x1f32 a[0:31], v32, v0, a[0:31]
+; GFX90A-NEXT:    s_nop 15
+; GFX90A-NEXT:    s_nop 2
+; GFX90A-NEXT:    v_accvgpr_read_b32 v0, a0
+; GFX90A-NEXT:    v_accvgpr_read_b32 v1, a1
+; GFX90A-NEXT:    v_accvgpr_read_b32 v2, a2
+; GFX90A-NEXT:    v_accvgpr_read_b32 v3, a3
+; GFX90A-NEXT:    v_accvgpr_read_b32 v4, a4
+; GFX90A-NEXT:    v_accvgpr_read_b32 v5, a5
+; GFX90A-NEXT:    v_accvgpr_read_b32 v6, a6
+; GFX90A-NEXT:    v_accvgpr_read_b32 v7, a7
+; GFX90A-NEXT:    v_accvgpr_read_b32 v8, a8
+; GFX90A-NEXT:    v_accvgpr_read_b32 v9, a9
+; GFX90A-NEXT:    v_accvgpr_read_b32 v10, a10
+; GFX90A-NEXT:    v_accvgpr_read_b32 v11, a11
+; GFX90A-NEXT:    v_accvgpr_read_b32 v12, a12
+; GFX90A-NEXT:    v_accvgpr_read_b32 v13, a13
+; GFX90A-NEXT:    v_accvgpr_read_b32 v14, a14
+; GFX90A-NEXT:    v_accvgpr_read_b32 v15, a15
+; GFX90A-NEXT:    v_accvgpr_read_b32 v16, a16
+; GFX90A-NEXT:    v_accvgpr_read_b32 v17, a17
+; GFX90A-NEXT:    v_accvgpr_read_b32 v18, a18
+; GFX90A-NEXT:    v_accvgpr_read_b32 v19, a19
+; GFX90A-NEXT:    v_accvgpr_read_b32 v20, a20
+; GFX90A-NEXT:    v_accvgpr_read_b32 v21, a21
+; GFX90A-NEXT:    v_accvgpr_read_b32 v22, a22
+; GFX90A-NEXT:    v_accvgpr_read_b32 v23, a23
+; GFX90A-NEXT:    v_accvgpr_read_b32 v24, a24
+; GFX90A-NEXT:    v_accvgpr_read_b32 v25, a25
+; GFX90A-NEXT:    v_accvgpr_read_b32 v26, a26
+; GFX90A-NEXT:    v_accvgpr_read_b32 v27, a27
+; GFX90A-NEXT:    v_accvgpr_read_b32 v28, a28
+; GFX90A-NEXT:    v_accvgpr_read_b32 v29, a29
+; GFX90A-NEXT:    v_accvgpr_read_b32 v30, a30
+; GFX90A-NEXT:    v_accvgpr_read_b32 v31, a31
 ; GFX90A-NEXT:    s_cbranch_scc1 .LBB3_1
 ; GFX90A-NEXT:  ; %bb.2: ; %exit
 ; GFX90A-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
 ; GFX90A-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT:    s_nop 12
 ; GFX90A-NEXT:    global_store_dwordx4 v0, a[28:31], s[0:1] offset:112
 ; GFX90A-NEXT:    global_store_dwordx4 v0, a[24:27], s[0:1] offset:96
 ; GFX90A-NEXT:    global_store_dwordx4 v0, a[20:23], s[0:1] offset:80
@@ -916,85 +1326,117 @@ define amdgpu_kernel void @test_mfma_loop_unfoldable_seq(ptr addrspace(1) %arg)
 ;
 ; GFX942-LABEL: test_mfma_loop_unfoldable_seq:
 ; GFX942:       ; %bb.0: ; %entry
-; GFX942-NEXT:    v_mov_b32_e32 v0, 0x431a0000
-; GFX942-NEXT:    v_mov_b32_e32 v1, 0x43190000
-; GFX942-NEXT:    v_mov_b32_e32 v2, 0x43180000
-; GFX942-NEXT:    v_mov_b32_e32 v3, 0x43170000
-; GFX942-NEXT:    v_mov_b32_e32 v4, 0x43160000
-; GFX942-NEXT:    v_mov_b32_e32 v5, 0x43150000
-; GFX942-NEXT:    v_mov_b32_e32 v6, 0x43140000
-; GFX942-NEXT:    v_mov_b32_e32 v7, 0x43130000
-; GFX942-NEXT:    v_mov_b32_e32 v8, 0x43120000
-; GFX942-NEXT:    v_mov_b32_e32 v9, 0x43110000
-; GFX942-NEXT:    v_mov_b32_e32 v10, 0x43100000
-; GFX942-NEXT:    v_mov_b32_e32 v11, 0x430f0000
-; GFX942-NEXT:    v_mov_b32_e32 v12, 0x430e0000
-; GFX942-NEXT:    v_mov_b32_e32 v13, 0x430d0000
-; GFX942-NEXT:    v_mov_b32_e32 v14, 0x430c0000
-; GFX942-NEXT:    v_mov_b32_e32 v15, 0x430b0000
-; GFX942-NEXT:    v_mov_b32_e32 v16, 0x430a0000
-; GFX942-NEXT:    v_mov_b32_e32 v17, 0x43090000
-; GFX942-NEXT:    v_mov_b32_e32 v18, 0x43080000
-; GFX942-NEXT:    v_mov_b32_e32 v19, 0x43070000
-; GFX942-NEXT:    v_mov_b32_e32 v20, 0x43060000
-; GFX942-NEXT:    v_mov_b32_e32 v21, 0x43050000
-; GFX942-NEXT:    v_mov_b32_e32 v22, 0x43040000
-; GFX942-NEXT:    v_mov_b32_e32 v23, 0x43030000
-; GFX942-NEXT:    v_mov_b32_e32 v24, 0x43020000
-; GFX942-NEXT:    v_mov_b32_e32 v25, 0x43010000
-; GFX942-NEXT:    v_mov_b32_e32 v26, 0x43000000
-; GFX942-NEXT:    v_mov_b32_e32 v27, 0x42fe0000
-; GFX942-NEXT:    v_mov_b32_e32 v28, 0x42fc0000
-; GFX942-NEXT:    v_mov_b32_e32 v29, 0x42fa0000
-; GFX942-NEXT:    v_mov_b32_e32 v30, 0x42f80000
-; GFX942-NEXT:    v_mov_b32_e32 v31, 0x42f60000
 ; GFX942-NEXT:    s_mov_b32 s0, 16
-; GFX942-NEXT:    v_accvgpr_write_b32 a0, v31
-; GFX942-NEXT:    v_accvgpr_write_b32 a1, v30
-; GFX942-NEXT:    v_accvgpr_write_b32 a2, v29
-; GFX942-NEXT:    v_accvgpr_write_b32 a3, v28
-; GFX942-NEXT:    v_accvgpr_write_b32 a4, v27
-; GFX942-NEXT:    v_accvgpr_write_b32 a5, v26
-; GFX942-NEXT:    v_accvgpr_write_b32 a6, v25
-; GFX942-NEXT:    v_accvgpr_write_b32 a7, v24
-; GFX942-NEXT:    v_accvgpr_write_b32 a8, v23
-; GFX942-NEXT:    v_accvgpr_write_b32 a9, v22
-; GFX942-NEXT:    v_accvgpr_write_b32 a10, v21
-; GFX942-NEXT:    v_accvgpr_write_b32 a11, v20
-; GFX942-NEXT:    v_accvgpr_write_b32 a12, v19
-; GFX942-NEXT:    v_accvgpr_write_b32 a13, v18
-; GFX942-NEXT:    v_accvgpr_write_b32 a14, v17
-; GFX942-NEXT:    v_accvgpr_write_b32 a15, v16
-; GFX942-NEXT:    v_accvgpr_write_b32 a16, v15
-; GFX942-NEXT:    v_accvgpr_write_b32 a17, v14
-; GFX942-NEXT:    v_accvgpr_write_b32 a18, v13
-; GFX942-NEXT:    v_accvgpr_write_b32 a19, v12
-; GFX942-NEXT:    v_accvgpr_write_b32 a20, v11
-; GFX942-NEXT:    v_accvgpr_write_b32 a21, v10
-; GFX942-NEXT:    v_accvgpr_write_b32 a22, v9
-; GFX942-NEXT:    v_accvgpr_write_b32 a23, v8
-; GFX942-NEXT:    v_accvgpr_write_b32 a24, v7
-; GFX942-NEXT:    v_accvgpr_write_b32 a25, v6
-; GFX942-NEXT:    v_accvgpr_write_b32 a26, v5
-; GFX942-NEXT:    v_accvgpr_write_b32 a27, v4
-; GFX942-NEXT:    v_accvgpr_write_b32 a28, v3
-; GFX942-NEXT:    v_accvgpr_write_b32 a29, v2
-; GFX942-NEXT:    v_accvgpr_write_b32 a30, v1
-; GFX942-NEXT:    v_accvgpr_write_b32 a31, v0
-; GFX942-NEXT:    v_mov_b32_e32 v0, 1.0
-; GFX942-NEXT:    v_mov_b32_e32 v1, 2.0
+; GFX942-NEXT:    v_mov_b32_e32 v0, 0x42f60000
+; GFX942-NEXT:    v_mov_b32_e32 v1, 0x42f80000
+; GFX942-NEXT:    v_mov_b32_e32 v2, 0x42fa0000
+; GFX942-NEXT:    v_mov_b32_e32 v3, 0x42fc0000
+; GFX942-NEXT:    v_mov_b32_e32 v4, 0x42fe0000
+; GFX942-NEXT:    v_mov_b32_e32 v5, 0x43000000
+; GFX942-NEXT:    v_mov_b32_e32 v6, 0x43010000
+; GFX942-NEXT:    v_mov_b32_e32 v7, 0x43020000
+; GFX942-NEXT:    v_mov_b32_e32 v8, 0x43030000
+; GFX942-NEXT:    v_mov_b32_e32 v9, 0x43040000
+; GFX942-NEXT:    v_mov_b32_e32 v10, 0x43050000
+; GFX942-NEXT:    v_mov_b32_e32 v11, 0x43060000
+; GFX942-NEXT:    v_mov_b32_e32 v12, 0x43070000
+; GFX942-NEXT:    v_mov_b32_e32 v13, 0x43080000
+; GFX942-NEXT:    v_mov_b32_e32 v14, 0x43090000
+; GFX942-NEXT:    v_mov_b32_e32 v15, 0x430a0000
+; GFX942-NEXT:    v_mov_b32_e32 v16, 0x430b0000
+; GFX942-NEXT:    v_mov_b32_e32 v17, 0x430c0000
+; GFX942-NEXT:    v_mov_b32_e32 v18, 0x430d0000
+; GFX942-NEXT:    v_mov_b32_e32 v19, 0x430e0000
+; GFX942-NEXT:    v_mov_b32_e32 v20, 0x430f0000
+; GFX942-NEXT:    v_mov_b32_e32 v21, 0x43100000
+; GFX942-NEXT:    v_mov_b32_e32 v22, 0x43110000
+; GFX942-NEXT:    v_mov_b32_e32 v23, 0x43120000
+; GFX942-NEXT:    v_mov_b32_e32 v24, 0x43130000
+; GFX942-NEXT:    v_mov_b32_e32 v25, 0x43140000
+; GFX942-NEXT:    v_mov_b32_e32 v26, 0x43150000
+; GFX942-NEXT:    v_mov_b32_e32 v27, 0x43160000
+; GFX942-NEXT:    v_mov_b32_e32 v28, 0x43170000
+; GFX942-NEXT:    v_mov_b32_e32 v29, 0x43180000
+; GFX942-NEXT:    v_mov_b32_e32 v30, 0x43190000
+; GFX942-NEXT:    v_mov_b32_e32 v31, 0x431a0000
+; GFX942-NEXT:    v_mov_b32_e32 v32, 1.0
 ; GFX942-NEXT:  .LBB3_1: ; %for.cond.preheader
 ; GFX942-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX942-NEXT:    s_nop 1
-; GFX942-NEXT:    v_mfma_f32_32x32x1_2b_f32 a[0:31], v0, v1, a[0:31]
+; GFX942-NEXT:    v_accvgpr_write_b32 a0, v0
+; GFX942-NEXT:    v_accvgpr_write_b32 a1, v1
+; GFX942-NEXT:    v_accvgpr_write_b32 a2, v2
+; GFX942-NEXT:    v_accvgpr_write_b32 a3, v3
+; GFX942-NEXT:    v_accvgpr_write_b32 a4, v4
+; GFX942-NEXT:    v_accvgpr_write_b32 a5, v5
+; GFX942-NEXT:    v_accvgpr_write_b32 a6, v6
+; GFX942-NEXT:    v_accvgpr_write_b32 a7, v7
+; GFX942-NEXT:    v_accvgpr_write_b32 a8, v8
+; GFX942-NEXT:    v_accvgpr_write_b32 a9, v9
+; GFX942-NEXT:    v_accvgpr_write_b32 a10, v10
+; GFX942-NEXT:    v_accvgpr_write_b32 a11, v11
+; GFX942-NEXT:    v_accvgpr_write_b32 a12, v12
+; GFX942-NEXT:    v_accvgpr_write_b32 a13, v13
+; GFX942-NEXT:    v_accvgpr_write_b32 a14, v14
+; GFX942-NEXT:    v_accvgpr_write_b32 a15, v15
+; GFX942-NEXT:    v_accvgpr_write_b32 a16, v16
+; GFX942-NEXT:    v_accvgpr_write_b32 a17, v17
+; GFX942-NEXT:    v_accvgpr_write_b32 a18, v18
+; GFX942-NEXT:    v_accvgpr_write_b32 a19, v19
+; GFX942-NEXT:    v_accvgpr_write_b32 a20, v20
+; GFX942-NEXT:    v_accvgpr_write_b32 a21, v21
+; GFX942-NEXT:    v_accvgpr_write_b32 a22, v22
+; GFX942-NEXT:    v_accvgpr_write_b32 a23, v23
+; GFX942-NEXT:    v_accvgpr_write_b32 a24, v24
+; GFX942-NEXT:    v_accvgpr_write_b32 a25, v25
+; GFX942-NEXT:    v_accvgpr_write_b32 a26, v26
+; GFX942-NEXT:    v_accvgpr_write_b32 a27, v27
+; GFX942-NEXT:    v_accvgpr_write_b32 a28, v28
+; GFX942-NEXT:    v_accvgpr_write_b32 a29, v29
+; GFX942-NEXT:    v_accvgpr_write_b32 a30, v30
+; GFX942-NEXT:    v_accvgpr_write_b32 a31, v31
+; GFX942-NEXT:    v_mov_b32_e32 v0, 2.0
 ; GFX942-NEXT:    s_add_i32 s0, s0, -1
 ; GFX942-NEXT:    s_cmp_lg_u32 s0, 0
+; GFX942-NEXT:    v_mfma_f32_32x32x1_2b_f32 a[0:31], v32, v0, a[0:31]
+; GFX942-NEXT:    s_nop 15
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_accvgpr_read_b32 v0, a0
+; GFX942-NEXT:    v_accvgpr_read_b32 v1, a1
+; GFX942-NEXT:    v_accvgpr_read_b32 v2, a2
+; GFX942-NEXT:    v_accvgpr_read_b32 v3, a3
+; GFX942-NEXT:    v_accvgpr_read_b32 v4, a4
+; GFX942-NEXT:    v_accvgpr_read_b32 v5, a5
+; GFX942-NEXT:    v_accvgpr_read_b32 v6, a6
+; GFX942-NEXT:    v_accvgpr_read_b32 v7, a7
+; GFX942-NEXT:    v_accvgpr_read_b32 v8, a8
+; GFX942-NEXT:    v_accvgpr_read_b32 v9, a9
+; GFX942-NEXT:    v_accvgpr_read_b32 v10, a10
+; GFX942-NEXT:    v_accvgpr_read_b32 v11, a11
+; GFX942-NEXT:    v_accvgpr_read_b32 v12, a12
+; GFX942-NEXT:    v_accvgpr_read_b32 v13, a13
+; GFX942-NEXT:    v_accvgpr_read_b32 v14, a14
+; GFX942-NEXT:    v_accvgpr_read_b32 v15, a15
+; GFX942-NEXT:    v_accvgpr_read_b32 v16, a16
+; GFX942-NEXT:    v_accvgpr_read_b32 v17, a17
+; GFX942-NEXT:    v_accvgpr_read_b32 v18, a18
+; GFX942-NEXT:    v_accvgpr_read_b32 v19, a19
+; GFX942-NEXT:    v_accvgpr_read_b32 v20, a20
+; GFX942-NEXT:    v_accvgpr_read_b32 v21, a21
+; GFX942-NEXT:    v_accvgpr_read_b32 v22, a22
+; GFX942-NEXT:    v_accvgpr_read_b32 v23, a23
+; GFX942-NEXT:    v_accvgpr_read_b32 v24, a24
+; GFX942-NEXT:    v_accvgpr_read_b32 v25, a25
+; GFX942-NEXT:    v_accvgpr_read_b32 v26, a26
+; GFX942-NEXT:    v_accvgpr_read_b32 v27, a27
+; GFX942-NEXT:    v_accvgpr_read_b32 v28, a28
+; GFX942-NEXT:    v_accvgpr_read_b32 v29, a29
+; GFX942-NEXT:    v_accvgpr_read_b32 v30, a30
+; GFX942-NEXT:    v_accvgpr_read_b32 v31, a31
 ; GFX942-NEXT:    s_cbranch_scc1 .LBB3_1
 ; GFX942-NEXT:  ; %bb.2: ; %exit
 ; GFX942-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
 ; GFX942-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX942-NEXT:    s_nop 11
 ; GFX942-NEXT:    global_store_dwordx4 v0, a[28:31], s[0:1] offset:112
 ; GFX942-NEXT:    global_store_dwordx4 v0, a[24:27], s[0:1] offset:96
 ; GFX942-NEXT:    global_store_dwordx4 v0, a[20:23], s[0:1] offset:80
@@ -1114,54 +1556,117 @@ define amdgpu_kernel void @test_mfma_loop_vgpr_init(ptr addrspace(1) %arg) #0 {
 ;
 ; GFX90A-LABEL: test_mfma_loop_vgpr_init:
 ; GFX90A:       ; %bb.0: ; %entry
-; GFX90A-NEXT:    v_and_b32_e32 v2, 0x3ff, v0
+; GFX90A-NEXT:    v_and_b32_e32 v31, 0x3ff, v0
 ; GFX90A-NEXT:    s_mov_b32 s0, 16
-; GFX90A-NEXT:    v_mov_b32_e32 v0, 2.0
-; GFX90A-NEXT:    v_mov_b32_e32 v1, 1.0
-; GFX90A-NEXT:    v_accvgpr_write_b32 a0, v2
-; GFX90A-NEXT:    v_accvgpr_write_b32 a1, v2
-; GFX90A-NEXT:    v_accvgpr_write_b32 a2, v2
-; GFX90A-NEXT:    v_accvgpr_write_b32 a3, v2
-; GFX90A-NEXT:    v_accvgpr_write_b32 a4, v2
-; GFX90A-NEXT:    v_accvgpr_write_b32 a5, v2
-; GFX90A-NEXT:    v_accvgpr_write_b32 a6, v2
-; GFX90A-NEXT:    v_accvgpr_write_b32 a7, v2
-; GFX90A-NEXT:    v_accvgpr_write_b32 a8, v2
-; GFX90A-NEXT:    v_accvgpr_write_b32 a9, v2
-; GFX90A-NEXT:    v_accvgpr_write_b32 a10, v2
-; GFX90A-NEXT:    v_accvgpr_write_b32 a11, v2
-; GFX90A-NEXT:    v_accvgpr_write_b32 a12, v2
-; GFX90A-NEXT:    v_accvgpr_write_b32 a13, v2
-; GFX90A-NEXT:    v_accvgpr_write_b32 a14, v2
-; GFX90A-NEXT:    v_accvgpr_write_b32 a15, v2
-; GFX90A-NEXT:    v_accvgpr_write_b32 a16, v2
-; GFX90A-NEXT:    v_accvgpr_write_b32 a17, v2
-; GFX90A-NEXT:    v_accvgpr_write_b32 a18, v2
-; GFX90A-NEXT:    v_accvgpr_write_b32 a19, v2
-; GFX90A-NEXT:    v_accvgpr_write_b32 a20, v2
-; GFX90A-NEXT:    v_accvgpr_write_b32 a21, v2
-; GFX90A-NEXT:    v_accvgpr_write_b32 a22, v2
-; GFX90A-NEXT:    v_accvgpr_write_b32 a23, v2
-; GFX90A-NEXT:    v_accvgpr_write_b32 a24, v2
-; GFX90A-NEXT:    v_accvgpr_write_b32 a25, v2
-; GFX90A-NEXT:    v_accvgpr_write_b32 a26, v2
-; GFX90A-NEXT:    v_accvgpr_write_b32 a27, v2
-; GFX90A-NEXT:    v_accvgpr_write_b32 a28, v2
-; GFX90A-NEXT:    v_accvgpr_write_b32 a29, v2
-; GFX90A-NEXT:    v_accvgpr_write_b32 a30, v2
-; GFX90A-NEXT:    v_accvgpr_write_b32 a31, v2
+; GFX90A-NEXT:    v_mov_b32_e32 v0, v31
+; GFX90A-NEXT:    v_mov_b32_e32 v1, v31
+; GFX90A-NEXT:    v_mov_b32_e32 v2, v31
+; GFX90A-NEXT:    v_mov_b32_e32 v3, v31
+; GFX90A-NEXT:    v_mov_b32_e32 v4, v31
+; GFX90A-NEXT:    v_mov_b32_e32 v5, v31
+; GFX90A-NEXT:    v_mov_b32_e32 v6, v31
+; GFX90A-NEXT:    v_mov_b32_e32 v7, v31
+; GFX90A-NEXT:    v_mov_b32_e32 v8, v31
+; GFX90A-NEXT:    v_mov_b32_e32 v9, v31
+; GFX90A-NEXT:    v_mov_b32_e32 v10, v31
+; GFX90A-NEXT:    v_mov_b32_e32 v11, v31
+; GFX90A-NEXT:    v_mov_b32_e32 v12, v31
+; GFX90A-NEXT:    v_mov_b32_e32 v13, v31
+; GFX90A-NEXT:    v_mov_b32_e32 v14, v31
+; GFX90A-NEXT:    v_mov_b32_e32 v15, v31
+; GFX90A-NEXT:    v_mov_b32_e32 v16, v31
+; GFX90A-NEXT:    v_mov_b32_e32 v17, v31
+; GFX90A-NEXT:    v_mov_b32_e32 v18, v31
+; GFX90A-NEXT:    v_mov_b32_e32 v19, v31
+; GFX90A-NEXT:    v_mov_b32_e32 v20, v31
+; GFX90A-NEXT:    v_mov_b32_e32 v21, v31
+; GFX90A-NEXT:    v_mov_b32_e32 v22, v31
+; GFX90A-NEXT:    v_mov_b32_e32 v23, v31
+; GFX90A-NEXT:    v_mov_b32_e32 v24, v31
+; GFX90A-NEXT:    v_mov_b32_e32 v25, v31
+; GFX90A-NEXT:    v_mov_b32_e32 v26, v31
+; GFX90A-NEXT:    v_mov_b32_e32 v27, v31
+; GFX90A-NEXT:    v_mov_b32_e32 v28, v31
+; GFX90A-NEXT:    v_mov_b32_e32 v29, v31
+; GFX90A-NEXT:    v_mov_b32_e32 v30, v31
+; GFX90A-NEXT:    v_mov_b32_e32 v32, 1.0
 ; GFX90A-NEXT:  .LBB4_1: ; %for.cond.preheader
 ; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX90A-NEXT:    s_nop 1
-; GFX90A-NEXT:    v_mfma_f32_32x32x1f32 a[0:31], v1, v0, a[0:31]
+; GFX90A-NEXT:    v_accvgpr_write_b32 a0, v0
+; GFX90A-NEXT:    v_accvgpr_write_b32 a1, v1
+; GFX90A-NEXT:    v_accvgpr_write_b32 a2, v2
+; GFX90A-NEXT:    v_accvgpr_write_b32 a3, v3
+; GFX90A-NEXT:    v_accvgpr_write_b32 a4, v4
+; GFX90A-NEXT:    v_accvgpr_write_b32 a5, v5
+; GFX90A-NEXT:    v_accvgpr_write_b32 a6, v6
+; GFX90A-NEXT:    v_accvgpr_write_b32 a7, v7
+; GFX90A-NEXT:    v_accvgpr_write_b32 a8, v8
+; GFX90A-NEXT:    v_accvgpr_write_b32 a9, v9
+; GFX90A-NEXT:    v_accvgpr_write_b32 a10, v10
+; GFX90A-NEXT:    v_accvgpr_write_b32 a11, v11
+; GFX90A-NEXT:    v_accvgpr_write_b32 a12, v12
+; GFX90A-NEXT:    v_accvgpr_write_b32 a13, v13
+; GFX90A-NEXT:    v_accvgpr_write_b32 a14, v14
+; GFX90A-NEXT:    v_accvgpr_write_b32 a15, v15
+; GFX90A-NEXT:    v_accvgpr_write_b32 a16, v16
+; GFX90A-NEXT:    v_accvgpr_write_b32 a17, v17
+; GFX90A-NEXT:    v_accvgpr_write_b32 a18, v18
+; GFX90A-NEXT:    v_accvgpr_write_b32 a19, v19
+; GFX90A-NEXT:    v_accvgpr_write_b32 a20, v20
+; GFX90A-NEXT:    v_accvgpr_write_b32 a21, v21
+; GFX90A-NEXT:    v_accvgpr_write_b32 a22, v22
+; GFX90A-NEXT:    v_accvgpr_write_b32 a23, v23
+; GFX90A-NEXT:    v_accvgpr_write_b32 a24, v24
+; GFX90A-NEXT:    v_accvgpr_write_b32 a25, v25
+; GFX90A-NEXT:    v_accvgpr_write_b32 a26, v26
+; GFX90A-NEXT:    v_accvgpr_write_b32 a27, v27
+; GFX90A-NEXT:    v_accvgpr_write_b32 a28, v28
+; GFX90A-NEXT:    v_accvgpr_write_b32 a29, v29
+; GFX90A-NEXT:    v_accvgpr_write_b32 a30, v30
+; GFX90A-NEXT:    v_accvgpr_write_b32 a31, v31
+; GFX90A-NEXT:    v_mov_b32_e32 v0, 2.0
 ; GFX90A-NEXT:    s_add_i32 s0, s0, -1
 ; GFX90A-NEXT:    s_cmp_lg_u32 s0, 0
+; GFX90A-NEXT:    v_mfma_f32_32x32x1f32 a[0:31], v32, v0, a[0:31]
+; GFX90A-NEXT:    s_nop 15
+; GFX90A-NEXT:    s_nop 2
+; GFX90A-NEXT:    v_accvgpr_read_b32 v0, a0
+; GFX90A-NEXT:    v_accvgpr_read_b32 v1, a1
+; GFX90A-NEXT:    v_accvgpr_read_b32 v2, a2
+; GFX90A-NEXT:    v_accvgpr_read_b32 v3, a3
+; GFX90A-NEXT:    v_accvgpr_read_b32 v4, a4
+; GFX90A-NEXT:    v_accvgpr_read_b32 v5, a5
+; GFX90A-NEXT:    v_accvgpr_read_b32 v6, a6
+; GFX90A-NEXT:    v_accvgpr_read_b32 v7, a7
+; GFX90A-NEXT:    v_accvgpr_read_b32 v8, a8
+; GFX90A-NEXT:    v_accvgpr_read_b32 v9, a9
+; GFX90A-NEXT:    v_accvgpr_read_b32 v10, a10
+; GFX90A-NEXT:    v_accvgpr_read_b32 v11, a11
+; GFX90A-NEXT:    v_accvgpr_read_b32 v12, a12
+; GFX90A-NEXT:    v_accvgpr_read_b32 v13, a13
+; GFX90A-NEXT:    v_accvgpr_read_b32 v14, a14
+; GFX90A-NEXT:    v_accvgpr_read_b32 v15, a15
+; GFX90A-NEXT:    v_accvgpr_read_b32 v16, a16
+; GFX90A-NEXT:    v_accvgpr_read_b32 v17, a17
+; GFX90A-NEXT:    v_accvgpr_read_b32 v18, a18
+; GFX90A-NEXT:    v_accvgpr_read_b32 v19, a19
+; GFX90A-NEXT:    v_accvgpr_read_b32 v20, a20
+; GFX90A-NEXT:    v_accvgpr_read_b32 v21, a21
+; GFX90A-NEXT:    v_accvgpr_read_b32 v22, a22
+; GFX90A-NEXT:    v_accvgpr_read_b32 v23, a23
+; GFX90A-NEXT:    v_accvgpr_read_b32 v24, a24
+; GFX90A-NEXT:    v_accvgpr_read_b32 v25, a25
+; GFX90A-NEXT:    v_accvgpr_read_b32 v26, a26
+; GFX90A-NEXT:    v_accvgpr_read_b32 v27, a27
+; GFX90A-NEXT:    v_accvgpr_read_b32 v28, a28
+; GFX90A-NEXT:    v_accvgpr_read_b32 v29, a29
+; GFX90A-NEXT:    v_accvgpr_read_b32 v30, a30
+; GFX90A-NEXT:    v_accvgpr_read_b32 v31, a31
 ; GFX90A-NEXT:    s_cbranch_scc1 .LBB4_1
 ; GFX90A-NEXT:  ; %bb.2: ; %exit
 ; GFX90A-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
 ; GFX90A-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT:    s_nop 12
 ; GFX90A-NEXT:    global_store_dwordx4 v0, a[28:31], s[0:1] offset:112
 ; GFX90A-NEXT:    global_store_dwordx4 v0, a[24:27], s[0:1] offset:96
 ; GFX90A-NEXT:    global_store_dwordx4 v0, a[20:23], s[0:1] offset:80
@@ -1174,54 +1679,117 @@ define amdgpu_kernel void @test_mfma_loop_vgpr_init(ptr addrspace(1) %arg) #0 {
 ;
 ; GFX942-LABEL: test_mfma_loop_vgpr_init:
 ; GFX942:       ; %bb.0: ; %entry
-; GFX942-NEXT:    v_and_b32_e32 v2, 0x3ff, v0
+; GFX942-NEXT:    v_and_b32_e32 v31, 0x3ff, v0
 ; GFX942-NEXT:    s_mov_b32 s0, 16
-; GFX942-NEXT:    v_mov_b32_e32 v0, 2.0
-; GFX942-NEXT:    v_mov_b32_e32 v1, 1.0
-; GFX942-NEXT:    v_accvgpr_write_b32 a0, v2
-; GFX942-NEXT:    v_accvgpr_write_b32 a1, v2
-; GFX942-NEXT:    v_accvgpr_write_b32 a2, v2
-; GFX942-NEXT:    v_accvgpr_write_b32 a3, v2
-; GFX942-NEXT:    v_accvgpr_write_b32 a4, v2
-; GFX942-NEXT:    v_accvgpr_write_b32 a5, v2
-; GFX942-NEXT:    v_accvgpr_write_b32 a6, v2
-; GFX942-NEXT:    v_accvgpr_write_b32 a7, v2
-; GFX942-NEXT:    v_accvgpr_write_b32 a8, v2
-; GFX942-NEXT:    v_accvgpr_write_b32 a9, v2
-; GFX942-NEXT:    v_accvgpr_write_b32 a10, v2
-; GFX942-NEXT:    v_accvgpr_write_b32 a11, v2
-; GFX942-NEXT:    v_accvgpr_write_b32 a12, v2
-; GFX942-NEXT:    v_accvgpr_write_b32 a13, v2
-; GFX942-NEXT:    v_accvgpr_write_b32 a14, v2
-; GFX942-NEXT:    v_accvgpr_write_b32 a15, v2
-; GFX942-NEXT:    v_accvgpr_write_b32 a16, v2
-; GFX942-NEXT:    v_accvgpr_write_b32 a17, v2
-; GFX942-NEXT:    v_accvgpr_write_b32 a18, v2
-; GFX942-NEXT:    v_accvgpr_write_b32 a19, v2
-; GFX942-NEXT:    v_accvgpr_write_b32 a20, v2
-; GFX942-NEXT:    v_accvgpr_write_b32 a21, v2
-; GFX942-NEXT:    v_accvgpr_write_b32 a22, v2
-; GFX942-NEXT:    v_accvgpr_write_b32 a23, v2
-; GFX942-NEXT:    v_accvgpr_write_b32 a24, v2
-; GFX942-NEXT:    v_accvgpr_write_b32 a25, v2
-; GFX942-NEXT:    v_accvgpr_write_b32 a26, v2
-; GFX942-NEXT:    v_accvgpr_write_b32 a27, v2
-; GFX942-NEXT:    v_accvgpr_write_b32 a28, v2
-; GFX942-NEXT:    v_accvgpr_write_b32 a29, v2
-; GFX942-NEXT:    v_accvgpr_write_b32 a30, v2
-; GFX942-NEXT:    v_accvgpr_write_b32 a31, v2
+; GFX942-NEXT:    v_mov_b32_e32 v0, v31
+; GFX942-NEXT:    v_mov_b32_e32 v1, v31
+; GFX942-NEXT:    v_mov_b32_e32 v2, v31
+; GFX942-NEXT:    v_mov_b32_e32 v3, v31
+; GFX942-NEXT:    v_mov_b32_e32 v4, v31
+; GFX942-NEXT:    v_mov_b32_e32 v5, v31
+; GFX942-NEXT:    v_mov_b32_e32 v6, v31
+; GFX942-NEXT:    v_mov_b32_e32 v7, v31
+; GFX942-NEXT:    v_mov_b32_e32 v8, v31
+; GFX942-NEXT:    v_mov_b32_e32 v9, v31
+; GFX942-NEXT:    v_mov_b32_e32 v10, v31
+; GFX942-NEXT:    v_mov_b32_e32 v11, v31
+; GFX942-NEXT:    v_mov_b32_e32 v12, v31
+; GFX942-NEXT:    v_mov_b32_e32 v13, v31
+; GFX942-NEXT:    v_mov_b32_e32 v14, v31
+; GFX942-NEXT:    v_mov_b32_e32 v15, v31
+; GFX942-NEXT:    v_mov_b32_e32 v16, v31
+; GFX942-NEXT:    v_mov_b32_e32 v17, v31
+; GFX942-NEXT:    v_mov_b32_e32 v18, v31
+; GFX942-NEXT:    v_mov_b32_e32 v19, v31
+; GFX942-NEXT:    v_mov_b32_e32 v20, v31
+; GFX942-NEXT:    v_mov_b32_e32 v21, v31
+; GFX942-NEXT:    v_mov_b32_e32 v22, v31
+; GFX942-NEXT:    v_mov_b32_e32 v23, v31
+; GFX942-NEXT:    v_mov_b32_e32 v24, v31
+; GFX942-NEXT:    v_mov_b32_e32 v25, v31
+; GFX942-NEXT:    v_mov_b32_e32 v26, v31
+; GFX942-NEXT:    v_mov_b32_e32 v27, v31
+; GFX942-NEXT:    v_mov_b32_e32 v28, v31
+; GFX942-NEXT:    v_mov_b32_e32 v29, v31
+; GFX942-NEXT:    v_mov_b32_e32 v30, v31
+; GFX942-NEXT:    v_mov_b32_e32 v32, 1.0
 ; GFX942-NEXT:  .LBB4_1: ; %for.cond.preheader
 ; GFX942-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX942-NEXT:    s_nop 1
-; GFX942-NEXT:    v_mfma_f32_32x32x1_2b_f32 a[0:31], v1, v0, a[0:31]
+; GFX942-NEXT:    v_accvgpr_write_b32 a0, v0
+; GFX942-NEXT:    v_accvgpr_write_b32 a1, v1
+; GFX942-NEXT:    v_accvgpr_write_b32 a2, v2
+; GFX942-NEXT:    v_accvgpr_write_b32 a3, v3
+; GFX942-NEXT:    v_accvgpr_write_b32 a4, v4
+; GFX942-NEXT:    v_accvgpr_write_b32 a5, v5
+; GFX942-NEXT:    v_accvgpr_write_b32 a6, v6
+; GFX942-NEXT:    v_accvgpr_write_b32 a7, v7
+; GFX942-NEXT:    v_accvgpr_write_b32 a8, v8
+; GFX942-NEXT:    v_accvgpr_write_b32 a9, v9
+; GFX942-NEXT:    v_accvgpr_write_b32 a10, v10
+; GFX942-NEXT:    v_accvgpr_write_b32 a11, v11
+; GFX942-NEXT:    v_accvgpr_write_b32 a12, v12
+; GFX942-NEXT:    v_accvgpr_write_b32 a13, v13
+; GFX942-NEXT:    v_accvgpr_write_b32 a14, v14
+; GFX942-NEXT:    v_accvgpr_write_b32 a15, v15
+; GFX942-NEXT:    v_accvgpr_write_b32 a16, v16
+; GFX942-NEXT:    v_accvgpr_write_b32 a17, v17
+; GFX942-NEXT:    v_accvgpr_write_b32 a18, v18
+; GFX942-NEXT:    v_accvgpr_write_b32 a19, v19
+; GFX942-NEXT:    v_accvgpr_write_b32 a20, v20
+; GFX942-NEXT:    v_accvgpr_write_b32 a21, v21
+; GFX942-NEXT:    v_accvgpr_write_b32 a22, v22
+; GFX942-NEXT:    v_accvgpr_write_b32 a23, v23
+; GFX942-NEXT:    v_accvgpr_write_b32 a24, v24
+; GFX942-NEXT:    v_accvgpr_write_b32 a25, v25
+; GFX942-NEXT:    v_accvgpr_write_b32 a26, v26
+; GFX942-NEXT:    v_accvgpr_write_b32 a27, v27
+; GFX942-NEXT:    v_accvgpr_write_b32 a28, v28
+; GFX942-NEXT:    v_accvgpr_write_b32 a29, v29
+; GFX942-NEXT:    v_accvgpr_write_b32 a30, v30
+; GFX942-NEXT:    v_accvgpr_write_b32 a31, v31
+; GFX942-NEXT:    v_mov_b32_e32 v0, 2.0
 ; GFX942-NEXT:    s_add_i32 s0, s0, -1
 ; GFX942-NEXT:    s_cmp_lg_u32 s0, 0
+; GFX942-NEXT:    v_mfma_f32_32x32x1_2b_f32 a[0:31], v32, v0, a[0:31]
+; GFX942-NEXT:    s_nop 15
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_accvgpr_read_b32 v0, a0
+; GFX942-NEXT:    v_accvgpr_read_b32 v1, a1
+; GFX942-NEXT:    v_accvgpr_read_b32 v2, a2
+; GFX942-NEXT:    v_accvgpr_read_b32 v3, a3
+; GFX942-NEXT:    v_accvgpr_read_b32 v4, a4
+; GFX942-NEXT:    v_accvgpr_read_b32 v5, a5
+; GFX942-NEXT:    v_accvgpr_read_b32 v6, a6
+; GFX942-NEXT:    v_accvgpr_read_b32 v7, a7
+; GFX942-NEXT:    v_accvgpr_read_b32 v8, a8
+; GFX942-NEXT:    v_accvgpr_read_b32 v9, a9
+; GFX942-NEXT:    v_accvgpr_read_b32 v10, a10
+; GFX942-NEXT:    v_accvgpr_read_b32 v11, a11
+; GFX942-NEXT:    v_accvgpr_read_b32 v12, a12
+; GFX942-NEXT:    v_accvgpr_read_b32 v13, a13
+; GFX942-NEXT:    v_accvgpr_read_b32 v14, a14
+; GFX942-NEXT:    v_accvgpr_read_b32 v15, a15
+; GFX942-NEXT:    v_accvgpr_read_b32 v16, a16
+; GFX942-NEXT:    v_accvgpr_read_b32 v17, a17
+; GFX942-NEXT:    v_accvgpr_read_b32 v18, a18
+; GFX942-NEXT:    v_accvgpr_read_b32 v19, a19
+; GFX942-NEXT:    v_accvgpr_read_b32 v20, a20
+; GFX942-NEXT:    v_accvgpr_read_b32 v21, a21
+; GFX942-NEXT:    v_accvgpr_read_b32 v22, a22
+; GFX942-NEXT:    v_accvgpr_read_b32 v23, a23
+; GFX942-NEXT:    v_accvgpr_read_b32 v24, a24
+; GFX942-NEXT:    v_accvgpr_read_b32 v25, a25
+; GFX942-NEXT:    v_accvgpr_read_b32 v26, a26
+; GFX942-NEXT:    v_accvgpr_read_b32 v27, a27
+; GFX942-NEXT:    v_accvgpr_read_b32 v28, a28
+; GFX942-NEXT:    v_accvgpr_read_b32 v29, a29
+; GFX942-NEXT:    v_accvgpr_read_b32 v30, a30
+; GFX942-NEXT:    v_accvgpr_read_b32 v31, a31
 ; GFX942-NEXT:    s_cbranch_scc1 .LBB4_1
 ; GFX942-NEXT:  ; %bb.2: ; %exit
 ; GFX942-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
 ; GFX942-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX942-NEXT:    s_nop 11
 ; GFX942-NEXT:    global_store_dwordx4 v0, a[28:31], s[0:1] offset:112
 ; GFX942-NEXT:    global_store_dwordx4 v0, a[24:27], s[0:1] offset:96
 ; GFX942-NEXT:    global_store_dwordx4 v0, a[20:23], s[0:1] offset:80
@@ -1382,53 +1950,117 @@ define amdgpu_kernel void @test_mfma_loop_sgpr_init(ptr addrspace(1) %arg, float
 ; GFX90A:       ; %bb.0: ; %entry
 ; GFX90A-NEXT:    s_load_dword s1, s[4:5], 0x2c
 ; GFX90A-NEXT:    s_mov_b32 s0, 16
-; GFX90A-NEXT:    v_mov_b32_e32 v0, 2.0
-; GFX90A-NEXT:    v_mov_b32_e32 v1, 1.0
+; GFX90A-NEXT:    v_mov_b32_e32 v32, 1.0
 ; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT:    v_accvgpr_write_b32 a0, s1
-; GFX90A-NEXT:    v_accvgpr_write_b32 a1, s1
-; GFX90A-NEXT:    v_accvgpr_write_b32 a2, s1
-; GFX90A-NEXT:    v_accvgpr_write_b32 a3, s1
-; GFX90A-NEXT:    v_accvgpr_write_b32 a4, s1
-; GFX90A-NEXT:    v_accvgpr_write_b32 a5, s1
-; GFX90A-NEXT:    v_accvgpr_write_b32 a6, s1
-; GFX90A-NEXT:    v_accvgpr_write_b32 a7, s1
-; GFX90A-NEXT:    v_accvgpr_write_b32 a8, s1
-; GFX90A-NEXT:    v_accvgpr_write_b32 a9, s1
-; GFX90A-NEXT:    v_accvgpr_write_b32 a10, s1
-; GFX90A-NEXT:    v_accvgpr_write_b32 a11, s1
-; GFX90A-NEXT:    v_accvgpr_write_b32 a12, s1
-; GFX90A-NEXT:    v_accvgpr_write_b32 a13, s1
-; GFX90A-NEXT:    v_accvgpr_write_b32 a14, s1
-; GFX90A-NEXT:    v_accvgpr_write_b32 a15, s1
-; GFX90A-NEXT:    v_accvgpr_write_b32 a16, s1
-; GFX90A-NEXT:    v_accvgpr_write_b32 a17, s1
-; GFX90A-NEXT:    v_accvgpr_write_b32 a18, s1
-; GFX90A-NEXT:    v_accvgpr_write_b32 a19, s1
-; GFX90A-NEXT:    v_accvgpr_write_b32 a20, s1
-; GFX90A-NEXT:    v_accvgpr_write_b32 a21, s1
-; GFX90A-NEXT:    v_accvgpr_write_b32 a22, s1
-; GFX90A-NEXT:    v_accvgpr_write_b32 a23, s1
-; GFX90A-NEXT:    v_accvgpr_write_b32 a24, s1
-; GFX90A-NEXT:    v_accvgpr_write_b32 a25, s1
-; GFX90A-NEXT:    v_accvgpr_write_b32 a26, s1
-; GFX90A-NEXT:    v_accvgpr_write_b32 a27, s1
-; GFX90A-NEXT:    v_accvgpr_write_b32 a28, s1
-; GFX90A-NEXT:    v_accvgpr_write_b32 a29, s1
-; GFX90A-NEXT:    v_accvgpr_write_b32 a30, s1
-; GFX90A-NEXT:    v_accvgpr_write_b32 a31, s1
+; GFX90A-NEXT:    v_mov_b32_e32 v31, s1
+; GFX90A-NEXT:    v_mov_b32_e32 v0, s1
+; GFX90A-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-NEXT:    v_mov_b32_e32 v3, s1
+; GFX90A-NEXT:    v_mov_b32_e32 v4, s1
+; GFX90A-NEXT:    v_mov_b32_e32 v5, s1
+; GFX90A-NEXT:    v_mov_b32_e32 v6, s1
+; GFX90A-NEXT:    v_mov_b32_e32 v7, s1
+; GFX90A-NEXT:    v_mov_b32_e32 v8, s1
+; GFX90A-NEXT:    v_mov_b32_e32 v9, s1
+; GFX90A-NEXT:    v_mov_b32_e32 v10, s1
+; GFX90A-NEXT:    v_mov_b32_e32 v11, s1
+; GFX90A-NEXT:    v_mov_b32_e32 v12, s1
+; GFX90A-NEXT:    v_mov_b32_e32 v13, s1
+; GFX90A-NEXT:    v_mov_b32_e32 v14, s1
+; GFX90A-NEXT:    v_mov_b32_e32 v15, s1
+; GFX90A-NEXT:    v_mov_b32_e32 v16, s1
+; GFX90A-NEXT:    v_mov_b32_e32 v17, s1
+; GFX90A-NEXT:    v_mov_b32_e32 v18, s1
+; GFX90A-NEXT:    v_mov_b32_e32 v19, s1
+; GFX90A-NEXT:    v_mov_b32_e32 v20, s1
+; GFX90A-NEXT:    v_mov_b32_e32 v21, s1
+; GFX90A-NEXT:    v_mov_b32_e32 v22, s1
+; GFX90A-NEXT:    v_mov_b32_e32 v23, s1
+; GFX90A-NEXT:    v_mov_b32_e32 v24, s1
+; GFX90A-NEXT:    v_mov_b32_e32 v25, s1
+; GFX90A-NEXT:    v_mov_b32_e32 v26, s1
+; GFX90A-NEXT:    v_mov_b32_e32 v27, s1
+; GFX90A-NEXT:    v_mov_b32_e32 v28, s1
+; GFX90A-NEXT:    v_mov_b32_e32 v29, s1
+; GFX90A-NEXT:    v_mov_b32_e32 v30, s1
 ; GFX90A-NEXT:  .LBB5_1: ; %for.cond.preheader
 ; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX90A-NEXT:    s_nop 1
-; GFX90A-NEXT:    v_mfma_f32_32x32x1f32 a[0:31], v1, v0, a[0:31]
+; GFX90A-NEXT:    v_accvgpr_write_b32 a0, v0
+; GFX90A-NEXT:    v_accvgpr_write_b32 a1, v1
+; GFX90A-NEXT:    v_accvgpr_write_b32 a2, v2
+; GFX90A-NEXT:    v_accvgpr_write_b32 a3, v3
+; GFX90A-NEXT:    v_accvgpr_write_b32 a4, v4
+; GFX90A-NEXT:    v_accvgpr_write_b32 a5, v5
+; GFX90A-NEXT:    v_accvgpr_write_b32 a6, v6
+; GFX90A-NEXT:    v_accvgpr_write_b32 a7, v7
+; GFX90A-NEXT:    v_accvgpr_write_b32 a8, v8
+; GFX90A-NEXT:    v_accvgpr_write_b32 a9, v9
+; GFX90A-NEXT:    v_accvgpr_write_b32 a10, v10
+; GFX90A-NEXT:    v_accvgpr_write_b32 a11, v11
+; GFX90A-NEXT:    v_accvgpr_write_b32 a12, v12
+; GFX90A-NEXT:    v_accvgpr_write_b32 a13, v13
+; GFX90A-NEXT:    v_accvgpr_write_b32 a14, v14
+; GFX90A-NEXT:    v_accvgpr_write_b32 a15, v15
+; GFX90A-NEXT:    v_accvgpr_write_b32 a16, v16
+; GFX90A-NEXT:    v_accvgpr_write_b32 a17, v17
+; GFX90A-NEXT:    v_accvgpr_write_b32 a18, v18
+; GFX90A-NEXT:    v_accvgpr_write_b32 a19, v19
+; GFX90A-NEXT:    v_accvgpr_write_b32 a20, v20
+; GFX90A-NEXT:    v_accvgpr_write_b32 a21, v21
+; GFX90A-NEXT:    v_accvgpr_write_b32 a22, v22
+; GFX90A-NEXT:    v_accvgpr_write_b32 a23, v23
+; GFX90A-NEXT:    v_accvgpr_write_b32 a24, v24
+; GFX90A-NEXT:    v_accvgpr_write_b32 a25, v25
+; GFX90A-NEXT:    v_accvgpr_write_b32 a26, v26
+; GFX90A-NEXT:    v_accvgpr_write_b32 a27, v27
+; GFX90A-NEXT:    v_accvgpr_write_b32 a28, v28
+; GFX90A-NEXT:    v_accvgpr_write_b32 a29, v29
+; GFX90A-NEXT:    v_accvgpr_write_b32 a30, v30
+; GFX90A-NEXT:    v_accvgpr_write_b32 a31, v31
+; GFX90A-NEXT:    v_mov_b32_e32 v0, 2.0
 ; GFX90A-NEXT:    s_add_i32 s0, s0, -1
 ; GFX90A-NEXT:    s_cmp_lg_u32 s0, 0
+; GFX90A-NEXT:    v_mfma_f32_32x32x1f32 a[0:31], v32, v0, a[0:31]
+; GFX90A-NEXT:    s_nop 15
+; GFX90A-NEXT:    s_nop 2
+; GFX90A-NEXT:    v_accvgpr_read_b32 v0, a0
+; GFX90A-NEXT:    v_accvgpr_read_b32 v1, a1
+; GFX90A-NEXT:    v_accvgpr_read_b32 v2, a2
+; GFX90A-NEXT:    v_accvgpr_read_b32 v3, a3
+; GFX90A-NEXT:    v_accvgpr_read_b32 v4, a4
+; GFX90A-NEXT:    v_accvgpr_read_b32 v5, a5
+; GFX90A-NEXT:    v_accvgpr_read_b32 v6, a6
+; GFX90A-NEXT:    v_accvgpr_read_b32 v7, a7
+; GFX90A-NEXT:    v_accvgpr_read_b32 v8, a8
+; GFX90A-NEXT:    v_accvgpr_read_b32 v9, a9
+; GFX90A-NEXT:    v_accvgpr_read_b32 v10, a10
+; GFX90A-NEXT:    v_accvgpr_read_b32 v11, a11
+; GFX90A-NEXT:    v_accvgpr_read_b32 v12, a12
+; GFX90A-NEXT:    v_accvgpr_read_b32 v13, a13
+; GFX90A-NEXT:    v_accvgpr_read_b32 v14, a14
+; GFX90A-NEXT:    v_accvgpr_read_b32 v15, a15
+; GFX90A-NEXT:    v_accvgpr_read_b32 v16, a16
+; GFX90A-NEXT:    v_accvgpr_read_b32 v17, a17
+; GFX90A-NEXT:    v_accvgpr_read_b32 v18, a18
+; GFX90A-NEXT:    v_accvgpr_read_b32 v19, a19
+; GFX90A-NEXT:    v_accvgpr_read_b32 v20, a20
+; GFX90A-NEXT:    v_accvgpr_read_b32 v21, a21
+; GFX90A-NEXT:    v_accvgpr_read_b32 v22, a22
+; GFX90A-NEXT:    v_accvgpr_read_b32 v23, a23
+; GFX90A-NEXT:    v_accvgpr_read_b32 v24, a24
+; GFX90A-NEXT:    v_accvgpr_read_b32 v25, a25
+; GFX90A-NEXT:    v_accvgpr_read_b32 v26, a26
+; GFX90A-NEXT:    v_accvgpr_read_b32 v27, a27
+; GFX90A-NEXT:    v_accvgpr_read_b32 v28, a28
+; GFX90A-NEXT:    v_accvgpr_read_b32 v29, a29
+; GFX90A-NEXT:    v_accvgpr_read_b32 v30, a30
+; GFX90A-NEXT:    v_accvgpr_read_b32 v31, a31
 ; GFX90A-NEXT:    s_cbranch_scc1 .LBB5_1
 ; GFX90A-NEXT:  ; %bb.2: ; %exit
 ; GFX90A-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
 ; GFX90A-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT:    s_nop 12
 ; GFX90A-NEXT:    global_store_dwordx4 v0, a[28:31], s[0:1] offset:112
 ; GFX90A-NEXT:    global_store_dwordx4 v0, a[24:27], s[0:1] offset:96
 ; GFX90A-NEXT:    global_store_dwordx4 v0, a[20:23], s[0:1] offset:80
@@ -1443,53 +2075,117 @@ define amdgpu_kernel void @test_mfma_loop_sgpr_init(ptr addrspace(1) %arg, float
 ; GFX942:       ; %bb.0: ; %entry
 ; GFX942-NEXT:    s_load_dword s1, s[4:5], 0x2c
 ; GFX942-NEXT:    s_mov_b32 s0, 16
-; GFX942-NEXT:    v_mov_b32_e32 v0, 2.0
-; GFX942-NEXT:    v_mov_b32_e32 v1, 1.0
+; GFX942-NEXT:    v_mov_b32_e32 v32, 1.0
 ; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX942-NEXT:    v_accvgpr_write_b32 a0, s1
-; GFX942-NEXT:    v_accvgpr_write_b32 a1, s1
-; GFX942-NEXT:    v_accvgpr_write_b32 a2, s1
-; GFX942-NEXT:    v_accvgpr_write_b32 a3, s1
-; GFX942-NEXT:    v_accvgpr_write_b32 a4, s1
-; GFX942-NEXT:    v_accvgpr_write_b32 a5, s1
-; GFX942-NEXT:    v_accvgpr_write_b32 a6, s1
-; GFX942-NEXT:    v_accvgpr_write_b32 a7, s1
-; GFX942-NEXT:    v_accvgpr_write_b32 a8, s1
-; GFX942-NEXT:    v_accvgpr_write_b32 a9, s1
-; GFX942-NEXT:    v_accvgpr_write_b32 a10, s1
-; GFX942-NEXT:    v_accvgpr_write_b32 a11, s1
-; GFX942-NEXT:    v_accvgpr_write_b32 a12, s1
-; GFX942-NEXT:    v_accvgpr_write_b32 a13, s1
-; GFX942-NEXT:    v_accvgpr_write_b32 a14, s1
-; GFX942-NEXT:    v_accvgpr_write_b32 a15, s1
-; GFX942-NEXT:    v_accvgpr_write_b32 a16, s1
-; GFX942-NEXT:    v_accvgpr_write_b32 a17, s1
-; GFX942-NEXT:    v_accvgpr_write_b32 a18, s1
-; GFX942-NEXT:    v_accvgpr_write_b32 a19, s1
-; GFX942-NEXT:    v_accvgpr_write_b32 a20, s1
-; GFX942-NEXT:    v_accvgpr_write_b32 a21, s1
-; GFX942-NEXT:    v_accvgpr_write_b32 a22, s1
-; GFX942-NEXT:    v_accvgpr_write_b32 a23, s1
-; GFX942-NEXT:    v_accvgpr_write_b32 a24, s1
-; GFX942-NEXT:    v_accvgpr_write_b32 a25, s1
-; GFX942-NEXT:    v_accvgpr_write_b32 a26, s1
-; GFX942-NEXT:    v_accvgpr_write_b32 a27, s1
-; GFX942-NEXT:    v_accvgpr_write_b32 a28, s1
-; GFX942-NEXT:    v_accvgpr_write_b32 a29, s1
-; GFX942-NEXT:    v_accvgpr_write_b32 a30, s1
-; GFX942-NEXT:    v_accvgpr_write_b32 a31, s1
+; GFX942-NEXT:    v_mov_b32_e32 v31, s1
+; GFX942-NEXT:    v_mov_b32_e32 v0, s1
+; GFX942-NEXT:    v_mov_b32_e32 v1, s1
+; GFX942-NEXT:    v_mov_b32_e32 v2, s1
+; GFX942-NEXT:    v_mov_b32_e32 v3, s1
+; GFX942-NEXT:    v_mov_b32_e32 v4, s1
+; GFX942-NEXT:    v_mov_b32_e32 v5, s1
+; GFX942-NEXT:    v_mov_b32_e32 v6, s1
+; GFX942-NEXT:    v_mov_b32_e32 v7, s1
+; GFX942-NEXT:    v_mov_b32_e32 v8, s1
+; GFX942-NEXT:    v_mov_b32_e32 v9, s1
+; GFX942-NEXT:    v_mov_b32_e32 v10, s1
+; GFX942-NEXT:    v_mov_b32_e32 v11, s1
+; GFX942-NEXT:    v_mov_b32_e32 v12, s1
+; GFX942-NEXT:    v_mov_b32_e32 v13, s1
+; GFX942-NEXT:    v_mov_b32_e32 v14, s1
+; GFX942-NEXT:    v_mov_b32_e32 v15, s1
+; GFX942-NEXT:    v_mov_b32_e32 v16, s1
+; GFX942-NEXT:    v_mov_b32_e32 v17, s1
+; GFX942-NEXT:    v_mov_b32_e32 v18, s1
+; GFX942-NEXT:    v_mov_b32_e32 v19, s1
+; GFX942-NEXT:    v_mov_b32_e32 v20, s1
+; GFX942-NEXT:    v_mov_b32_e32 v21, s1
+; GFX942-NEXT:    v_mov_b32_e32 v22, s1
+; GFX942-NEXT:    v_mov_b32_e32 v23, s1
+; GFX942-NEXT:    v_mov_b32_e32 v24, s1
+; GFX942-NEXT:    v_mov_b32_e32 v25, s1
+; GFX942-NEXT:    v_mov_b32_e32 v26, s1
+; GFX942-NEXT:    v_mov_b32_e32 v27, s1
+; GFX942-NEXT:    v_mov_b32_e32 v28, s1
+; GFX942-NEXT:    v_mov_b32_e32 v29, s1
+; GFX942-NEXT:    v_mov_b32_e32 v30, s1
 ; GFX942-NEXT:  .LBB5_1: ; %for.cond.preheader
 ; GFX942-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX942-NEXT:    s_nop 1
-; GFX942-NEXT:    v_mfma_f32_32x32x1_2b_f32 a[0:31], v1, v0, a[0:31]
+; GFX942-NEXT:    v_accvgpr_write_b32 a0, v0
+; GFX942-NEXT:    v_accvgpr_write_b32 a1, v1
+; GFX942-NEXT:    v_accvgpr_write_b32 a2, v2
+; GFX942-NEXT:    v_accvgpr_write_b32 a3, v3
+; GFX942-NEXT:    v_accvgpr_write_b32 a4, v4
+; GFX942-NEXT:    v_accvgpr_write_b32 a5, v5
+; GFX942-NEXT:    v_accvgpr_write_b32 a6, v6
+; GFX942-NEXT:    v_accvgpr_write_b32 a7, v7
+; GFX942-NEXT:    v_accvgpr_write_b32 a8, v8
+; GFX942-NEXT:    v_accvgpr_write_b32 a9, v9
+; GFX942-NEXT:    v_accvgpr_write_b32 a10, v10
+; GFX942-NEXT:    v_accvgpr_write_b32 a11, v11
+; GFX942-NEXT:    v_accvgpr_write_b32 a12, v12
+; GFX942-NEXT:    v_accvgpr_write_b32 a13, v13
+; GFX942-NEXT:    v_accvgpr_write_b32 a14, v14
+; GFX942-NEXT:    v_accvgpr_write_b32 a15, v15
+; GFX942-NEXT:    v_accvgpr_write_b32 a16, v16
+; GFX942-NEXT:    v_accvgpr_write_b32 a17, v17
+; GFX942-NEXT:    v_accvgpr_write_b32 a18, v18
+; GFX942-NEXT:    v_accvgpr_write_b32 a19, v19
+; GFX942-NEXT:    v_accvgpr_write_b32 a20, v20
+; GFX942-NEXT:    v_accvgpr_write_b32 a21, v21
+; GFX942-NEXT:    v_accvgpr_write_b32 a22, v22
+; GFX942-NEXT:    v_accvgpr_write_b32 a23, v23
+; GFX942-NEXT:    v_accvgpr_write_b32 a24, v24
+; GFX942-NEXT:    v_accvgpr_write_b32 a25, v25
+; GFX942-NEXT:    v_accvgpr_write_b32 a26, v26
+; GFX942-NEXT:    v_accvgpr_write_b32 a27, v27
+; GFX942-NEXT:    v_accvgpr_write_b32 a28, v28
+; GFX942-NEXT:    v_accvgpr_write_b32 a29, v29
+; GFX942-NEXT:    v_accvgpr_write_b32 a30, v30
+; GFX942-NEXT:    v_accvgpr_write_b32 a31, v31
+; GFX942-NEXT:    v_mov_b32_e32 v0, 2.0
 ; GFX942-NEXT:    s_add_i32 s0, s0, -1
 ; GFX942-NEXT:    s_cmp_lg_u32 s0, 0
+; GFX942-NEXT:    v_mfma_f32_32x32x1_2b_f32 a[0:31], v32, v0, a[0:31]
+; GFX942-NEXT:    s_nop 15
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_accvgpr_read_b32 v0, a0
+; GFX942-NEXT:    v_accvgpr_read_b32 v1, a1
+; GFX942-NEXT:    v_accvgpr_read_b32 v2, a2
+; GFX942-NEXT:    v_accvgpr_read_b32 v3, a3
+; GFX942-NEXT:    v_accvgpr_read_b32 v4, a4
+; GFX942-NEXT:    v_accvgpr_read_b32 v5, a5
+; GFX942-NEXT:    v_accvgpr_read_b32 v6, a6
+; GFX942-NEXT:    v_accvgpr_read_b32 v7, a7
+; GFX942-NEXT:    v_accvgpr_read_b32 v8, a8
+; GFX942-NEXT:    v_accvgpr_read_b32 v9, a9
+; GFX942-NEXT:    v_accvgpr_read_b32 v10, a10
+; GFX942-NEXT:    v_accvgpr_read_b32 v11, a11
+; GFX942-NEXT:    v_accvgpr_read_b32 v12, a12
+; GFX942-NEXT:    v_accvgpr_read_b32 v13, a13
+; GFX942-NEXT:    v_accvgpr_read_b32 v14, a14
+; GFX942-NEXT:    v_accvgpr_read_b32 v15, a15
+; GFX942-NEXT:    v_accvgpr_read_b32 v16, a16
+; GFX942-NEXT:    v_accvgpr_read_b32 v17, a17
+; GFX942-NEXT:    v_accvgpr_read_b32 v18, a18
+; GFX942-NEXT:    v_accvgpr_read_b32 v19, a19
+; GFX942-NEXT:    v_accvgpr_read_b32 v20, a20
+; GFX942-NEXT:    v_accvgpr_read_b32 v21, a21
+; GFX942-NEXT:    v_accvgpr_read_b32 v22, a22
+; GFX942-NEXT:    v_accvgpr_read_b32 v23, a23
+; GFX942-NEXT:    v_accvgpr_read_b32 v24, a24
+; GFX942-NEXT:    v_accvgpr_read_b32 v25, a25
+; GFX942-NEXT:    v_accvgpr_read_b32 v26, a26
+; GFX942-NEXT:    v_accvgpr_read_b32 v27, a27
+; GFX942-NEXT:    v_accvgpr_read_b32 v28, a28
+; GFX942-NEXT:    v_accvgpr_read_b32 v29, a29
+; GFX942-NEXT:    v_accvgpr_read_b32 v30, a30
+; GFX942-NEXT:    v_accvgpr_read_b32 v31, a31
 ; GFX942-NEXT:    s_cbranch_scc1 .LBB5_1
 ; GFX942-NEXT:  ; %bb.2: ; %exit
 ; GFX942-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
 ; GFX942-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX942-NEXT:    s_nop 11
 ; GFX942-NEXT:    global_store_dwordx4 v0, a[28:31], s[0:1] offset:112
 ; GFX942-NEXT:    global_store_dwordx4 v0, a[24:27], s[0:1] offset:96
 ; GFX942-NEXT:    global_store_dwordx4 v0, a[20:23], s[0:1] offset:80
@@ -1646,56 +2342,118 @@ define amdgpu_kernel void @test_mfma_loop_mixed_init(ptr addrspace(1) %arg, floa
 ; GFX90A-LABEL: test_mfma_loop_mixed_init:
 ; GFX90A:       ; %bb.0: ; %entry
 ; GFX90A-NEXT:    s_load_dword s1, s[4:5], 0x2c
-; GFX90A-NEXT:    v_and_b32_e32 v2, 0x3ff, v0
-; GFX90A-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX90A-NEXT:    s_mov_b32 s0, 16
-; GFX90A-NEXT:    v_mov_b32_e32 v0, 2.0
-; GFX90A-NEXT:    v_mov_b32_e32 v1, 1.0
-; GFX90A-NEXT:    v_accvgpr_write_b32 a0, v2
+; GFX90A-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX90A-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT:    v_accvgpr_write_b32 a1, s1
-; GFX90A-NEXT:    v_accvgpr_write_b32 a2, v3
-; GFX90A-NEXT:    v_accvgpr_write_b32 a3, v3
-; GFX90A-NEXT:    v_accvgpr_write_b32 a4, v3
-; GFX90A-NEXT:    v_accvgpr_write_b32 a5, v3
-; GFX90A-NEXT:    v_accvgpr_write_b32 a6, v3
-; GFX90A-NEXT:    v_accvgpr_write_b32 a7, v3
-; GFX90A-NEXT:    v_accvgpr_write_b32 a8, v3
-; GFX90A-NEXT:    v_accvgpr_write_b32 a9, v3
-; GFX90A-NEXT:    v_accvgpr_write_b32 a10, v3
-; GFX90A-NEXT:    v_accvgpr_write_b32 a11, v3
-; GFX90A-NEXT:    v_accvgpr_write_b32 a12, v3
-; GFX90A-NEXT:    v_accvgpr_write_b32 a13, v3
-; GFX90A-NEXT:    v_accvgpr_write_b32 a14, v3
-; GFX90A-NEXT:    v_accvgpr_write_b32 a15, v3
-; GFX90A-NEXT:    v_accvgpr_write_b32 a16, v3
-; GFX90A-NEXT:    v_accvgpr_write_b32 a17, v3
-; GFX90A-NEXT:    v_accvgpr_write_b32 a18, v3
-; GFX90A-NEXT:    v_accvgpr_write_b32 a19, v3
-; GFX90A-NEXT:    v_accvgpr_write_b32 a20, v3
-; GFX90A-NEXT:    v_accvgpr_write_b32 a21, v3
-; GFX90A-NEXT:    v_accvgpr_write_b32 a22, v3
-; GFX90A-NEXT:    v_accvgpr_write_b32 a23, v3
-; GFX90A-NEXT:    v_accvgpr_write_b32 a24, v3
-; GFX90A-NEXT:    v_accvgpr_write_b32 a25, v3
-; GFX90A-NEXT:    v_accvgpr_write_b32 a26, v3
-; GFX90A-NEXT:    v_accvgpr_write_b32 a27, v3
-; GFX90A-NEXT:    v_accvgpr_write_b32 a28, v3
-; GFX90A-NEXT:    v_accvgpr_write_b32 a29, v3
-; GFX90A-NEXT:    v_accvgpr_write_b32 a30, v3
-; GFX90A-NEXT:    v_accvgpr_write_b32 a31, v3
+; GFX90A-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-NEXT:    v_mov_b32_e32 v4, 0
+; GFX90A-NEXT:    v_mov_b32_e32 v5, 0
+; GFX90A-NEXT:    v_mov_b32_e32 v6, 0
+; GFX90A-NEXT:    v_mov_b32_e32 v7, 0
+; GFX90A-NEXT:    v_mov_b32_e32 v8, 0
+; GFX90A-NEXT:    v_mov_b32_e32 v9, 0
+; GFX90A-NEXT:    v_mov_b32_e32 v10, 0
+; GFX90A-NEXT:    v_mov_b32_e32 v11, 0
+; GFX90A-NEXT:    v_mov_b32_e32 v12, 0
+; GFX90A-NEXT:    v_mov_b32_e32 v13, 0
+; GFX90A-NEXT:    v_mov_b32_e32 v14, 0
+; GFX90A-NEXT:    v_mov_b32_e32 v15, 0
+; GFX90A-NEXT:    v_mov_b32_e32 v16, 0
+; GFX90A-NEXT:    v_mov_b32_e32 v17, 0
+; GFX90A-NEXT:    v_mov_b32_e32 v18, 0
+; GFX90A-NEXT:    v_mov_b32_e32 v19, 0
+; GFX90A-NEXT:    v_mov_b32_e32 v20, 0
+; GFX90A-NEXT:    v_mov_b32_e32 v21, 0
+; GFX90A-NEXT:    v_mov_b32_e32 v22, 0
+; GFX90A-NEXT:    v_mov_b32_e32 v23, 0
+; GFX90A-NEXT:    v_mov_b32_e32 v24, 0
+; GFX90A-NEXT:    v_mov_b32_e32 v25, 0
+; GFX90A-NEXT:    v_mov_b32_e32 v26, 0
+; GFX90A-NEXT:    v_mov_b32_e32 v27, 0
+; GFX90A-NEXT:    v_mov_b32_e32 v28, 0
+; GFX90A-NEXT:    v_mov_b32_e32 v29, 0
+; GFX90A-NEXT:    v_mov_b32_e32 v30, 0
+; GFX90A-NEXT:    v_mov_b32_e32 v31, 0
+; GFX90A-NEXT:    v_mov_b32_e32 v32, 1.0
 ; GFX90A-NEXT:  .LBB6_1: ; %for.cond.preheader
 ; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX90A-NEXT:    s_nop 1
-; GFX90A-NEXT:    v_mfma_f32_32x32x1f32 a[0:31], v1, v0, a[0:31]
+; GFX90A-NEXT:    v_accvgpr_write_b32 a0, v0
+; GFX90A-NEXT:    v_accvgpr_write_b32 a1, v1
+; GFX90A-NEXT:    v_accvgpr_write_b32 a2, v2
+; GFX90A-NEXT:    v_accvgpr_write_b32 a3, v3
+; GFX90A-NEXT:    v_accvgpr_write_b32 a4, v4
+; GFX90A-NEXT:    v_accvgpr_write_b32 a5, v5
+; GFX90A-NEXT:    v_accvgpr_write_b32 a6, v6
+; GFX90A-NEXT:    v_accvgpr_write_b32 a7, v7
+; GFX90A-NEXT:    v_accvgpr_write_b32 a8, v8
+; GFX90A-NEXT:    v_accvgpr_write_b32 a9, v9
+; GFX90A-NEXT:    v_accvgpr_write_b32 a10, v10
+; GFX90A-NEXT:    v_accvgpr_write_b32 a11, v11
+; GFX90A-NEXT:    v_accvgpr_write_b32 a12, v12
+; GFX90A-NEXT:    v_accvgpr_write_b32 a13, v13
+; GFX90A-NEXT:    v_accvgpr_write_b32 a14, v14
+; GFX90A-NEXT:    v_accvgpr_write_b32 a15, v15
+; GFX90A-NEXT:    v_accvgpr_write_b32 a16, v16
+; GFX90A-NEXT:    v_accvgpr_write_b32 a17, v17
+; GFX90A-NEXT:    v_accvgpr_write_b32 a18, v18
+; GFX90A-NEXT:    v_accvgpr_write_b32 a19, v19
+; GFX90A-NEXT:    v_accvgpr_write_b32 a20, v20
+; GFX90A-NEXT:    v_accvgpr_write_b32 a21, v21
+; GFX90A-NEXT:    v_accvgpr_write_b32 a22, v22
+; GFX90A-NEXT:    v_accvgpr_write_b32 a23, v23
+; GFX90A-NEXT:    v_accvgpr_write_b32 a24, v24
+; GFX90A-NEXT:    v_accvgpr_write_b32 a25, v25
+; GFX90A-NEXT:    v_accvgpr_write_b32 a26, v26
+; GFX90A-NEXT:    v_accvgpr_write_b32 a27, v27
+; GFX90A-NEXT:    v_accvgpr_write_b32 a28, v28
+; GFX90A-NEXT:    v_accvgpr_write_b32 a29, v29
+; GFX90A-NEXT:    v_accvgpr_write_b32 a30, v30
+; GFX90A-NEXT:    v_accvgpr_write_b32 a31, v31
+; GFX90A-NEXT:    v_mov_b32_e32 v0, 2.0
 ; GFX90A-NEXT:    s_add_i32 s0, s0, -1
 ; GFX90A-NEXT:    s_cmp_lg_u32 s0, 0
+; GFX90A-NEXT:    v_mfma_f32_32x32x1f32 a[0:31], v32, v0, a[0:31]
+; GFX90A-NEXT:    s_nop 15
+; GFX90A-NEXT:    s_nop 2
+; GFX90A-NEXT:    v_accvgpr_read_b32 v0, a0
+; GFX90A-NEXT:    v_accvgpr_read_b32 v1, a1
+; GFX90A-NEXT:    v_accvgpr_read_b32 v2, a2
+; GFX90A-NEXT:    v_accvgpr_read_b32 v3, a3
+; GFX90A-NEXT:    v_accvgpr_read_b32 v4, a4
+; GFX90A-NEXT:    v_accvgpr_read_b32 v5, a5
+; GFX90A-NEXT:    v_accvgpr_read_b32 v6, a6
+; GFX90A-NEXT:    v_accvgpr_read_b32 v7, a7
+; GFX90A-NEXT:    v_accvgpr_read_b32 v8, a8
+; GFX90A-NEXT:    v_accvgpr_read_b32 v9, a9
+; GFX90A-NEXT:    v_accvgpr_read_b32 v10, a10
+; GFX90A-NEXT:    v_accvgpr_read_b32 v11, a11
+; GFX90A-NEXT:    v_accvgpr_read_b32 v12, a12
+; GFX90A-NEXT:    v_accvgpr_read_b32 v13, a13
+; GFX90A-NEXT:    v_accvgpr_read_b32 v14, a14
+; GFX90A-NEXT:    v_accvgpr_read_b32 v15, a15
+; GFX90A-NEXT:    v_accvgpr_read_b32 v16, a16
+; GFX90A-NEXT:    v_accvgpr_read_b32 v17, a17
+; GFX90A-NEXT:    v_accvgpr_read_b32 v18, a18
+; GFX90A-NEXT:    v_accvgpr_read_b32 v19, a19
+; GFX90A-NEXT:    v_accvgpr_read_b32 v20, a20
+; GFX90A-NEXT:    v_accvgpr_read_b32 v21, a21
+; GFX90A-NEXT:    v_accvgpr_read_b32 v22, a22
+; GFX90A-NEXT:    v_accvgpr_read_b32 v23, a23
+; GFX90A-NEXT:    v_accvgpr_read_b32 v24, a24
+; GFX90A-NEXT:    v_accvgpr_read_b32 v25, a25
+; GFX90A-NEXT:    v_accvgpr_read_b32 v26, a26
+; GFX90A-NEXT:    v_accvgpr_read_b32 v27, a27
+; GFX90A-NEXT:    v_accvgpr_read_b32 v28, a28
+; GFX90A-NEXT:    v_accvgpr_read_b32 v29, a29
+; GFX90A-NEXT:    v_accvgpr_read_b32 v30, a30
+; GFX90A-NEXT:    v_accvgpr_read_b32 v31, a31
 ; GFX90A-NEXT:    s_cbranch_scc1 .LBB6_1
 ; GFX90A-NEXT:  ; %bb.2: ; %exit
 ; GFX90A-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
 ; GFX90A-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT:    s_nop 12
 ; GFX90A-NEXT:    global_store_dwordx4 v0, a[28:31], s[0:1] offset:112
 ; GFX90A-NEXT:    global_store_dwordx4 v0, a[24:27], s[0:1] offset:96
 ; GFX90A-NEXT:    global_store_dwordx4 v0, a[20:23], s[0:1] offset:80
@@ -1709,56 +2467,118 @@ define amdgpu_kernel void @test_mfma_loop_mixed_init(ptr addrspace(1) %arg, floa
 ; GFX942-LABEL: test_mfma_loop_mixed_init:
 ; GFX942:       ; %bb.0: ; %entry
 ; GFX942-NEXT:    s_load_dword s1, s[4:5], 0x2c
-; GFX942-NEXT:    v_and_b32_e32 v2, 0x3ff, v0
-; GFX942-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX942-NEXT:    s_mov_b32 s0, 16
-; GFX942-NEXT:    v_mov_b32_e32 v0, 2.0
-; GFX942-NEXT:    v_mov_b32_e32 v1, 1.0
-; GFX942-NEXT:    v_accvgpr_write_b32 a0, v2
+; GFX942-NEXT:    v_mov_b32_e32 v2, 0
+; GFX942-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX942-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX942-NEXT:    v_accvgpr_write_b32 a1, s1
-; GFX942-NEXT:    v_accvgpr_write_b32 a2, v3
-; GFX942-NEXT:    v_accvgpr_write_b32 a3, v3
-; GFX942-NEXT:    v_accvgpr_write_b32 a4, v3
-; GFX942-NEXT:    v_accvgpr_write_b32 a5, v3
-; GFX942-NEXT:    v_accvgpr_write_b32 a6, v3
-; GFX942-NEXT:    v_accvgpr_write_b32 a7, v3
-; GFX942-NEXT:    v_accvgpr_write_b32 a8, v3
-; GFX942-NEXT:    v_accvgpr_write_b32 a9, v3
-; GFX942-NEXT:    v_accvgpr_write_b32 a10, v3
-; GFX942-NEXT:    v_accvgpr_write_b32 a11, v3
-; GFX942-NEXT:    v_accvgpr_write_b32 a12, v3
-; GFX942-NEXT:    v_accvgpr_write_b32 a13, v3
-; GFX942-NEXT:    v_accvgpr_write_b32 a14, v3
-; GFX942-NEXT:    v_accvgpr_write_b32 a15, v3
-; GFX942-NEXT:    v_accvgpr_write_b32 a16, v3
-; GFX942-NEXT:    v_accvgpr_write_b32 a17, v3
-; GFX942-NEXT:    v_accvgpr_write_b32 a18, v3
-; GFX942-NEXT:    v_accvgpr_write_b32 a19, v3
-; GFX942-NEXT:    v_accvgpr_write_b32 a20, v3
-; GFX942-NEXT:    v_accvgpr_write_b32 a21, v3
-; GFX942-NEXT:    v_accvgpr_write_b32 a22, v3
-; GFX942-NEXT:    v_accvgpr_write_b32 a23, v3
-; GFX942-NEXT:    v_accvgpr_write_b32 a24, v3
-; GFX942-NEXT:    v_accvgpr_write_b32 a25, v3
-; GFX942-NEXT:    v_accvgpr_write_b32 a26, v3
-; GFX942-NEXT:    v_accvgpr_write_b32 a27, v3
-; GFX942-NEXT:    v_accvgpr_write_b32 a28, v3
-; GFX942-NEXT:    v_accvgpr_write_b32 a29, v3
-; GFX942-NEXT:    v_accvgpr_write_b32 a30, v3
-; GFX942-NEXT:    v_accvgpr_write_b32 a31, v3
+; GFX942-NEXT:    v_mov_b32_e32 v1, s1
+; GFX942-NEXT:    v_mov_b32_e32 v4, 0
+; GFX942-NEXT:    v_mov_b32_e32 v5, 0
+; GFX942-NEXT:    v_mov_b32_e32 v6, 0
+; GFX942-NEXT:    v_mov_b32_e32 v7, 0
+; GFX942-NEXT:    v_mov_b32_e32 v8, 0
+; GFX942-NEXT:    v_mov_b32_e32 v9, 0
+; GFX942-NEXT:    v_mov_b32_e32 v10, 0
+; GFX942-NEXT:    v_mov_b32_e32 v11, 0
+; GFX942-NEXT:    v_mov_b32_e32 v12, 0
+; GFX942-NEXT:    v_mov_b32_e32 v13, 0
+; GFX942-NEXT:    v_mov_b32_e32 v14, 0
+; GFX942-NEXT:    v_mov_b32_e32 v15, 0
+; GFX942-NEXT:    v_mov_b32_e32 v16, 0
+; GFX942-NEXT:    v_mov_b32_e32 v17, 0
+; GFX942-NEXT:    v_mov_b32_e32 v18, 0
+; GFX942-NEXT:    v_mov_b32_e32 v19, 0
+; GFX942-NEXT:    v_mov_b32_e32 v20, 0
+; GFX942-NEXT:    v_mov_b32_e32 v21, 0
+; GFX942-NEXT:    v_mov_b32_e32 v22, 0
+; GFX942-NEXT:    v_mov_b32_e32 v23, 0
+; GFX942-NEXT:    v_mov_b32_e32 v24, 0
+; GFX942-NEXT:    v_mov_b32_e32 v25, 0
+; GFX942-NEXT:    v_mov_b32_e32 v26, 0
+; GFX942-NEXT:    v_mov_b32_e32 v27, 0
+; GFX942-NEXT:    v_mov_b32_e32 v28, 0
+; GFX942-NEXT:    v_mov_b32_e32 v29, 0
+; GFX942-NEXT:    v_mov_b32_e32 v30, 0
+; GFX942-NEXT:    v_mov_b32_e32 v31, 0
+; GFX942-NEXT:    v_mov_b32_e32 v32, 1.0
 ; GFX942-NEXT:  .LBB6_1: ; %for.cond.preheader
 ; GFX942-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX942-NEXT:    s_nop 1
-; GFX942-NEXT:    v_mfma_f32_32x32x1_2b_f32 a[0:31], v1, v0, a[0:31]
+; GFX942-NEXT:    v_accvgpr_write_b32 a0, v0
+; GFX942-NEXT:    v_accvgpr_write_b32 a1, v1
+; GFX942-NEXT:    v_accvgpr_write_b32 a2, v2
+; GFX942-NEXT:    v_accvgpr_write_b32 a3, v3
+; GFX942-NEXT:    v_accvgpr_write_b32 a4, v4
+; GFX942-NEXT:    v_accvgpr_write_b32 a5, v5
+; GFX942-NEXT:    v_accvgpr_write_b32 a6, v6
+; GFX942-NEXT:    v_accvgpr_write_b32 a7, v7
+; GFX942-NEXT:    v_accvgpr_write_b32 a8, v8
+; GFX942-NEXT:    v_accvgpr_write_b32 a9, v9
+; GFX942-NEXT:    v_accvgpr_write_b32 a10, v10
+; GFX942-NEXT:    v_accvgpr_write_b32 a11, v11
+; GFX942-NEXT:    v_accvgpr_write_b32 a12, v12
+; GFX942-NEXT:    v_accvgpr_write_b32 a13, v13
+; GFX942-NEXT:    v_accvgpr_write_b32 a14, v14
+; GFX942-NEXT:    v_accvgpr_write_b32 a15, v15
+; GFX942-NEXT:    v_accvgpr_write_b32 a16, v16
+; GFX942-NEXT:    v_accvgpr_write_b32 a17, v17
+; GFX942-NEXT:    v_accvgpr_write_b32 a18, v18
+; GFX942-NEXT:    v_accvgpr_write_b32 a19, v19
+; GFX942-NEXT:    v_accvgpr_write_b32 a20, v20
+; GFX942-NEXT:    v_accvgpr_write_b32 a21, v21
+; GFX942-NEXT:    v_accvgpr_write_b32 a22, v22
+; GFX942-NEXT:    v_accvgpr_write_b32 a23, v23
+; GFX942-NEXT:    v_accvgpr_write_b32 a24, v24
+; GFX942-NEXT:    v_accvgpr_write_b32 a25, v25
+; GFX942-NEXT:    v_accvgpr_write_b32 a26, v26
+; GFX942-NEXT:    v_accvgpr_write_b32 a27, v27
+; GFX942-NEXT:    v_accvgpr_write_b32 a28, v28
+; GFX942-NEXT:    v_accvgpr_write_b32 a29, v29
+; GFX942-NEXT:    v_accvgpr_write_b32 a30, v30
+; GFX942-NEXT:    v_accvgpr_write_b32 a31, v31
+; GFX942-NEXT:    v_mov_b32_e32 v0, 2.0
 ; GFX942-NEXT:    s_add_i32 s0, s0, -1
 ; GFX942-NEXT:    s_cmp_lg_u32 s0, 0
+; GFX942-NEXT:    v_mfma_f32_32x32x1_2b_f32 a[0:31], v32, v0, a[0:31]
+; GFX942-NEXT:    s_nop 15
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_accvgpr_read_b32 v0, a0
+; GFX942-NEXT:    v_accvgpr_read_b32 v1, a1
+; GFX942-NEXT:    v_accvgpr_read_b32 v2, a2
+; GFX942-NEXT:    v_accvgpr_read_b32 v3, a3
+; GFX942-NEXT:    v_accvgpr_read_b32 v4, a4
+; GFX942-NEXT:    v_accvgpr_read_b32 v5, a5
+; GFX942-NEXT:    v_accvgpr_read_b32 v6, a6
+; GFX942-NEXT:    v_accvgpr_read_b32 v7, a7
+; GFX942-NEXT:    v_accvgpr_read_b32 v8, a8
+; GFX942-NEXT:    v_accvgpr_read_b32 v9, a9
+; GFX942-NEXT:    v_accvgpr_read_b32 v10, a10
+; GFX942-NEXT:    v_accvgpr_read_b32 v11, a11
+; GFX942-NEXT:    v_accvgpr_read_b32 v12, a12
+; GFX942-NEXT:    v_accvgpr_read_b32 v13, a13
+; GFX942-NEXT:    v_accvgpr_read_b32 v14, a14
+; GFX942-NEXT:    v_accvgpr_read_b32 v15, a15
+; GFX942-NEXT:    v_accvgpr_read_b32 v16, a16
+; GFX942-NEXT:    v_accvgpr_read_b32 v17, a17
+; GFX942-NEXT:    v_accvgpr_read_b32 v18, a18
+; GFX942-NEXT:    v_accvgpr_read_b32 v19, a19
+; GFX942-NEXT:    v_accvgpr_read_b32 v20, a20
+; GFX942-NEXT:    v_accvgpr_read_b32 v21, a21
+; GFX942-NEXT:    v_accvgpr_read_b32 v22, a22
+; GFX942-NEXT:    v_accvgpr_read_b32 v23, a23
+; GFX942-NEXT:    v_accvgpr_read_b32 v24, a24
+; GFX942-NEXT:    v_accvgpr_read_b32 v25, a25
+; GFX942-NEXT:    v_accvgpr_read_b32 v26, a26
+; GFX942-NEXT:    v_accvgpr_read_b32 v27, a27
+; GFX942-NEXT:    v_accvgpr_read_b32 v28, a28
+; GFX942-NEXT:    v_accvgpr_read_b32 v29, a29
+; GFX942-NEXT:    v_accvgpr_read_b32 v30, a30
+; GFX942-NEXT:    v_accvgpr_read_b32 v31, a31
 ; GFX942-NEXT:    s_cbranch_scc1 .LBB6_1
 ; GFX942-NEXT:  ; %bb.2: ; %exit
 ; GFX942-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
 ; GFX942-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX942-NEXT:    s_nop 11
 ; GFX942-NEXT:    global_store_dwordx4 v0, a[28:31], s[0:1] offset:112
 ; GFX942-NEXT:    global_store_dwordx4 v0, a[24:27], s[0:1] offset:96
 ; GFX942-NEXT:    global_store_dwordx4 v0, a[20:23], s[0:1] offset:80
@@ -2072,73 +2892,138 @@ define amdgpu_kernel void @test_mfma_loop_agpr_init(ptr addrspace(1) %arg) #0 {
 ; GFX908-NEXT:    v_accvgpr_read_b32 v21, a21
 ; GFX908-NEXT:    v_accvgpr_read_b32 v22, a22
 ; GFX908-NEXT:    v_accvgpr_read_b32 v23, a23
-; GFX908-NEXT:    v_accvgpr_read_b32 v24, a24
-; GFX908-NEXT:    v_accvgpr_read_b32 v25, a25
-; GFX908-NEXT:    v_accvgpr_read_b32 v26, a26
-; GFX908-NEXT:    v_accvgpr_read_b32 v27, a27
-; GFX908-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX908-NEXT:    global_store_dwordx4 v32, v[28:31], s[0:1] offset:112
-; GFX908-NEXT:    global_store_dwordx4 v32, v[24:27], s[0:1] offset:96
-; GFX908-NEXT:    global_store_dwordx4 v32, v[20:23], s[0:1] offset:80
-; GFX908-NEXT:    global_store_dwordx4 v32, v[16:19], s[0:1] offset:64
-; GFX908-NEXT:    global_store_dwordx4 v32, v[12:15], s[0:1] offset:48
-; GFX908-NEXT:    global_store_dwordx4 v32, v[8:11], s[0:1] offset:32
-; GFX908-NEXT:    global_store_dwordx4 v32, v[4:7], s[0:1] offset:16
-; GFX908-NEXT:    global_store_dwordx4 v32, v[0:3], s[0:1]
-; GFX908-NEXT:    s_endpgm
-;
-; GFX90A-LABEL: test_mfma_loop_agpr_init:
-; GFX90A:       ; %bb.0: ; %entry
-; GFX90A-NEXT:    v_mov_b32_e32 v0, 1.0
-; GFX90A-NEXT:    v_mov_b32_e32 v1, 2.0
-; GFX90A-NEXT:    s_mov_b32 s0, 16
-; GFX90A-NEXT:    s_nop 0
-; GFX90A-NEXT:    v_mfma_f32_32x32x1f32 a[0:31], v0, v1, 0
-; GFX90A-NEXT:    s_nop 15
-; GFX90A-NEXT:    s_nop 2
-; GFX90A-NEXT:    v_accvgpr_mov_b32 a1, a0
-; GFX90A-NEXT:    v_accvgpr_mov_b32 a2, a0
-; GFX90A-NEXT:    v_accvgpr_mov_b32 a3, a0
-; GFX90A-NEXT:    v_accvgpr_mov_b32 a4, a0
-; GFX90A-NEXT:    v_accvgpr_mov_b32 a5, a0
-; GFX90A-NEXT:    v_accvgpr_mov_b32 a6, a0
-; GFX90A-NEXT:    v_accvgpr_mov_b32 a7, a0
-; GFX90A-NEXT:    v_accvgpr_mov_b32 a8, a0
-; GFX90A-NEXT:    v_accvgpr_mov_b32 a9, a0
-; GFX90A-NEXT:    v_accvgpr_mov_b32 a10, a0
-; GFX90A-NEXT:    v_accvgpr_mov_b32 a11, a0
-; GFX90A-NEXT:    v_accvgpr_mov_b32 a12, a0
-; GFX90A-NEXT:    v_accvgpr_mov_b32 a13, a0
-; GFX90A-NEXT:    v_accvgpr_mov_b32 a14, a0
-; GFX90A-NEXT:    v_accvgpr_mov_b32 a15, a0
-; GFX90A-NEXT:    v_accvgpr_mov_b32 a16, a0
-; GFX90A-NEXT:    v_accvgpr_mov_b32 a17, a0
-; GFX90A-NEXT:    v_accvgpr_mov_b32 a18, a0
-; GFX90A-NEXT:    v_accvgpr_mov_b32 a19, a0
-; GFX90A-NEXT:    v_accvgpr_mov_b32 a20, a0
-; GFX90A-NEXT:    v_accvgpr_mov_b32 a21, a0
-; GFX90A-NEXT:    v_accvgpr_mov_b32 a22, a0
-; GFX90A-NEXT:    v_accvgpr_mov_b32 a23, a0
-; GFX90A-NEXT:    v_accvgpr_mov_b32 a24, a0
-; GFX90A-NEXT:    v_accvgpr_mov_b32 a25, a0
-; GFX90A-NEXT:    v_accvgpr_mov_b32 a26, a0
-; GFX90A-NEXT:    v_accvgpr_mov_b32 a27, a0
-; GFX90A-NEXT:    v_accvgpr_mov_b32 a28, a0
-; GFX90A-NEXT:    v_accvgpr_mov_b32 a29, a0
-; GFX90A-NEXT:    v_accvgpr_mov_b32 a30, a0
-; GFX90A-NEXT:    v_accvgpr_mov_b32 a31, a0
+; GFX908-NEXT:    v_accvgpr_read_b32 v24, a24
+; GFX908-NEXT:    v_accvgpr_read_b32 v25, a25
+; GFX908-NEXT:    v_accvgpr_read_b32 v26, a26
+; GFX908-NEXT:    v_accvgpr_read_b32 v27, a27
+; GFX908-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX908-NEXT:    global_store_dwordx4 v32, v[28:31], s[0:1] offset:112
+; GFX908-NEXT:    global_store_dwordx4 v32, v[24:27], s[0:1] offset:96
+; GFX908-NEXT:    global_store_dwordx4 v32, v[20:23], s[0:1] offset:80
+; GFX908-NEXT:    global_store_dwordx4 v32, v[16:19], s[0:1] offset:64
+; GFX908-NEXT:    global_store_dwordx4 v32, v[12:15], s[0:1] offset:48
+; GFX908-NEXT:    global_store_dwordx4 v32, v[8:11], s[0:1] offset:32
+; GFX908-NEXT:    global_store_dwordx4 v32, v[4:7], s[0:1] offset:16
+; GFX908-NEXT:    global_store_dwordx4 v32, v[0:3], s[0:1]
+; GFX908-NEXT:    s_endpgm
+;
+; GFX90A-LABEL: test_mfma_loop_agpr_init:
+; GFX90A:       ; %bb.0: ; %entry
+; GFX90A-NEXT:    v_mov_b32_e32 v0, 1.0
+; GFX90A-NEXT:    v_mov_b32_e32 v1, 2.0
+; GFX90A-NEXT:    s_mov_b32 s0, 16
+; GFX90A-NEXT:    s_nop 0
+; GFX90A-NEXT:    v_mfma_f32_32x32x1f32 a[0:31], v0, v1, 0
+; GFX90A-NEXT:    s_nop 15
+; GFX90A-NEXT:    s_nop 2
+; GFX90A-NEXT:    v_accvgpr_read_b32 v33, a0
+; GFX90A-NEXT:    v_accvgpr_read_b32 v2, a0
+; GFX90A-NEXT:    v_accvgpr_read_b32 v3, a0
+; GFX90A-NEXT:    v_accvgpr_read_b32 v4, a0
+; GFX90A-NEXT:    v_accvgpr_read_b32 v5, a0
+; GFX90A-NEXT:    v_accvgpr_read_b32 v6, a0
+; GFX90A-NEXT:    v_accvgpr_read_b32 v7, a0
+; GFX90A-NEXT:    v_accvgpr_read_b32 v8, a0
+; GFX90A-NEXT:    v_accvgpr_read_b32 v9, a0
+; GFX90A-NEXT:    v_accvgpr_read_b32 v10, a0
+; GFX90A-NEXT:    v_accvgpr_read_b32 v11, a0
+; GFX90A-NEXT:    v_accvgpr_read_b32 v12, a0
+; GFX90A-NEXT:    v_accvgpr_read_b32 v13, a0
+; GFX90A-NEXT:    v_accvgpr_read_b32 v14, a0
+; GFX90A-NEXT:    v_accvgpr_read_b32 v15, a0
+; GFX90A-NEXT:    v_accvgpr_read_b32 v16, a0
+; GFX90A-NEXT:    v_accvgpr_read_b32 v17, a0
+; GFX90A-NEXT:    v_accvgpr_read_b32 v18, a0
+; GFX90A-NEXT:    v_accvgpr_read_b32 v19, a0
+; GFX90A-NEXT:    v_accvgpr_read_b32 v20, a0
+; GFX90A-NEXT:    v_accvgpr_read_b32 v21, a0
+; GFX90A-NEXT:    v_accvgpr_read_b32 v22, a0
+; GFX90A-NEXT:    v_accvgpr_read_b32 v23, a0
+; GFX90A-NEXT:    v_accvgpr_read_b32 v24, a0
+; GFX90A-NEXT:    v_accvgpr_read_b32 v25, a0
+; GFX90A-NEXT:    v_accvgpr_read_b32 v26, a0
+; GFX90A-NEXT:    v_accvgpr_read_b32 v27, a0
+; GFX90A-NEXT:    v_accvgpr_read_b32 v28, a0
+; GFX90A-NEXT:    v_accvgpr_read_b32 v29, a0
+; GFX90A-NEXT:    v_accvgpr_read_b32 v30, a0
+; GFX90A-NEXT:    v_accvgpr_read_b32 v31, a0
+; GFX90A-NEXT:    v_accvgpr_read_b32 v32, a0
 ; GFX90A-NEXT:  .LBB8_1: ; %for.cond.preheader
 ; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX90A-NEXT:    s_nop 1
-; GFX90A-NEXT:    v_mfma_f32_32x32x1f32 a[0:31], v0, v1, a[0:31]
+; GFX90A-NEXT:    v_accvgpr_write_b32 a0, v2
+; GFX90A-NEXT:    v_accvgpr_write_b32 a1, v3
+; GFX90A-NEXT:    v_accvgpr_write_b32 a2, v4
+; GFX90A-NEXT:    v_accvgpr_write_b32 a3, v5
+; GFX90A-NEXT:    v_accvgpr_write_b32 a4, v6
+; GFX90A-NEXT:    v_accvgpr_write_b32 a5, v7
+; GFX90A-NEXT:    v_accvgpr_write_b32 a6, v8
+; GFX90A-NEXT:    v_accvgpr_write_b32 a7, v9
+; GFX90A-NEXT:    v_accvgpr_write_b32 a8, v10
+; GFX90A-NEXT:    v_accvgpr_write_b32 a9, v11
+; GFX90A-NEXT:    v_accvgpr_write_b32 a10, v12
+; GFX90A-NEXT:    v_accvgpr_write_b32 a11, v13
+; GFX90A-NEXT:    v_accvgpr_write_b32 a12, v14
+; GFX90A-NEXT:    v_accvgpr_write_b32 a13, v15
+; GFX90A-NEXT:    v_accvgpr_write_b32 a14, v16
+; GFX90A-NEXT:    v_accvgpr_write_b32 a15, v17
+; GFX90A-NEXT:    v_accvgpr_write_b32 a16, v18
+; GFX90A-NEXT:    v_accvgpr_write_b32 a17, v19
+; GFX90A-NEXT:    v_accvgpr_write_b32 a18, v20
+; GFX90A-NEXT:    v_accvgpr_write_b32 a19, v21
+; GFX90A-NEXT:    v_accvgpr_write_b32 a20, v22
+; GFX90A-NEXT:    v_accvgpr_write_b32 a21, v23
+; GFX90A-NEXT:    v_accvgpr_write_b32 a22, v24
+; GFX90A-NEXT:    v_accvgpr_write_b32 a23, v25
+; GFX90A-NEXT:    v_accvgpr_write_b32 a24, v26
+; GFX90A-NEXT:    v_accvgpr_write_b32 a25, v27
+; GFX90A-NEXT:    v_accvgpr_write_b32 a26, v28
+; GFX90A-NEXT:    v_accvgpr_write_b32 a27, v29
+; GFX90A-NEXT:    v_accvgpr_write_b32 a28, v30
+; GFX90A-NEXT:    v_accvgpr_write_b32 a29, v31
+; GFX90A-NEXT:    v_accvgpr_write_b32 a30, v32
+; GFX90A-NEXT:    v_accvgpr_write_b32 a31, v33
 ; GFX90A-NEXT:    s_add_i32 s0, s0, -1
 ; GFX90A-NEXT:    s_cmp_lg_u32 s0, 0
+; GFX90A-NEXT:    v_mfma_f32_32x32x1f32 a[0:31], v0, v1, a[0:31]
+; GFX90A-NEXT:    s_nop 15
+; GFX90A-NEXT:    s_nop 2
+; GFX90A-NEXT:    v_accvgpr_read_b32 v2, a0
+; GFX90A-NEXT:    v_accvgpr_read_b32 v3, a1
+; GFX90A-NEXT:    v_accvgpr_read_b32 v4, a2
+; GFX90A-NEXT:    v_accvgpr_read_b32 v5, a3
+; GFX90A-NEXT:    v_accvgpr_read_b32 v6, a4
+; GFX90A-NEXT:    v_accvgpr_read_b32 v7, a5
+; GFX90A-NEXT:    v_accvgpr_read_b32 v8, a6
+; GFX90A-NEXT:    v_accvgpr_read_b32 v9, a7
+; GFX90A-NEXT:    v_accvgpr_read_b32 v10, a8
+; GFX90A-NEXT:    v_accvgpr_read_b32 v11, a9
+; GFX90A-NEXT:    v_accvgpr_read_b32 v12, a10
+; GFX90A-NEXT:    v_accvgpr_read_b32 v13, a11
+; GFX90A-NEXT:    v_accvgpr_read_b32 v14, a12
+; GFX90A-NEXT:    v_accvgpr_read_b32 v15, a13
+; GFX90A-NEXT:    v_accvgpr_read_b32 v16, a14
+; GFX90A-NEXT:    v_accvgpr_read_b32 v17, a15
+; GFX90A-NEXT:    v_accvgpr_read_b32 v18, a16
+; GFX90A-NEXT:    v_accvgpr_read_b32 v19, a17
+; GFX90A-NEXT:    v_accvgpr_read_b32 v20, a18
+; GFX90A-NEXT:    v_accvgpr_read_b32 v21, a19
+; GFX90A-NEXT:    v_accvgpr_read_b32 v22, a20
+; GFX90A-NEXT:    v_accvgpr_read_b32 v23, a21
+; GFX90A-NEXT:    v_accvgpr_read_b32 v24, a22
+; GFX90A-NEXT:    v_accvgpr_read_b32 v25, a23
+; GFX90A-NEXT:    v_accvgpr_read_b32 v26, a24
+; GFX90A-NEXT:    v_accvgpr_read_b32 v27, a25
+; GFX90A-NEXT:    v_accvgpr_read_b32 v28, a26
+; GFX90A-NEXT:    v_accvgpr_read_b32 v29, a27
+; GFX90A-NEXT:    v_accvgpr_read_b32 v30, a28
+; GFX90A-NEXT:    v_accvgpr_read_b32 v31, a29
+; GFX90A-NEXT:    v_accvgpr_read_b32 v32, a30
+; GFX90A-NEXT:    v_accvgpr_read_b32 v33, a31
 ; GFX90A-NEXT:    s_cbranch_scc1 .LBB8_1
 ; GFX90A-NEXT:  ; %bb.2: ; %exit
 ; GFX90A-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
 ; GFX90A-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT:    s_nop 12
 ; GFX90A-NEXT:    global_store_dwordx4 v0, a[28:31], s[0:1] offset:112
 ; GFX90A-NEXT:    global_store_dwordx4 v0, a[24:27], s[0:1] offset:96
 ; GFX90A-NEXT:    global_store_dwordx4 v0, a[20:23], s[0:1] offset:80
@@ -2158,49 +3043,114 @@ define amdgpu_kernel void @test_mfma_loop_agpr_init(ptr addrspace(1) %arg) #0 {
 ; GFX942-NEXT:    v_mfma_f32_32x32x1_2b_f32 a[0:31], v0, v1, 0
 ; GFX942-NEXT:    s_nop 15
 ; GFX942-NEXT:    s_nop 1
-; GFX942-NEXT:    v_accvgpr_mov_b32 a1, a0
-; GFX942-NEXT:    v_accvgpr_mov_b32 a2, a0
-; GFX942-NEXT:    v_accvgpr_mov_b32 a3, a0
-; GFX942-NEXT:    v_accvgpr_mov_b32 a4, a0
-; GFX942-NEXT:    v_accvgpr_mov_b32 a5, a0
-; GFX942-NEXT:    v_accvgpr_mov_b32 a6, a0
-; GFX942-NEXT:    v_accvgpr_mov_b32 a7, a0
-; GFX942-NEXT:    v_accvgpr_mov_b32 a8, a0
-; GFX942-NEXT:    v_accvgpr_mov_b32 a9, a0
-; GFX942-NEXT:    v_accvgpr_mov_b32 a10, a0
-; GFX942-NEXT:    v_accvgpr_mov_b32 a11, a0
-; GFX942-NEXT:    v_accvgpr_mov_b32 a12, a0
-; GFX942-NEXT:    v_accvgpr_mov_b32 a13, a0
-; GFX942-NEXT:    v_accvgpr_mov_b32 a14, a0
-; GFX942-NEXT:    v_accvgpr_mov_b32 a15, a0
-; GFX942-NEXT:    v_accvgpr_mov_b32 a16, a0
-; GFX942-NEXT:    v_accvgpr_mov_b32 a17, a0
-; GFX942-NEXT:    v_accvgpr_mov_b32 a18, a0
-; GFX942-NEXT:    v_accvgpr_mov_b32 a19, a0
-; GFX942-NEXT:    v_accvgpr_mov_b32 a20, a0
-; GFX942-NEXT:    v_accvgpr_mov_b32 a21, a0
-; GFX942-NEXT:    v_accvgpr_mov_b32 a22, a0
-; GFX942-NEXT:    v_accvgpr_mov_b32 a23, a0
-; GFX942-NEXT:    v_accvgpr_mov_b32 a24, a0
-; GFX942-NEXT:    v_accvgpr_mov_b32 a25, a0
-; GFX942-NEXT:    v_accvgpr_mov_b32 a26, a0
-; GFX942-NEXT:    v_accvgpr_mov_b32 a27, a0
-; GFX942-NEXT:    v_accvgpr_mov_b32 a28, a0
-; GFX942-NEXT:    v_accvgpr_mov_b32 a29, a0
-; GFX942-NEXT:    v_accvgpr_mov_b32 a30, a0
-; GFX942-NEXT:    v_accvgpr_mov_b32 a31, a0
+; GFX942-NEXT:    v_accvgpr_read_b32 v33, a0
+; GFX942-NEXT:    v_accvgpr_read_b32 v2, a0
+; GFX942-NEXT:    v_accvgpr_read_b32 v3, a0
+; GFX942-NEXT:    v_accvgpr_read_b32 v4, a0
+; GFX942-NEXT:    v_accvgpr_read_b32 v5, a0
+; GFX942-NEXT:    v_accvgpr_read_b32 v6, a0
+; GFX942-NEXT:    v_accvgpr_read_b32 v7, a0
+; GFX942-NEXT:    v_accvgpr_read_b32 v8, a0
+; GFX942-NEXT:    v_accvgpr_read_b32 v9, a0
+; GFX942-NEXT:    v_accvgpr_read_b32 v10, a0
+; GFX942-NEXT:    v_accvgpr_read_b32 v11, a0
+; GFX942-NEXT:    v_accvgpr_read_b32 v12, a0
+; GFX942-NEXT:    v_accvgpr_read_b32 v13, a0
+; GFX942-NEXT:    v_accvgpr_read_b32 v14, a0
+; GFX942-NEXT:    v_accvgpr_read_b32 v15, a0
+; GFX942-NEXT:    v_accvgpr_read_b32 v16, a0
+; GFX942-NEXT:    v_accvgpr_read_b32 v17, a0
+; GFX942-NEXT:    v_accvgpr_read_b32 v18, a0
+; GFX942-NEXT:    v_accvgpr_read_b32 v19, a0
+; GFX942-NEXT:    v_accvgpr_read_b32 v20, a0
+; GFX942-NEXT:    v_accvgpr_read_b32 v21, a0
+; GFX942-NEXT:    v_accvgpr_read_b32 v22, a0
+; GFX942-NEXT:    v_accvgpr_read_b32 v23, a0
+; GFX942-NEXT:    v_accvgpr_read_b32 v24, a0
+; GFX942-NEXT:    v_accvgpr_read_b32 v25, a0
+; GFX942-NEXT:    v_accvgpr_read_b32 v26, a0
+; GFX942-NEXT:    v_accvgpr_read_b32 v27, a0
+; GFX942-NEXT:    v_accvgpr_read_b32 v28, a0
+; GFX942-NEXT:    v_accvgpr_read_b32 v29, a0
+; GFX942-NEXT:    v_accvgpr_read_b32 v30, a0
+; GFX942-NEXT:    v_accvgpr_read_b32 v31, a0
+; GFX942-NEXT:    v_accvgpr_read_b32 v32, a0
 ; GFX942-NEXT:  .LBB8_1: ; %for.cond.preheader
 ; GFX942-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX942-NEXT:    s_nop 1
-; GFX942-NEXT:    v_mfma_f32_32x32x1_2b_f32 a[0:31], v0, v1, a[0:31]
+; GFX942-NEXT:    v_accvgpr_write_b32 a0, v2
+; GFX942-NEXT:    v_accvgpr_write_b32 a1, v3
+; GFX942-NEXT:    v_accvgpr_write_b32 a2, v4
+; GFX942-NEXT:    v_accvgpr_write_b32 a3, v5
+; GFX942-NEXT:    v_accvgpr_write_b32 a4, v6
+; GFX942-NEXT:    v_accvgpr_write_b32 a5, v7
+; GFX942-NEXT:    v_accvgpr_write_b32 a6, v8
+; GFX942-NEXT:    v_accvgpr_write_b32 a7, v9
+; GFX942-NEXT:    v_accvgpr_write_b32 a8, v10
+; GFX942-NEXT:    v_accvgpr_write_b32 a9, v11
+; GFX942-NEXT:    v_accvgpr_write_b32 a10, v12
+; GFX942-NEXT:    v_accvgpr_write_b32 a11, v13
+; GFX942-NEXT:    v_accvgpr_write_b32 a12, v14
+; GFX942-NEXT:    v_accvgpr_write_b32 a13, v15
+; GFX942-NEXT:    v_accvgpr_write_b32 a14, v16
+; GFX942-NEXT:    v_accvgpr_write_b32 a15, v17
+; GFX942-NEXT:    v_accvgpr_write_b32 a16, v18
+; GFX942-NEXT:    v_accvgpr_write_b32 a17, v19
+; GFX942-NEXT:    v_accvgpr_write_b32 a18, v20
+; GFX942-NEXT:    v_accvgpr_write_b32 a19, v21
+; GFX942-NEXT:    v_accvgpr_write_b32 a20, v22
+; GFX942-NEXT:    v_accvgpr_write_b32 a21, v23
+; GFX942-NEXT:    v_accvgpr_write_b32 a22, v24
+; GFX942-NEXT:    v_accvgpr_write_b32 a23, v25
+; GFX942-NEXT:    v_accvgpr_write_b32 a24, v26
+; GFX942-NEXT:    v_accvgpr_write_b32 a25, v27
+; GFX942-NEXT:    v_accvgpr_write_b32 a26, v28
+; GFX942-NEXT:    v_accvgpr_write_b32 a27, v29
+; GFX942-NEXT:    v_accvgpr_write_b32 a28, v30
+; GFX942-NEXT:    v_accvgpr_write_b32 a29, v31
+; GFX942-NEXT:    v_accvgpr_write_b32 a30, v32
+; GFX942-NEXT:    v_accvgpr_write_b32 a31, v33
 ; GFX942-NEXT:    s_add_i32 s0, s0, -1
 ; GFX942-NEXT:    s_cmp_lg_u32 s0, 0
+; GFX942-NEXT:    v_mfma_f32_32x32x1_2b_f32 a[0:31], v0, v1, a[0:31]
+; GFX942-NEXT:    s_nop 15
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_accvgpr_read_b32 v2, a0
+; GFX942-NEXT:    v_accvgpr_read_b32 v3, a1
+; GFX942-NEXT:    v_accvgpr_read_b32 v4, a2
+; GFX942-NEXT:    v_accvgpr_read_b32 v5, a3
+; GFX942-NEXT:    v_accvgpr_read_b32 v6, a4
+; GFX942-NEXT:    v_accvgpr_read_b32 v7, a5
+; GFX942-NEXT:    v_accvgpr_read_b32 v8, a6
+; GFX942-NEXT:    v_accvgpr_read_b32 v9, a7
+; GFX942-NEXT:    v_accvgpr_read_b32 v10, a8
+; GFX942-NEXT:    v_accvgpr_read_b32 v11, a9
+; GFX942-NEXT:    v_accvgpr_read_b32 v12, a10
+; GFX942-NEXT:    v_accvgpr_read_b32 v13, a11
+; GFX942-NEXT:    v_accvgpr_read_b32 v14, a12
+; GFX942-NEXT:    v_accvgpr_read_b32 v15, a13
+; GFX942-NEXT:    v_accvgpr_read_b32 v16, a14
+; GFX942-NEXT:    v_accvgpr_read_b32 v17, a15
+; GFX942-NEXT:    v_accvgpr_read_b32 v18, a16
+; GFX942-NEXT:    v_accvgpr_read_b32 v19, a17
+; GFX942-NEXT:    v_accvgpr_read_b32 v20, a18
+; GFX942-NEXT:    v_accvgpr_read_b32 v21, a19
+; GFX942-NEXT:    v_accvgpr_read_b32 v22, a20
+; GFX942-NEXT:    v_accvgpr_read_b32 v23, a21
+; GFX942-NEXT:    v_accvgpr_read_b32 v24, a22
+; GFX942-NEXT:    v_accvgpr_read_b32 v25, a23
+; GFX942-NEXT:    v_accvgpr_read_b32 v26, a24
+; GFX942-NEXT:    v_accvgpr_read_b32 v27, a25
+; GFX942-NEXT:    v_accvgpr_read_b32 v28, a26
+; GFX942-NEXT:    v_accvgpr_read_b32 v29, a27
+; GFX942-NEXT:    v_accvgpr_read_b32 v30, a28
+; GFX942-NEXT:    v_accvgpr_read_b32 v31, a29
+; GFX942-NEXT:    v_accvgpr_read_b32 v32, a30
+; GFX942-NEXT:    v_accvgpr_read_b32 v33, a31
 ; GFX942-NEXT:    s_cbranch_scc1 .LBB8_1
 ; GFX942-NEXT:  ; %bb.2: ; %exit
 ; GFX942-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
 ; GFX942-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX942-NEXT:    s_nop 11
 ; GFX942-NEXT:    global_store_dwordx4 v0, a[28:31], s[0:1] offset:112
 ; GFX942-NEXT:    global_store_dwordx4 v0, a[24:27], s[0:1] offset:96
 ; GFX942-NEXT:    global_store_dwordx4 v0, a[20:23], s[0:1] offset:80
@@ -2615,51 +3565,114 @@ define <32 x float> @test_mfma_loop_zeroinit_ret_use() #0 {
 ; GFX90A-LABEL: test_mfma_loop_zeroinit_ret_use:
 ; GFX90A:       ; %bb.0: ; %entry
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX90A-NEXT:    s_mov_b32 s4, 16
-; GFX90A-NEXT:    v_mov_b32_e32 v0, 2.0
-; GFX90A-NEXT:    v_mov_b32_e32 v1, 1.0
-; GFX90A-NEXT:    v_accvgpr_write_b32 a0, v2
-; GFX90A-NEXT:    v_accvgpr_write_b32 a1, v2
-; GFX90A-NEXT:    v_accvgpr_write_b32 a2, v2
-; GFX90A-NEXT:    v_accvgpr_write_b32 a3, v2
-; GFX90A-NEXT:    v_accvgpr_write_b32 a4, v2
-; GFX90A-NEXT:    v_accvgpr_write_b32 a5, v2
-; GFX90A-NEXT:    v_accvgpr_write_b32 a6, v2
-; GFX90A-NEXT:    v_accvgpr_write_b32 a7, v2
-; GFX90A-NEXT:    v_accvgpr_write_b32 a8, v2
-; GFX90A-NEXT:    v_accvgpr_write_b32 a9, v2
-; GFX90A-NEXT:    v_accvgpr_write_b32 a10, v2
-; GFX90A-NEXT:    v_accvgpr_write_b32 a11, v2
-; GFX90A-NEXT:    v_accvgpr_write_b32 a12, v2
-; GFX90A-NEXT:    v_accvgpr_write_b32 a13, v2
-; GFX90A-NEXT:    v_accvgpr_write_b32 a14, v2
-; GFX90A-NEXT:    v_accvgpr_write_b32 a15, v2
-; GFX90A-NEXT:    v_accvgpr_write_b32 a16, v2
-; GFX90A-NEXT:    v_accvgpr_write_b32 a17, v2
-; GFX90A-NEXT:    v_accvgpr_write_b32 a18, v2
-; GFX90A-NEXT:    v_accvgpr_write_b32 a19, v2
-; GFX90A-NEXT:    v_accvgpr_write_b32 a20, v2
-; GFX90A-NEXT:    v_accvgpr_write_b32 a21, v2
-; GFX90A-NEXT:    v_accvgpr_write_b32 a22, v2
-; GFX90A-NEXT:    v_accvgpr_write_b32 a23, v2
-; GFX90A-NEXT:    v_accvgpr_write_b32 a24, v2
-; GFX90A-NEXT:    v_accvgpr_write_b32 a25, v2
-; GFX90A-NEXT:    v_accvgpr_write_b32 a26, v2
-; GFX90A-NEXT:    v_accvgpr_write_b32 a27, v2
-; GFX90A-NEXT:    v_accvgpr_write_b32 a28, v2
-; GFX90A-NEXT:    v_accvgpr_write_b32 a29, v2
-; GFX90A-NEXT:    v_accvgpr_write_b32 a30, v2
-; GFX90A-NEXT:    v_accvgpr_write_b32 a31, v2
+; GFX90A-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NEXT:    v_mov_b32_e32 v1, 0
+; GFX90A-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NEXT:    v_mov_b32_e32 v3, 0
+; GFX90A-NEXT:    v_mov_b32_e32 v4, 0
+; GFX90A-NEXT:    v_mov_b32_e32 v5, 0
+; GFX90A-NEXT:    v_mov_b32_e32 v6, 0
+; GFX90A-NEXT:    v_mov_b32_e32 v7, 0
+; GFX90A-NEXT:    v_mov_b32_e32 v8, 0
+; GFX90A-NEXT:    v_mov_b32_e32 v9, 0
+; GFX90A-NEXT:    v_mov_b32_e32 v10, 0
+; GFX90A-NEXT:    v_mov_b32_e32 v11, 0
+; GFX90A-NEXT:    v_mov_b32_e32 v12, 0
+; GFX90A-NEXT:    v_mov_b32_e32 v13, 0
+; GFX90A-NEXT:    v_mov_b32_e32 v14, 0
+; GFX90A-NEXT:    v_mov_b32_e32 v15, 0
+; GFX90A-NEXT:    v_mov_b32_e32 v16, 0
+; GFX90A-NEXT:    v_mov_b32_e32 v17, 0
+; GFX90A-NEXT:    v_mov_b32_e32 v18, 0
+; GFX90A-NEXT:    v_mov_b32_e32 v19, 0
+; GFX90A-NEXT:    v_mov_b32_e32 v20, 0
+; GFX90A-NEXT:    v_mov_b32_e32 v21, 0
+; GFX90A-NEXT:    v_mov_b32_e32 v22, 0
+; GFX90A-NEXT:    v_mov_b32_e32 v23, 0
+; GFX90A-NEXT:    v_mov_b32_e32 v24, 0
+; GFX90A-NEXT:    v_mov_b32_e32 v25, 0
+; GFX90A-NEXT:    v_mov_b32_e32 v26, 0
+; GFX90A-NEXT:    v_mov_b32_e32 v27, 0
+; GFX90A-NEXT:    v_mov_b32_e32 v28, 0
+; GFX90A-NEXT:    v_mov_b32_e32 v29, 0
+; GFX90A-NEXT:    v_mov_b32_e32 v30, 0
+; GFX90A-NEXT:    v_mov_b32_e32 v31, 0
+; GFX90A-NEXT:    v_mov_b32_e32 v32, 1.0
 ; GFX90A-NEXT:  .LBB10_1: ; %for.cond.preheader
 ; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX90A-NEXT:    s_nop 1
-; GFX90A-NEXT:    v_mfma_f32_32x32x1f32 a[0:31], v1, v0, a[0:31]
+; GFX90A-NEXT:    v_accvgpr_write_b32 a0, v0
+; GFX90A-NEXT:    v_accvgpr_write_b32 a1, v1
+; GFX90A-NEXT:    v_accvgpr_write_b32 a2, v2
+; GFX90A-NEXT:    v_accvgpr_write_b32 a3, v3
+; GFX90A-NEXT:    v_accvgpr_write_b32 a4, v4
+; GFX90A-NEXT:    v_accvgpr_write_b32 a5, v5
+; GFX90A-NEXT:    v_accvgpr_write_b32 a6, v6
+; GFX90A-NEXT:    v_accvgpr_write_b32 a7, v7
+; GFX90A-NEXT:    v_accvgpr_write_b32 a8, v8
+; GFX90A-NEXT:    v_accvgpr_write_b32 a9, v9
+; GFX90A-NEXT:    v_accvgpr_write_b32 a10, v10
+; GFX90A-NEXT:    v_accvgpr_write_b32 a11, v11
+; GFX90A-NEXT:    v_accvgpr_write_b32 a12, v12
+; GFX90A-NEXT:    v_accvgpr_write_b32 a13, v13
+; GFX90A-NEXT:    v_accvgpr_write_b32 a14, v14
+; GFX90A-NEXT:    v_accvgpr_write_b32 a15, v15
+; GFX90A-NEXT:    v_accvgpr_write_b32 a16, v16
+; GFX90A-NEXT:    v_accvgpr_write_b32 a17, v17
+; GFX90A-NEXT:    v_accvgpr_write_b32 a18, v18
+; GFX90A-NEXT:    v_accvgpr_write_b32 a19, v19
+; GFX90A-NEXT:    v_accvgpr_write_b32 a20, v20
+; GFX90A-NEXT:    v_accvgpr_write_b32 a21, v21
+; GFX90A-NEXT:    v_accvgpr_write_b32 a22, v22
+; GFX90A-NEXT:    v_accvgpr_write_b32 a23, v23
+; GFX90A-NEXT:    v_accvgpr_write_b32 a24, v24
+; GFX90A-NEXT:    v_accvgpr_write_b32 a25, v25
+; GFX90A-NEXT:    v_accvgpr_write_b32 a26, v26
+; GFX90A-NEXT:    v_accvgpr_write_b32 a27, v27
+; GFX90A-NEXT:    v_accvgpr_write_b32 a28, v28
+; GFX90A-NEXT:    v_accvgpr_write_b32 a29, v29
+; GFX90A-NEXT:    v_accvgpr_write_b32 a30, v30
+; GFX90A-NEXT:    v_accvgpr_write_b32 a31, v31
+; GFX90A-NEXT:    v_mov_b32_e32 v0, 2.0
 ; GFX90A-NEXT:    s_add_i32 s4, s4, -1
 ; GFX90A-NEXT:    s_cmp_lg_u32 s4, 0
+; GFX90A-NEXT:    v_mfma_f32_32x32x1f32 a[0:31], v32, v0, a[0:31]
+; GFX90A-NEXT:    s_nop 15
+; GFX90A-NEXT:    s_nop 2
+; GFX90A-NEXT:    v_accvgpr_read_b32 v0, a0
+; GFX90A-NEXT:    v_accvgpr_read_b32 v1, a1
+; GFX90A-NEXT:    v_accvgpr_read_b32 v2, a2
+; GFX90A-NEXT:    v_accvgpr_read_b32 v3, a3
+; GFX90A-NEXT:    v_accvgpr_read_b32 v4, a4
+; GFX90A-NEXT:    v_accvgpr_read_b32 v5, a5
+; GFX90A-NEXT:    v_accvgpr_read_b32 v6, a6
+; GFX90A-NEXT:    v_accvgpr_read_b32 v7, a7
+; GFX90A-NEXT:    v_accvgpr_read_b32 v8, a8
+; GFX90A-NEXT:    v_accvgpr_read_b32 v9, a9
+; GFX90A-NEXT:    v_accvgpr_read_b32 v10, a10
+; GFX90A-NEXT:    v_accvgpr_read_b32 v11, a11
+; GFX90A-NEXT:    v_accvgpr_read_b32 v12, a12
+; GFX90A-NEXT:    v_accvgpr_read_b32 v13, a13
+; GFX90A-NEXT:    v_accvgpr_read_b32 v14, a14
+; GFX90A-NEXT:    v_accvgpr_read_b32 v15, a15
+; GFX90A-NEXT:    v_accvgpr_read_b32 v16, a16
+; GFX90A-NEXT:    v_accvgpr_read_b32 v17, a17
+; GFX90A-NEXT:    v_accvgpr_read_b32 v18, a18
+; GFX90A-NEXT:    v_accvgpr_read_b32 v19, a19
+; GFX90A-NEXT:    v_accvgpr_read_b32 v20, a20
+; GFX90A-NEXT:    v_accvgpr_read_b32 v21, a21
+; GFX90A-NEXT:    v_accvgpr_read_b32 v22, a22
+; GFX90A-NEXT:    v_accvgpr_read_b32 v23, a23
+; GFX90A-NEXT:    v_accvgpr_read_b32 v24, a24
+; GFX90A-NEXT:    v_accvgpr_read_b32 v25, a25
+; GFX90A-NEXT:    v_accvgpr_read_b32 v26, a26
+; GFX90A-NEXT:    v_accvgpr_read_b32 v27, a27
+; GFX90A-NEXT:    v_accvgpr_read_b32 v28, a28
+; GFX90A-NEXT:    v_accvgpr_read_b32 v29, a29
+; GFX90A-NEXT:    v_accvgpr_read_b32 v30, a30
+; GFX90A-NEXT:    v_accvgpr_read_b32 v31, a31
 ; GFX90A-NEXT:    s_cbranch_scc1 .LBB10_1
 ; GFX90A-NEXT:  ; %bb.2: ; %exit
-; GFX90A-NEXT:    s_nop 15
 ; GFX90A-NEXT:    v_accvgpr_read_b32 v0, a0
 ; GFX90A-NEXT:    v_accvgpr_read_b32 v1, a1
 ; GFX90A-NEXT:    v_accvgpr_read_b32 v2, a2
@@ -2697,51 +3710,114 @@ define <32 x float> @test_mfma_loop_zeroinit_ret_use() #0 {
 ; GFX942-LABEL: test_mfma_loop_zeroinit_ret_use:
 ; GFX942:       ; %bb.0: ; %entry
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX942-NEXT:    s_mov_b32 s0, 16
-; GFX942-NEXT:    v_mov_b32_e32 v0, 2.0
-; GFX942-NEXT:    v_mov_b32_e32 v1, 1.0
-; GFX942-NEXT:    v_accvgpr_write_b32 a0, v2
-; GFX942-NEXT:    v_accvgpr_write_b32 a1, v2
-; GFX942-NEXT:    v_accvgpr_write_b32 a2, v2
-; GFX942-NEXT:    v_accvgpr_write_b32 a3, v2
-; GFX942-NEXT:    v_accvgpr_write_b32 a4, v2
-; GFX942-NEXT:    v_accvgpr_write_b32 a5, v2
-; GFX942-NEXT:    v_accvgpr_write_b32 a6, v2
-; GFX942-NEXT:    v_accvgpr_write_b32 a7, v2
-; GFX942-NEXT:    v_accvgpr_write_b32 a8, v2
-; GFX942-NEXT:    v_accvgpr_write_b32 a9, v2
-; GFX942-NEXT:    v_accvgpr_write_b32 a10, v2
-; GFX942-NEXT:    v_accvgpr_write_b32 a11, v2
-; GFX942-NEXT:    v_accvgpr_write_b32 a12, v2
-; GFX942-NEXT:    v_accvgpr_write_b32 a13, v2
-; GFX942-NEXT:    v_accvgpr_write_b32 a14, v2
-; GFX942-NEXT:    v_accvgpr_write_b32 a15, v2
-; GFX942-NEXT:    v_accvgpr_write_b32 a16, v2
-; GFX942-NEXT:    v_accvgpr_write_b32 a17, v2
-; GFX942-NEXT:    v_accvgpr_write_b32 a18, v2
-; GFX942-NEXT:    v_accvgpr_write_b32 a19, v2
-; GFX942-NEXT:    v_accvgpr_write_b32 a20, v2
-; GFX942-NEXT:    v_accvgpr_write_b32 a21, v2
-; GFX942-NEXT:    v_accvgpr_write_b32 a22, v2
-; GFX942-NEXT:    v_accvgpr_write_b32 a23, v2
-; GFX942-NEXT:    v_accvgpr_write_b32 a24, v2
-; GFX942-NEXT:    v_accvgpr_write_b32 a25, v2
-; GFX942-NEXT:    v_accvgpr_write_b32 a26, v2
-; GFX942-NEXT:    v_accvgpr_write_b32 a27, v2
-; GFX942-NEXT:    v_accvgpr_write_b32 a28, v2
-; GFX942-NEXT:    v_accvgpr_write_b32 a29, v2
-; GFX942-NEXT:    v_accvgpr_write_b32 a30, v2
-; GFX942-NEXT:    v_accvgpr_write_b32 a31, v2
+; GFX942-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-NEXT:    v_mov_b32_e32 v1, 0
+; GFX942-NEXT:    v_mov_b32_e32 v2, 0
+; GFX942-NEXT:    v_mov_b32_e32 v3, 0
+; GFX942-NEXT:    v_mov_b32_e32 v4, 0
+; GFX942-NEXT:    v_mov_b32_e32 v5, 0
+; GFX942-NEXT:    v_mov_b32_e32 v6, 0
+; GFX942-NEXT:    v_mov_b32_e32 v7, 0
+; GFX942-NEXT:    v_mov_b32_e32 v8, 0
+; GFX942-NEXT:    v_mov_b32_e32 v9, 0
+; GFX942-NEXT:    v_mov_b32_e32 v10, 0
+; GFX942-NEXT:    v_mov_b32_e32 v11, 0
+; GFX942-NEXT:    v_mov_b32_e32 v12, 0
+; GFX942-NEXT:    v_mov_b32_e32 v13, 0
+; GFX942-NEXT:    v_mov_b32_e32 v14, 0
+; GFX942-NEXT:    v_mov_b32_e32 v15, 0
+; GFX942-NEXT:    v_mov_b32_e32 v16, 0
+; GFX942-NEXT:    v_mov_b32_e32 v17, 0
+; GFX942-NEXT:    v_mov_b32_e32 v18, 0
+; GFX942-NEXT:    v_mov_b32_e32 v19, 0
+; GFX942-NEXT:    v_mov_b32_e32 v20, 0
+; GFX942-NEXT:    v_mov_b32_e32 v21, 0
+; GFX942-NEXT:    v_mov_b32_e32 v22, 0
+; GFX942-NEXT:    v_mov_b32_e32 v23, 0
+; GFX942-NEXT:    v_mov_b32_e32 v24, 0
+; GFX942-NEXT:    v_mov_b32_e32 v25, 0
+; GFX942-NEXT:    v_mov_b32_e32 v26, 0
+; GFX942-NEXT:    v_mov_b32_e32 v27, 0
+; GFX942-NEXT:    v_mov_b32_e32 v28, 0
+; GFX942-NEXT:    v_mov_b32_e32 v29, 0
+; GFX942-NEXT:    v_mov_b32_e32 v30, 0
+; GFX942-NEXT:    v_mov_b32_e32 v31, 0
+; GFX942-NEXT:    v_mov_b32_e32 v32, 1.0
 ; GFX942-NEXT:  .LBB10_1: ; %for.cond.preheader
 ; GFX942-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX942-NEXT:    s_nop 1
-; GFX942-NEXT:    v_mfma_f32_32x32x1_2b_f32 a[0:31], v1, v0, a[0:31]
+; GFX942-NEXT:    v_accvgpr_write_b32 a0, v0
+; GFX942-NEXT:    v_accvgpr_write_b32 a1, v1
+; GFX942-NEXT:    v_accvgpr_write_b32 a2, v2
+; GFX942-NEXT:    v_accvgpr_write_b32 a3, v3
+; GFX942-NEXT:    v_accvgpr_write_b32 a4, v4
+; GFX942-NEXT:    v_accvgpr_write_b32 a5, v5
+; GFX942-NEXT:    v_accvgpr_write_b32 a6, v6
+; GFX942-NEXT:    v_accvgpr_write_b32 a7, v7
+; GFX942-NEXT:    v_accvgpr_write_b32 a8, v8
+; GFX942-NEXT:    v_accvgpr_write_b32 a9, v9
+; GFX942-NEXT:    v_accvgpr_write_b32 a10, v10
+; GFX942-NEXT:    v_accvgpr_write_b32 a11, v11
+; GFX942-NEXT:    v_accvgpr_write_b32 a12, v12
+; GFX942-NEXT:    v_accvgpr_write_b32 a13, v13
+; GFX942-NEXT:    v_accvgpr_write_b32 a14, v14
+; GFX942-NEXT:    v_accvgpr_write_b32 a15, v15
+; GFX942-NEXT:    v_accvgpr_write_b32 a16, v16
+; GFX942-NEXT:    v_accvgpr_write_b32 a17, v17
+; GFX942-NEXT:    v_accvgpr_write_b32 a18, v18
+; GFX942-NEXT:    v_accvgpr_write_b32 a19, v19
+; GFX942-NEXT:    v_accvgpr_write_b32 a20, v20
+; GFX942-NEXT:    v_accvgpr_write_b32 a21, v21
+; GFX942-NEXT:    v_accvgpr_write_b32 a22, v22
+; GFX942-NEXT:    v_accvgpr_write_b32 a23, v23
+; GFX942-NEXT:    v_accvgpr_write_b32 a24, v24
+; GFX942-NEXT:    v_accvgpr_write_b32 a25, v25
+; GFX942-NEXT:    v_accvgpr_write_b32 a26, v26
+; GFX942-NEXT:    v_accvgpr_write_b32 a27, v27
+; GFX942-NEXT:    v_accvgpr_write_b32 a28, v28
+; GFX942-NEXT:    v_accvgpr_write_b32 a29, v29
+; GFX942-NEXT:    v_accvgpr_write_b32 a30, v30
+; GFX942-NEXT:    v_accvgpr_write_b32 a31, v31
+; GFX942-NEXT:    v_mov_b32_e32 v0, 2.0
 ; GFX942-NEXT:    s_add_i32 s0, s0, -1
 ; GFX942-NEXT:    s_cmp_lg_u32 s0, 0
+; GFX942-NEXT:    v_mfma_f32_32x32x1_2b_f32 a[0:31], v32, v0, a[0:31]
+; GFX942-NEXT:    s_nop 15
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_accvgpr_read_b32 v0, a0
+; GFX942-NEXT:    v_accvgpr_read_b32 v1, a1
+; GFX942-NEXT:    v_accvgpr_read_b32 v2, a2
+; GFX942-NEXT:    v_accvgpr_read_b32 v3, a3
+; GFX942-NEXT:    v_accvgpr_read_b32 v4, a4
+; GFX942-NEXT:    v_accvgpr_read_b32 v5, a5
+; GFX942-NEXT:    v_accvgpr_read_b32 v6, a6
+; GFX942-NEXT:    v_accvgpr_read_b32 v7, a7
+; GFX942-NEXT:    v_accvgpr_read_b32 v8, a8
+; GFX942-NEXT:    v_accvgpr_read_b32 v9, a9
+; GFX942-NEXT:    v_accvgpr_read_b32 v10, a10
+; GFX942-NEXT:    v_accvgpr_read_b32 v11, a11
+; GFX942-NEXT:    v_accvgpr_read_b32 v12, a12
+; GFX942-NEXT:    v_accvgpr_read_b32 v13, a13
+; GFX942-NEXT:    v_accvgpr_read_b32 v14, a14
+; GFX942-NEXT:    v_accvgpr_read_b32 v15, a15
+; GFX942-NEXT:    v_accvgpr_read_b32 v16, a16
+; GFX942-NEXT:    v_accvgpr_read_b32 v17, a17
+; GFX942-NEXT:    v_accvgpr_read_b32 v18, a18
+; GFX942-NEXT:    v_accvgpr_read_b32 v19, a19
+; GFX942-NEXT:    v_accvgpr_read_b32 v20, a20
+; GFX942-NEXT:    v_accvgpr_read_b32 v21, a21
+; GFX942-NEXT:    v_accvgpr_read_b32 v22, a22
+; GFX942-NEXT:    v_accvgpr_read_b32 v23, a23
+; GFX942-NEXT:    v_accvgpr_read_b32 v24, a24
+; GFX942-NEXT:    v_accvgpr_read_b32 v25, a25
+; GFX942-NEXT:    v_accvgpr_read_b32 v26, a26
+; GFX942-NEXT:    v_accvgpr_read_b32 v27, a27
+; GFX942-NEXT:    v_accvgpr_read_b32 v28, a28
+; GFX942-NEXT:    v_accvgpr_read_b32 v29, a29
+; GFX942-NEXT:    v_accvgpr_read_b32 v30, a30
+; GFX942-NEXT:    v_accvgpr_read_b32 v31, a31
 ; GFX942-NEXT:    s_cbranch_scc1 .LBB10_1
 ; GFX942-NEXT:  ; %bb.2: ; %exit
-; GFX942-NEXT:    s_nop 14
 ; GFX942-NEXT:    v_accvgpr_read_b32 v0, a0
 ; GFX942-NEXT:    v_accvgpr_read_b32 v1, a1
 ; GFX942-NEXT:    v_accvgpr_read_b32 v2, a2
@@ -2875,51 +3951,114 @@ define <32 x float> @test_mfma_loop_non_splat_ret_use() #0 {
 ; GFX90A-LABEL: test_mfma_loop_non_splat_ret_use:
 ; GFX90A:       ; %bb.0: ; %entry
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT:    v_mov_b32_e32 v0, 1.0
-; GFX90A-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX90A-NEXT:    s_mov_b32 s4, 16
-; GFX90A-NEXT:    v_mov_b32_e32 v1, 2.0
-; GFX90A-NEXT:    v_accvgpr_write_b32 a0, v2
-; GFX90A-NEXT:    v_accvgpr_write_b32 a1, v0
-; GFX90A-NEXT:    v_accvgpr_write_b32 a2, v2
-; GFX90A-NEXT:    v_accvgpr_write_b32 a3, v2
-; GFX90A-NEXT:    v_accvgpr_write_b32 a4, v2
-; GFX90A-NEXT:    v_accvgpr_write_b32 a5, v2
-; GFX90A-NEXT:    v_accvgpr_write_b32 a6, v2
-; GFX90A-NEXT:    v_accvgpr_write_b32 a7, v2
-; GFX90A-NEXT:    v_accvgpr_write_b32 a8, v2
-; GFX90A-NEXT:    v_accvgpr_write_b32 a9, v2
-; GFX90A-NEXT:    v_accvgpr_write_b32 a10, v2
-; GFX90A-NEXT:    v_accvgpr_write_b32 a11, v2
-; GFX90A-NEXT:    v_accvgpr_write_b32 a12, v2
-; GFX90A-NEXT:    v_accvgpr_write_b32 a13, v2
-; GFX90A-NEXT:    v_accvgpr_write_b32 a14, v2
-; GFX90A-NEXT:    v_accvgpr_write_b32 a15, v2
-; GFX90A-NEXT:    v_accvgpr_write_b32 a16, v2
-; GFX90A-NEXT:    v_accvgpr_write_b32 a17, v2
-; GFX90A-NEXT:    v_accvgpr_write_b32 a18, v2
-; GFX90A-NEXT:    v_accvgpr_write_b32 a19, v2
-; GFX90A-NEXT:    v_accvgpr_write_b32 a20, v2
-; GFX90A-NEXT:    v_accvgpr_write_b32 a21, v2
-; GFX90A-NEXT:    v_accvgpr_write_b32 a22, v2
-; GFX90A-NEXT:    v_accvgpr_write_b32 a23, v2
-; GFX90A-NEXT:    v_accvgpr_write_b32 a24, v2
-; GFX90A-NEXT:    v_accvgpr_write_b32 a25, v2
-; GFX90A-NEXT:    v_accvgpr_write_b32 a26, v2
-; GFX90A-NEXT:    v_accvgpr_write_b32 a27, v2
-; GFX90A-NEXT:    v_accvgpr_write_b32 a28, v2
-; GFX90A-NEXT:    v_accvgpr_write_b32 a29, v2
-; GFX90A-NEXT:    v_accvgpr_write_b32 a30, v2
-; GFX90A-NEXT:    v_accvgpr_write_b32 a31, v2
+; GFX90A-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NEXT:    v_mov_b32_e32 v1, 1.0
+; GFX90A-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NEXT:    v_mov_b32_e32 v3, 0
+; GFX90A-NEXT:    v_mov_b32_e32 v4, 0
+; GFX90A-NEXT:    v_mov_b32_e32 v5, 0
+; GFX90A-NEXT:    v_mov_b32_e32 v6, 0
+; GFX90A-NEXT:    v_mov_b32_e32 v7, 0
+; GFX90A-NEXT:    v_mov_b32_e32 v8, 0
+; GFX90A-NEXT:    v_mov_b32_e32 v9, 0
+; GFX90A-NEXT:    v_mov_b32_e32 v10, 0
+; GFX90A-NEXT:    v_mov_b32_e32 v11, 0
+; GFX90A-NEXT:    v_mov_b32_e32 v12, 0
+; GFX90A-NEXT:    v_mov_b32_e32 v13, 0
+; GFX90A-NEXT:    v_mov_b32_e32 v14, 0
+; GFX90A-NEXT:    v_mov_b32_e32 v15, 0
+; GFX90A-NEXT:    v_mov_b32_e32 v16, 0
+; GFX90A-NEXT:    v_mov_b32_e32 v17, 0
+; GFX90A-NEXT:    v_mov_b32_e32 v18, 0
+; GFX90A-NEXT:    v_mov_b32_e32 v19, 0
+; GFX90A-NEXT:    v_mov_b32_e32 v20, 0
+; GFX90A-NEXT:    v_mov_b32_e32 v21, 0
+; GFX90A-NEXT:    v_mov_b32_e32 v22, 0
+; GFX90A-NEXT:    v_mov_b32_e32 v23, 0
+; GFX90A-NEXT:    v_mov_b32_e32 v24, 0
+; GFX90A-NEXT:    v_mov_b32_e32 v25, 0
+; GFX90A-NEXT:    v_mov_b32_e32 v26, 0
+; GFX90A-NEXT:    v_mov_b32_e32 v27, 0
+; GFX90A-NEXT:    v_mov_b32_e32 v28, 0
+; GFX90A-NEXT:    v_mov_b32_e32 v29, 0
+; GFX90A-NEXT:    v_mov_b32_e32 v30, 0
+; GFX90A-NEXT:    v_mov_b32_e32 v31, 0
+; GFX90A-NEXT:    v_mov_b32_e32 v32, 1.0
 ; GFX90A-NEXT:  .LBB11_1: ; %for.cond.preheader
 ; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX90A-NEXT:    s_nop 1
-; GFX90A-NEXT:    v_mfma_f32_32x32x1f32 a[0:31], v0, v1, a[0:31]
+; GFX90A-NEXT:    v_accvgpr_write_b32 a0, v0
+; GFX90A-NEXT:    v_accvgpr_write_b32 a1, v1
+; GFX90A-NEXT:    v_accvgpr_write_b32 a2, v2
+; GFX90A-NEXT:    v_accvgpr_write_b32 a3, v3
+; GFX90A-NEXT:    v_accvgpr_write_b32 a4, v4
+; GFX90A-NEXT:    v_accvgpr_write_b32 a5, v5
+; GFX90A-NEXT:    v_accvgpr_write_b32 a6, v6
+; GFX90A-NEXT:    v_accvgpr_write_b32 a7, v7
+; GFX90A-NEXT:    v_accvgpr_write_b32 a8, v8
+; GFX90A-NEXT:    v_accvgpr_write_b32 a9, v9
+; GFX90A-NEXT:    v_accvgpr_write_b32 a10, v10
+; GFX90A-NEXT:    v_accvgpr_write_b32 a11, v11
+; GFX90A-NEXT:    v_accvgpr_write_b32 a12, v12
+; GFX90A-NEXT:    v_accvgpr_write_b32 a13, v13
+; GFX90A-NEXT:    v_accvgpr_write_b32 a14, v14
+; GFX90A-NEXT:    v_accvgpr_write_b32 a15, v15
+; GFX90A-NEXT:    v_accvgpr_write_b32 a16, v16
+; GFX90A-NEXT:    v_accvgpr_write_b32 a17, v17
+; GFX90A-NEXT:    v_accvgpr_write_b32 a18, v18
+; GFX90A-NEXT:    v_accvgpr_write_b32 a19, v19
+; GFX90A-NEXT:    v_accvgpr_write_b32 a20, v20
+; GFX90A-NEXT:    v_accvgpr_write_b32 a21, v21
+; GFX90A-NEXT:    v_accvgpr_write_b32 a22, v22
+; GFX90A-NEXT:    v_accvgpr_write_b32 a23, v23
+; GFX90A-NEXT:    v_accvgpr_write_b32 a24, v24
+; GFX90A-NEXT:    v_accvgpr_write_b32 a25, v25
+; GFX90A-NEXT:    v_accvgpr_write_b32 a26, v26
+; GFX90A-NEXT:    v_accvgpr_write_b32 a27, v27
+; GFX90A-NEXT:    v_accvgpr_write_b32 a28, v28
+; GFX90A-NEXT:    v_accvgpr_write_b32 a29, v29
+; GFX90A-NEXT:    v_accvgpr_write_b32 a30, v30
+; GFX90A-NEXT:    v_accvgpr_write_b32 a31, v31
+; GFX90A-NEXT:    v_mov_b32_e32 v0, 2.0
 ; GFX90A-NEXT:    s_add_i32 s4, s4, -1
 ; GFX90A-NEXT:    s_cmp_lg_u32 s4, 0
+; GFX90A-NEXT:    v_mfma_f32_32x32x1f32 a[0:31], v32, v0, a[0:31]
+; GFX90A-NEXT:    s_nop 15
+; GFX90A-NEXT:    s_nop 2
+; GFX90A-NEXT:    v_accvgpr_read_b32 v0, a0
+; GFX90A-NEXT:    v_accvgpr_read_b32 v1, a1
+; GFX90A-NEXT:    v_accvgpr_read_b32 v2, a2
+; GFX90A-NEXT:    v_accvgpr_read_b32 v3, a3
+; GFX90A-NEXT:    v_accvgpr_read_b32 v4, a4
+; GFX90A-NEXT:    v_accvgpr_read_b32 v5, a5
+; GFX90A-NEXT:    v_accvgpr_read_b32 v6, a6
+; GFX90A-NEXT:    v_accvgpr_read_b32 v7, a7
+; GFX90A-NEXT:    v_accvgpr_read_b32 v8, a8
+; GFX90A-NEXT:    v_accvgpr_read_b32 v9, a9
+; GFX90A-NEXT:    v_accvgpr_read_b32 v10, a10
+; GFX90A-NEXT:    v_accvgpr_read_b32 v11, a11
+; GFX90A-NEXT:    v_accvgpr_read_b32 v12, a12
+; GFX90A-NEXT:    v_accvgpr_read_b32 v13, a13
+; GFX90A-NEXT:    v_accvgpr_read_b32 v14, a14
+; GFX90A-NEXT:    v_accvgpr_read_b32 v15, a15
+; GFX90A-NEXT:    v_accvgpr_read_b32 v16, a16
+; GFX90A-NEXT:    v_accvgpr_read_b32 v17, a17
+; GFX90A-NEXT:    v_accvgpr_read_b32 v18, a18
+; GFX90A-NEXT:    v_accvgpr_read_b32 v19, a19
+; GFX90A-NEXT:    v_accvgpr_read_b32 v20, a20
+; GFX90A-NEXT:    v_accvgpr_read_b32 v21, a21
+; GFX90A-NEXT:    v_accvgpr_read_b32 v22, a22
+; GFX90A-NEXT:    v_accvgpr_read_b32 v23, a23
+; GFX90A-NEXT:    v_accvgpr_read_b32 v24, a24
+; GFX90A-NEXT:    v_accvgpr_read_b32 v25, a25
+; GFX90A-NEXT:    v_accvgpr_read_b32 v26, a26
+; GFX90A-NEXT:    v_accvgpr_read_b32 v27, a27
+; GFX90A-NEXT:    v_accvgpr_read_b32 v28, a28
+; GFX90A-NEXT:    v_accvgpr_read_b32 v29, a29
+; GFX90A-NEXT:    v_accvgpr_read_b32 v30, a30
+; GFX90A-NEXT:    v_accvgpr_read_b32 v31, a31
 ; GFX90A-NEXT:    s_cbranch_scc1 .LBB11_1
 ; GFX90A-NEXT:  ; %bb.2: ; %exit
-; GFX90A-NEXT:    s_nop 15
 ; GFX90A-NEXT:    v_accvgpr_read_b32 v0, a0
 ; GFX90A-NEXT:    v_accvgpr_read_b32 v1, a1
 ; GFX90A-NEXT:    v_accvgpr_read_b32 v2, a2
@@ -2957,51 +4096,114 @@ define <32 x float> @test_mfma_loop_non_splat_ret_use() #0 {
 ; GFX942-LABEL: test_mfma_loop_non_splat_ret_use:
 ; GFX942:       ; %bb.0: ; %entry
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT:    v_mov_b32_e32 v0, 1.0
-; GFX942-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX942-NEXT:    s_mov_b32 s0, 16
-; GFX942-NEXT:    v_mov_b32_e32 v1, 2.0
-; GFX942-NEXT:    v_accvgpr_write_b32 a0, v2
-; GFX942-NEXT:    v_accvgpr_write_b32 a1, v0
-; GFX942-NEXT:    v_accvgpr_write_b32 a2, v2
-; GFX942-NEXT:    v_accvgpr_write_b32 a3, v2
-; GFX942-NEXT:    v_accvgpr_write_b32 a4, v2
-; GFX942-NEXT:    v_accvgpr_write_b32 a5, v2
-; GFX942-NEXT:    v_accvgpr_write_b32 a6, v2
-; GFX942-NEXT:    v_accvgpr_write_b32 a7, v2
-; GFX942-NEXT:    v_accvgpr_write_b32 a8, v2
-; GFX942-NEXT:    v_accvgpr_write_b32 a9, v2
-; GFX942-NEXT:    v_accvgpr_write_b32 a10, v2
-; GFX942-NEXT:    v_accvgpr_write_b32 a11, v2
-; GFX942-NEXT:    v_accvgpr_write_b32 a12, v2
-; GFX942-NEXT:    v_accvgpr_write_b32 a13, v2
-; GFX942-NEXT:    v_accvgpr_write_b32 a14, v2
-; GFX942-NEXT:    v_accvgpr_write_b32 a15, v2
-; GFX942-NEXT:    v_accvgpr_write_b32 a16, v2
-; GFX942-NEXT:    v_accvgpr_write_b32 a17, v2
-; GFX942-NEXT:    v_accvgpr_write_b32 a18, v2
-; GFX942-NEXT:    v_accvgpr_write_b32 a19, v2
-; GFX942-NEXT:    v_accvgpr_write_b32 a20, v2
-; GFX942-NEXT:    v_accvgpr_write_b32 a21, v2
-; GFX942-NEXT:    v_accvgpr_write_b32 a22, v2
-; GFX942-NEXT:    v_accvgpr_write_b32 a23, v2
-; GFX942-NEXT:    v_accvgpr_write_b32 a24, v2
-; GFX942-NEXT:    v_accvgpr_write_b32 a25, v2
-; GFX942-NEXT:    v_accvgpr_write_b32 a26, v2
-; GFX942-NEXT:    v_accvgpr_write_b32 a27, v2
-; GFX942-NEXT:    v_accvgpr_write_b32 a28, v2
-; GFX942-NEXT:    v_accvgpr_write_b32 a29, v2
-; GFX942-NEXT:    v_accvgpr_write_b32 a30, v2
-; GFX942-NEXT:    v_accvgpr_write_b32 a31, v2
+; GFX942-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-NEXT:    v_mov_b32_e32 v1, 1.0
+; GFX942-NEXT:    v_mov_b32_e32 v2, 0
+; GFX942-NEXT:    v_mov_b32_e32 v3, 0
+; GFX942-NEXT:    v_mov_b32_e32 v4, 0
+; GFX942-NEXT:    v_mov_b32_e32 v5, 0
+; GFX942-NEXT:    v_mov_b32_e32 v6, 0
+; GFX942-NEXT:    v_mov_b32_e32 v7, 0
+; GFX942-NEXT:    v_mov_b32_e32 v8, 0
+; GFX942-NEXT:    v_mov_b32_e32 v9, 0
+; GFX942-NEXT:    v_mov_b32_e32 v10, 0
+; GFX942-NEXT:    v_mov_b32_e32 v11, 0
+; GFX942-NEXT:    v_mov_b32_e32 v12, 0
+; GFX942-NEXT:    v_mov_b32_e32 v13, 0
+; GFX942-NEXT:    v_mov_b32_e32 v14, 0
+; GFX942-NEXT:    v_mov_b32_e32 v15, 0
+; GFX942-NEXT:    v_mov_b32_e32 v16, 0
+; GFX942-NEXT:    v_mov_b32_e32 v17, 0
+; GFX942-NEXT:    v_mov_b32_e32 v18, 0
+; GFX942-NEXT:    v_mov_b32_e32 v19, 0
+; GFX942-NEXT:    v_mov_b32_e32 v20, 0
+; GFX942-NEXT:    v_mov_b32_e32 v21, 0
+; GFX942-NEXT:    v_mov_b32_e32 v22, 0
+; GFX942-NEXT:    v_mov_b32_e32 v23, 0
+; GFX942-NEXT:    v_mov_b32_e32 v24, 0
+; GFX942-NEXT:    v_mov_b32_e32 v25, 0
+; GFX942-NEXT:    v_mov_b32_e32 v26, 0
+; GFX942-NEXT:    v_mov_b32_e32 v27, 0
+; GFX942-NEXT:    v_mov_b32_e32 v28, 0
+; GFX942-NEXT:    v_mov_b32_e32 v29, 0
+; GFX942-NEXT:    v_mov_b32_e32 v30, 0
+; GFX942-NEXT:    v_mov_b32_e32 v31, 0
+; GFX942-NEXT:    v_mov_b32_e32 v32, 1.0
 ; GFX942-NEXT:  .LBB11_1: ; %for.cond.preheader
 ; GFX942-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX942-NEXT:    s_nop 1
-; GFX942-NEXT:    v_mfma_f32_32x32x1_2b_f32 a[0:31], v0, v1, a[0:31]
+; GFX942-NEXT:    v_accvgpr_write_b32 a0, v0
+; GFX942-NEXT:    v_accvgpr_write_b32 a1, v1
+; GFX942-NEXT:    v_accvgpr_write_b32 a2, v2
+; GFX942-NEXT:    v_accvgpr_write_b32 a3, v3
+; GFX942-NEXT:    v_accvgpr_write_b32 a4, v4
+; GFX942-NEXT:    v_accvgpr_write_b32 a5, v5
+; GFX942-NEXT:    v_accvgpr_write_b32 a6, v6
+; GFX942-NEXT:    v_accvgpr_write_b32 a7, v7
+; GFX942-NEXT:    v_accvgpr_write_b32 a8, v8
+; GFX942-NEXT:    v_accvgpr_write_b32 a9, v9
+; GFX942-NEXT:    v_accvgpr_write_b32 a10, v10
+; GFX942-NEXT:    v_accvgpr_write_b32 a11, v11
+; GFX942-NEXT:    v_accvgpr_write_b32 a12, v12
+; GFX942-NEXT:    v_accvgpr_write_b32 a13, v13
+; GFX942-NEXT:    v_accvgpr_write_b32 a14, v14
+; GFX942-NEXT:    v_accvgpr_write_b32 a15, v15
+; GFX942-NEXT:    v_accvgpr_write_b32 a16, v16
+; GFX942-NEXT:    v_accvgpr_write_b32 a17, v17
+; GFX942-NEXT:    v_accvgpr_write_b32 a18, v18
+; GFX942-NEXT:    v_accvgpr_write_b32 a19, v19
+; GFX942-NEXT:    v_accvgpr_write_b32 a20, v20
+; GFX942-NEXT:    v_accvgpr_write_b32 a21, v21
+; GFX942-NEXT:    v_accvgpr_write_b32 a22, v22
+; GFX942-NEXT:    v_accvgpr_write_b32 a23, v23
+; GFX942-NEXT:    v_accvgpr_write_b32 a24, v24
+; GFX942-NEXT:    v_accvgpr_write_b32 a25, v25
+; GFX942-NEXT:    v_accvgpr_write_b32 a26, v26
+; GFX942-NEXT:    v_accvgpr_write_b32 a27, v27
+; GFX942-NEXT:    v_accvgpr_write_b32 a28, v28
+; GFX942-NEXT:    v_accvgpr_write_b32 a29, v29
+; GFX942-NEXT:    v_accvgpr_write_b32 a30, v30
+; GFX942-NEXT:    v_accvgpr_write_b32 a31, v31
+; GFX942-NEXT:    v_mov_b32_e32 v0, 2.0
 ; GFX942-NEXT:    s_add_i32 s0, s0, -1
 ; GFX942-NEXT:    s_cmp_lg_u32 s0, 0
+; GFX942-NEXT:    v_mfma_f32_32x32x1_2b_f32 a[0:31], v32, v0, a[0:31]
+; GFX942-NEXT:    s_nop 15
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_accvgpr_read_b32 v0, a0
+; GFX942-NEXT:    v_accvgpr_read_b32 v1, a1
+; GFX942-NEXT:    v_accvgpr_read_b32 v2, a2
+; GFX942-NEXT:    v_accvgpr_read_b32 v3, a3
+; GFX942-NEXT:    v_accvgpr_read_b32 v4, a4
+; GFX942-NEXT:    v_accvgpr_read_b32 v5, a5
+; GFX942-NEXT:    v_accvgpr_read_b32 v6, a6
+; GFX942-NEXT:    v_accvgpr_read_b32 v7, a7
+; GFX942-NEXT:    v_accvgpr_read_b32 v8, a8
+; GFX942-NEXT:    v_accvgpr_read_b32 v9, a9
+; GFX942-NEXT:    v_accvgpr_read_b32 v10, a10
+; GFX942-NEXT:    v_accvgpr_read_b32 v11, a11
+; GFX942-NEXT:    v_accvgpr_read_b32 v12, a12
+; GFX942-NEXT:    v_accvgpr_read_b32 v13, a13
+; GFX942-NEXT:    v_accvgpr_read_b32 v14, a14
+; GFX942-NEXT:    v_accvgpr_read_b32 v15, a15
+; GFX942-NEXT:    v_accvgpr_read_b32 v16, a16
+; GFX942-NEXT:    v_accvgpr_read_b32 v17, a17
+; GFX942-NEXT:    v_accvgpr_read_b32 v18, a18
+; GFX942-NEXT:    v_accvgpr_read_b32 v19, a19
+; GFX942-NEXT:    v_accvgpr_read_b32 v20, a20
+; GFX942-NEXT:    v_accvgpr_read_b32 v21, a21
+; GFX942-NEXT:    v_accvgpr_read_b32 v22, a22
+; GFX942-NEXT:    v_accvgpr_read_b32 v23, a23
+; GFX942-NEXT:    v_accvgpr_read_b32 v24, a24
+; GFX942-NEXT:    v_accvgpr_read_b32 v25, a25
+; GFX942-NEXT:    v_accvgpr_read_b32 v26, a26
+; GFX942-NEXT:    v_accvgpr_read_b32 v27, a27
+; GFX942-NEXT:    v_accvgpr_read_b32 v28, a28
+; GFX942-NEXT:    v_accvgpr_read_b32 v29, a29
+; GFX942-NEXT:    v_accvgpr_read_b32 v30, a30
+; GFX942-NEXT:    v_accvgpr_read_b32 v31, a31
 ; GFX942-NEXT:    s_cbranch_scc1 .LBB11_1
 ; GFX942-NEXT:  ; %bb.2: ; %exit
-; GFX942-NEXT:    s_nop 14
 ; GFX942-NEXT:    v_accvgpr_read_b32 v0, a0
 ; GFX942-NEXT:    v_accvgpr_read_b32 v1, a1
 ; GFX942-NEXT:    v_accvgpr_read_b32 v2, a2

diff  --git a/llvm/test/CodeGen/AMDGPU/no-fold-accvgpr-mov.ll b/llvm/test/CodeGen/AMDGPU/no-fold-accvgpr-mov.ll
index 2462414992e36..12efca7dcadb5 100644
--- a/llvm/test/CodeGen/AMDGPU/no-fold-accvgpr-mov.ll
+++ b/llvm/test/CodeGen/AMDGPU/no-fold-accvgpr-mov.ll
@@ -6,8 +6,8 @@ define amdgpu_kernel void @matmul_kernel(i32 %a0, i32 %a1) {
 ; GFX942-LABEL: matmul_kernel:
 ; GFX942:       ; %bb.0: ; %entry
 ; GFX942-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX942-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX942-NEXT:    s_mov_b32 s2, 0
+; GFX942-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX942-NEXT:    v_accvgpr_write_b32 a1, 0
 ; GFX942-NEXT:    s_mov_b32 s3, 0
 ; GFX942-NEXT:    s_waitcnt lgkmcnt(0)

diff  --git a/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll b/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll
index 098a60dd61a1c..1156f2718cf1e 100644
--- a/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll
+++ b/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll
@@ -722,8 +722,8 @@ define hidden amdgpu_kernel void @clmem_read(ptr addrspace(1)  %buffer) {
 ; GFX90A-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v2, vcc
 ; GFX90A-NEXT:    v_add_co_u32_e32 v2, vcc, 0x2800, v1
 ; GFX90A-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
-; GFX90A-NEXT:    v_pk_mov_b32 v[4:5], 0, 0
 ; GFX90A-NEXT:    v_mov_b32_e32 v1, 0x7f
+; GFX90A-NEXT:    v_pk_mov_b32 v[4:5], 0, 0
 ; GFX90A-NEXT:    s_movk_i32 s2, 0xf000
 ; GFX90A-NEXT:    s_movk_i32 s3, 0x1000
 ; GFX90A-NEXT:    s_movk_i32 s4, 0x2000

diff  --git a/llvm/test/CodeGen/AMDGPU/tuple-allocation-failure.ll b/llvm/test/CodeGen/AMDGPU/tuple-allocation-failure.ll
index f196004e7660b..6b0ede1ac3ab8 100644
--- a/llvm/test/CodeGen/AMDGPU/tuple-allocation-failure.ll
+++ b/llvm/test/CodeGen/AMDGPU/tuple-allocation-failure.ll
@@ -113,16 +113,16 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
 ; GLOBALNESS1-NEXT:    v_readlane_b32 s6, v57, 6
 ; GLOBALNESS1-NEXT:    v_readlane_b32 s7, v57, 7
 ; GLOBALNESS1-NEXT:    s_and_b64 vcc, exec, s[6:7]
-; GLOBALNESS1-NEXT:    s_cbranch_vccz .LBB1_28
+; GLOBALNESS1-NEXT:    s_cbranch_vccz .LBB1_3
 ; GLOBALNESS1-NEXT:  .LBB1_2: ; %Flow15
 ; GLOBALNESS1-NEXT:    ; in Loop: Header=BB1_4 Depth=1
 ; GLOBALNESS1-NEXT:    s_or_b64 exec, exec, s[4:5]
-; GLOBALNESS1-NEXT:    s_mov_b64 s[6:7], 0
-; GLOBALNESS1-NEXT:  .LBB1_3: ; %Flow28
-; GLOBALNESS1-NEXT:    ; in Loop: Header=BB1_4 Depth=1
-; GLOBALNESS1-NEXT:    s_and_b64 vcc, exec, s[6:7]
 ; GLOBALNESS1-NEXT:    v_pk_mov_b32 v[58:59], v[0:1], v[0:1] op_sel:[0,1]
-; GLOBALNESS1-NEXT:    s_cbranch_vccnz .LBB1_29
+; GLOBALNESS1-NEXT:    s_branch .LBB1_4
+; GLOBALNESS1-NEXT:  .LBB1_3: ; %bb73.i
+; GLOBALNESS1-NEXT:    ; in Loop: Header=BB1_4 Depth=1
+; GLOBALNESS1-NEXT:    global_store_dwordx2 v[44:45], v[42:43], off
+; GLOBALNESS1-NEXT:    s_branch .LBB1_2
 ; GLOBALNESS1-NEXT:  .LBB1_4: ; %bb5
 ; GLOBALNESS1-NEXT:    ; =>This Loop Header: Depth=1
 ; GLOBALNESS1-NEXT:    ; Child Loop BB1_16 Depth 2
@@ -171,10 +171,8 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
 ; GLOBALNESS1-NEXT:    s_cselect_b64 s[4:5], -1, 0
 ; GLOBALNESS1-NEXT:  .LBB1_9: ; %Flow25
 ; GLOBALNESS1-NEXT:    ; in Loop: Header=BB1_4 Depth=1
-; GLOBALNESS1-NEXT:    s_mov_b64 s[6:7], -1
 ; GLOBALNESS1-NEXT:    s_and_b64 vcc, exec, s[4:5]
-; GLOBALNESS1-NEXT:    ; implicit-def: $vgpr0_vgpr1
-; GLOBALNESS1-NEXT:    s_cbranch_vccz .LBB1_3
+; GLOBALNESS1-NEXT:    s_cbranch_vccz .LBB1_24
 ; GLOBALNESS1-NEXT:  ; %bb.10: ; %baz.exit.i
 ; GLOBALNESS1-NEXT:    ; in Loop: Header=BB1_4 Depth=1
 ; GLOBALNESS1-NEXT:    flat_load_dword v0, v[44:45]
@@ -183,7 +181,7 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
 ; GLOBALNESS1-NEXT:    v_mov_b32_e32 v0, 0
 ; GLOBALNESS1-NEXT:    v_mov_b32_e32 v1, 0x3ff00000
 ; GLOBALNESS1-NEXT:    s_and_saveexec_b64 s[52:53], s[86:87]
-; GLOBALNESS1-NEXT:    s_cbranch_execz .LBB1_25
+; GLOBALNESS1-NEXT:    s_cbranch_execz .LBB1_26
 ; GLOBALNESS1-NEXT:  ; %bb.11: ; %bb33.i
 ; GLOBALNESS1-NEXT:    ; in Loop: Header=BB1_4 Depth=1
 ; GLOBALNESS1-NEXT:    global_load_dwordx2 v[0:1], v[44:45], off
@@ -212,7 +210,7 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
 ; GLOBALNESS1-NEXT:  .LBB1_15: ; %bb63.i
 ; GLOBALNESS1-NEXT:    ; in Loop: Header=BB1_16 Depth=2
 ; GLOBALNESS1-NEXT:    s_and_b64 vcc, exec, s[68:69]
-; GLOBALNESS1-NEXT:    s_cbranch_vccz .LBB1_24
+; GLOBALNESS1-NEXT:    s_cbranch_vccz .LBB1_25
 ; GLOBALNESS1-NEXT:  .LBB1_16: ; %bb44.i
 ; GLOBALNESS1-NEXT:    ; Parent Loop BB1_4 Depth=1
 ; GLOBALNESS1-NEXT:    ; => This Inner Loop Header: Depth=2
@@ -273,7 +271,11 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
 ; GLOBALNESS1-NEXT:    ; in Loop: Header=BB1_16 Depth=2
 ; GLOBALNESS1-NEXT:    global_store_dwordx2 v[44:45], v[42:43], off
 ; GLOBALNESS1-NEXT:    s_branch .LBB1_14
-; GLOBALNESS1-NEXT:  .LBB1_24: ; %Flow23
+; GLOBALNESS1-NEXT:  .LBB1_24: ; in Loop: Header=BB1_4 Depth=1
+; GLOBALNESS1-NEXT:    ; implicit-def: $vgpr58_vgpr59
+; GLOBALNESS1-NEXT:    s_cbranch_execz .LBB1_4
+; GLOBALNESS1-NEXT:    s_branch .LBB1_29
+; GLOBALNESS1-NEXT:  .LBB1_25: ; %Flow23
 ; GLOBALNESS1-NEXT:    ; in Loop: Header=BB1_4 Depth=1
 ; GLOBALNESS1-NEXT:    s_load_dwordx4 s[4:7], s[38:39], 0x0
 ; GLOBALNESS1-NEXT:    v_pk_mov_b32 v[0:1], 0, 0
@@ -283,25 +285,21 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
 ; GLOBALNESS1-NEXT:    s_waitcnt lgkmcnt(0)
 ; GLOBALNESS1-NEXT:    s_mov_b32 s55, s7
 ; GLOBALNESS1-NEXT:    v_readlane_b32 s9, v57, 11
-; GLOBALNESS1-NEXT:  .LBB1_25: ; %Flow24
+; GLOBALNESS1-NEXT:  .LBB1_26: ; %Flow24
 ; GLOBALNESS1-NEXT:    ; in Loop: Header=BB1_4 Depth=1
 ; GLOBALNESS1-NEXT:    s_or_b64 exec, exec, s[52:53]
 ; GLOBALNESS1-NEXT:    s_and_saveexec_b64 s[4:5], s[86:87]
 ; GLOBALNESS1-NEXT:    s_cbranch_execz .LBB1_2
-; GLOBALNESS1-NEXT:  ; %bb.26: ; %bb67.i
+; GLOBALNESS1-NEXT:  ; %bb.27: ; %bb67.i
 ; GLOBALNESS1-NEXT:    ; in Loop: Header=BB1_4 Depth=1
 ; GLOBALNESS1-NEXT:    v_readlane_b32 s6, v57, 4
 ; GLOBALNESS1-NEXT:    v_readlane_b32 s7, v57, 5
 ; GLOBALNESS1-NEXT:    s_and_b64 vcc, exec, s[6:7]
 ; GLOBALNESS1-NEXT:    s_cbranch_vccnz .LBB1_1
-; GLOBALNESS1-NEXT:  ; %bb.27: ; %bb69.i
+; GLOBALNESS1-NEXT:  ; %bb.28: ; %bb69.i
 ; GLOBALNESS1-NEXT:    ; in Loop: Header=BB1_4 Depth=1
 ; GLOBALNESS1-NEXT:    global_store_dwordx2 v[44:45], v[42:43], off
 ; GLOBALNESS1-NEXT:    s_branch .LBB1_1
-; GLOBALNESS1-NEXT:  .LBB1_28: ; %bb73.i
-; GLOBALNESS1-NEXT:    ; in Loop: Header=BB1_4 Depth=1
-; GLOBALNESS1-NEXT:    global_store_dwordx2 v[44:45], v[42:43], off
-; GLOBALNESS1-NEXT:    s_branch .LBB1_2
 ; GLOBALNESS1-NEXT:  .LBB1_29: ; %loop.exit.guard
 ; GLOBALNESS1-NEXT:    s_andn2_b64 vcc, exec, s[8:9]
 ; GLOBALNESS1-NEXT:    s_mov_b64 s[4:5], -1
@@ -424,16 +422,16 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
 ; GLOBALNESS0-NEXT:    v_readlane_b32 s6, v57, 6
 ; GLOBALNESS0-NEXT:    v_readlane_b32 s7, v57, 7
 ; GLOBALNESS0-NEXT:    s_and_b64 vcc, exec, s[6:7]
-; GLOBALNESS0-NEXT:    s_cbranch_vccz .LBB1_28
+; GLOBALNESS0-NEXT:    s_cbranch_vccz .LBB1_3
 ; GLOBALNESS0-NEXT:  .LBB1_2: ; %Flow15
 ; GLOBALNESS0-NEXT:    ; in Loop: Header=BB1_4 Depth=1
 ; GLOBALNESS0-NEXT:    s_or_b64 exec, exec, s[4:5]
-; GLOBALNESS0-NEXT:    s_mov_b64 s[6:7], 0
-; GLOBALNESS0-NEXT:  .LBB1_3: ; %Flow28
-; GLOBALNESS0-NEXT:    ; in Loop: Header=BB1_4 Depth=1
-; GLOBALNESS0-NEXT:    s_and_b64 vcc, exec, s[6:7]
 ; GLOBALNESS0-NEXT:    v_pk_mov_b32 v[58:59], v[0:1], v[0:1] op_sel:[0,1]
-; GLOBALNESS0-NEXT:    s_cbranch_vccnz .LBB1_29
+; GLOBALNESS0-NEXT:    s_branch .LBB1_4
+; GLOBALNESS0-NEXT:  .LBB1_3: ; %bb73.i
+; GLOBALNESS0-NEXT:    ; in Loop: Header=BB1_4 Depth=1
+; GLOBALNESS0-NEXT:    global_store_dwordx2 v[44:45], v[42:43], off
+; GLOBALNESS0-NEXT:    s_branch .LBB1_2
 ; GLOBALNESS0-NEXT:  .LBB1_4: ; %bb5
 ; GLOBALNESS0-NEXT:    ; =>This Loop Header: Depth=1
 ; GLOBALNESS0-NEXT:    ; Child Loop BB1_16 Depth 2
@@ -482,10 +480,8 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
 ; GLOBALNESS0-NEXT:    s_cselect_b64 s[4:5], -1, 0
 ; GLOBALNESS0-NEXT:  .LBB1_9: ; %Flow25
 ; GLOBALNESS0-NEXT:    ; in Loop: Header=BB1_4 Depth=1
-; GLOBALNESS0-NEXT:    s_mov_b64 s[6:7], -1
 ; GLOBALNESS0-NEXT:    s_and_b64 vcc, exec, s[4:5]
-; GLOBALNESS0-NEXT:    ; implicit-def: $vgpr0_vgpr1
-; GLOBALNESS0-NEXT:    s_cbranch_vccz .LBB1_3
+; GLOBALNESS0-NEXT:    s_cbranch_vccz .LBB1_24
 ; GLOBALNESS0-NEXT:  ; %bb.10: ; %baz.exit.i
 ; GLOBALNESS0-NEXT:    ; in Loop: Header=BB1_4 Depth=1
 ; GLOBALNESS0-NEXT:    flat_load_dword v0, v[44:45]
@@ -494,7 +490,7 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
 ; GLOBALNESS0-NEXT:    v_mov_b32_e32 v0, 0
 ; GLOBALNESS0-NEXT:    v_mov_b32_e32 v1, 0x3ff00000
 ; GLOBALNESS0-NEXT:    s_and_saveexec_b64 s[52:53], s[86:87]
-; GLOBALNESS0-NEXT:    s_cbranch_execz .LBB1_25
+; GLOBALNESS0-NEXT:    s_cbranch_execz .LBB1_26
 ; GLOBALNESS0-NEXT:  ; %bb.11: ; %bb33.i
 ; GLOBALNESS0-NEXT:    ; in Loop: Header=BB1_4 Depth=1
 ; GLOBALNESS0-NEXT:    global_load_dwordx2 v[0:1], v[44:45], off
@@ -524,7 +520,7 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
 ; GLOBALNESS0-NEXT:  .LBB1_15: ; %bb63.i
 ; GLOBALNESS0-NEXT:    ; in Loop: Header=BB1_16 Depth=2
 ; GLOBALNESS0-NEXT:    s_and_b64 vcc, exec, s[68:69]
-; GLOBALNESS0-NEXT:    s_cbranch_vccz .LBB1_24
+; GLOBALNESS0-NEXT:    s_cbranch_vccz .LBB1_25
 ; GLOBALNESS0-NEXT:  .LBB1_16: ; %bb44.i
 ; GLOBALNESS0-NEXT:    ; Parent Loop BB1_4 Depth=1
 ; GLOBALNESS0-NEXT:    ; => This Inner Loop Header: Depth=2
@@ -585,7 +581,11 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
 ; GLOBALNESS0-NEXT:    ; in Loop: Header=BB1_16 Depth=2
 ; GLOBALNESS0-NEXT:    global_store_dwordx2 v[44:45], v[42:43], off
 ; GLOBALNESS0-NEXT:    s_branch .LBB1_14
-; GLOBALNESS0-NEXT:  .LBB1_24: ; %Flow23
+; GLOBALNESS0-NEXT:  .LBB1_24: ; in Loop: Header=BB1_4 Depth=1
+; GLOBALNESS0-NEXT:    ; implicit-def: $vgpr58_vgpr59
+; GLOBALNESS0-NEXT:    s_cbranch_execz .LBB1_4
+; GLOBALNESS0-NEXT:    s_branch .LBB1_29
+; GLOBALNESS0-NEXT:  .LBB1_25: ; %Flow23
 ; GLOBALNESS0-NEXT:    ; in Loop: Header=BB1_4 Depth=1
 ; GLOBALNESS0-NEXT:    v_pk_mov_b32 v[0:1], 0, 0
 ; GLOBALNESS0-NEXT:    s_mov_b32 s55, s83
@@ -593,25 +593,21 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
 ; GLOBALNESS0-NEXT:    v_readlane_b32 s85, v57, 9
 ; GLOBALNESS0-NEXT:    v_readlane_b32 s8, v57, 10
 ; GLOBALNESS0-NEXT:    v_readlane_b32 s9, v57, 11
-; GLOBALNESS0-NEXT:  .LBB1_25: ; %Flow24
+; GLOBALNESS0-NEXT:  .LBB1_26: ; %Flow24
 ; GLOBALNESS0-NEXT:    ; in Loop: Header=BB1_4 Depth=1
 ; GLOBALNESS0-NEXT:    s_or_b64 exec, exec, s[52:53]
 ; GLOBALNESS0-NEXT:    s_and_saveexec_b64 s[4:5], s[86:87]
 ; GLOBALNESS0-NEXT:    s_cbranch_execz .LBB1_2
-; GLOBALNESS0-NEXT:  ; %bb.26: ; %bb67.i
+; GLOBALNESS0-NEXT:  ; %bb.27: ; %bb67.i
 ; GLOBALNESS0-NEXT:    ; in Loop: Header=BB1_4 Depth=1
 ; GLOBALNESS0-NEXT:    v_readlane_b32 s6, v57, 4
 ; GLOBALNESS0-NEXT:    v_readlane_b32 s7, v57, 5
 ; GLOBALNESS0-NEXT:    s_and_b64 vcc, exec, s[6:7]
 ; GLOBALNESS0-NEXT:    s_cbranch_vccnz .LBB1_1
-; GLOBALNESS0-NEXT:  ; %bb.27: ; %bb69.i
+; GLOBALNESS0-NEXT:  ; %bb.28: ; %bb69.i
 ; GLOBALNESS0-NEXT:    ; in Loop: Header=BB1_4 Depth=1
 ; GLOBALNESS0-NEXT:    global_store_dwordx2 v[44:45], v[42:43], off
 ; GLOBALNESS0-NEXT:    s_branch .LBB1_1
-; GLOBALNESS0-NEXT:  .LBB1_28: ; %bb73.i
-; GLOBALNESS0-NEXT:    ; in Loop: Header=BB1_4 Depth=1
-; GLOBALNESS0-NEXT:    global_store_dwordx2 v[44:45], v[42:43], off
-; GLOBALNESS0-NEXT:    s_branch .LBB1_2
 ; GLOBALNESS0-NEXT:  .LBB1_29: ; %loop.exit.guard
 ; GLOBALNESS0-NEXT:    s_andn2_b64 vcc, exec, s[8:9]
 ; GLOBALNESS0-NEXT:    s_mov_b64 s[4:5], -1


        


More information about the llvm-commits mailing list