[llvm] a779af3 - [AMDGPU] Change SGPR layout to striped caller/callee saved (#127353)

via llvm-commits llvm-commits at lists.llvm.org
Sat Mar 8 06:28:28 PST 2025


Author: Shilei Tian
Date: 2025-03-08T09:28:20-05:00
New Revision: a779af3f882e1500ee6b9cb973e60c10ea0819b4

URL: https://github.com/llvm/llvm-project/commit/a779af3f882e1500ee6b9cb973e60c10ea0819b4
DIFF: https://github.com/llvm/llvm-project/commit/a779af3f882e1500ee6b9cb973e60c10ea0819b4.diff

LOG: [AMDGPU] Change SGPR layout to striped caller/callee saved (#127353)

This PR updates the SGPR layout to a striped caller/callee-saved design,
similar
to the VGPR layout.

To ensure that s30-s31 (return address), s32 (stack pointer), s33 (frame
pointer), and s34 (base pointer) remain callee-saved, the striped layout
starts
from s40, with a stripe width of 8. The last stripe is 10 wide instead
of 8 to
avoid ending with a 2-wide stripe.

Fixes #113782.

Added: 
    

Modified: 
    llvm/lib/Target/AMDGPU/AMDGPUCallingConv.td
    llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-pow-codegen.ll
    llvm/test/CodeGen/AMDGPU/bf16.ll
    llvm/test/CodeGen/AMDGPU/blender-no-live-segment-at-def-implicit-def.ll
    llvm/test/CodeGen/AMDGPU/branch-folding-implicit-def-subreg.ll
    llvm/test/CodeGen/AMDGPU/branch-relax-spill.ll
    llvm/test/CodeGen/AMDGPU/call-args-inreg-no-sgpr-for-csrspill-xfail.ll
    llvm/test/CodeGen/AMDGPU/call-args-inreg.ll
    llvm/test/CodeGen/AMDGPU/call-argument-types.ll
    llvm/test/CodeGen/AMDGPU/call-preserved-registers.ll
    llvm/test/CodeGen/AMDGPU/callee-frame-setup.ll
    llvm/test/CodeGen/AMDGPU/csr-sgpr-spill-live-ins.mir
    llvm/test/CodeGen/AMDGPU/ds_read2.ll
    llvm/test/CodeGen/AMDGPU/dwarf-multi-register-use-crash.ll
    llvm/test/CodeGen/AMDGPU/eliminate-frame-index-s-mov-b32.mir
    llvm/test/CodeGen/AMDGPU/function-args-inreg.ll
    llvm/test/CodeGen/AMDGPU/function-resource-usage.ll
    llvm/test/CodeGen/AMDGPU/gfx-call-non-gfx-func.ll
    llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll
    llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll
    llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll
    llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll
    llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll
    llvm/test/CodeGen/AMDGPU/greedy-alloc-fail-sgpr1024-spill.mir
    llvm/test/CodeGen/AMDGPU/identical-subrange-spill-infloop.ll
    llvm/test/CodeGen/AMDGPU/indirect-call.ll
    llvm/test/CodeGen/AMDGPU/issue48473.mir
    llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.ll
    llvm/test/CodeGen/AMDGPU/llvm.maximum.f32.ll
    llvm/test/CodeGen/AMDGPU/llvm.maximum.f64.ll
    llvm/test/CodeGen/AMDGPU/llvm.minimum.f32.ll
    llvm/test/CodeGen/AMDGPU/llvm.minimum.f64.ll
    llvm/test/CodeGen/AMDGPU/machine-sink-temporal-divergence-swdev407790.ll
    llvm/test/CodeGen/AMDGPU/materialize-frame-index-sgpr.gfx10.ll
    llvm/test/CodeGen/AMDGPU/materialize-frame-index-sgpr.ll
    llvm/test/CodeGen/AMDGPU/mcexpr-knownbits-assign-crash-gh-issue-110930.ll
    llvm/test/CodeGen/AMDGPU/pei-scavenge-sgpr-carry-out.mir
    llvm/test/CodeGen/AMDGPU/pei-scavenge-sgpr-gfx9.mir
    llvm/test/CodeGen/AMDGPU/pei-scavenge-sgpr.mir
    llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll
    llvm/test/CodeGen/AMDGPU/ran-out-of-sgprs-allocation-failure.mir
    llvm/test/CodeGen/AMDGPU/schedule-amdgpu-tracker-physreg.ll
    llvm/test/CodeGen/AMDGPU/select.f16.ll
    llvm/test/CodeGen/AMDGPU/shufflevector.v2i64.v8i64.ll
    llvm/test/CodeGen/AMDGPU/sibling-call.ll
    llvm/test/CodeGen/AMDGPU/snippet-copy-bundle-regression.mir
    llvm/test/CodeGen/AMDGPU/spill-sgpr-to-virtual-vgpr.mir
    llvm/test/CodeGen/AMDGPU/spill-sgpr-used-for-exec-copy.mir
    llvm/test/CodeGen/AMDGPU/spill_more_than_wavesize_csr_sgprs.ll
    llvm/test/CodeGen/AMDGPU/splitkit-copy-bundle.mir
    llvm/test/CodeGen/AMDGPU/stack-realign.ll
    llvm/test/CodeGen/AMDGPU/tuple-allocation-failure.ll
    llvm/test/CodeGen/AMDGPU/unallocatable-bundle-regression.mir
    llvm/test/CodeGen/AMDGPU/unstructured-cfg-def-use-issue.ll
    llvm/test/CodeGen/AMDGPU/use_restore_frame_reg.mir
    llvm/test/CodeGen/AMDGPU/vgpr-large-tuple-alloc-error.ll
    llvm/test/CodeGen/MIR/AMDGPU/spill-phys-vgprs.mir

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Target/AMDGPU/AMDGPUCallingConv.td b/llvm/lib/Target/AMDGPU/AMDGPUCallingConv.td
index 80969fce3d77f..e891fdba4e03e 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCallingConv.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCallingConv.td
@@ -91,7 +91,15 @@ def CSR_AMDGPU_AGPRs : CalleeSavedRegs<
 >;
 
 def CSR_AMDGPU_SGPRs : CalleeSavedRegs<
-  (sequence "SGPR%u", 30, 105)
+  // Ensure that s30-s31 (return address), s32 (stack pointer), s33 (frame pointer),
+  // and s34 (base pointer) are callee-saved. The striped layout starts from s40,
+  // with a stripe width of 8. The last stripe is 10 wide instead of 8, to avoid
+  // ending with a 2-wide stripe.
+  (add (sequence "SGPR%u", 30, 39),
+       (sequence "SGPR%u", 48, 55),
+       (sequence "SGPR%u", 64, 71),
+       (sequence "SGPR%u", 80, 87),
+       (sequence "SGPR%u", 96, 105))
 >;
 
 def CSR_AMDGPU_SI_Gfx_SGPRs : CalleeSavedRegs<

diff  --git a/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-pow-codegen.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-pow-codegen.ll
index 54b4888120e5f..5bda853b76727 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-pow-codegen.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-pow-codegen.ll
@@ -127,29 +127,29 @@ define double @test_pow_fast_f64__integral_y(double %x, i32 %y.i) {
 ; CHECK-NEXT:    v_writelane_b32 v43, s38, 6
 ; CHECK-NEXT:    v_writelane_b32 v43, s39, 7
 ; CHECK-NEXT:    s_addk_i32 s32, 0x800
-; CHECK-NEXT:    v_writelane_b32 v43, s40, 8
-; CHECK-NEXT:    v_writelane_b32 v43, s41, 9
-; CHECK-NEXT:    s_mov_b64 s[40:41], s[4:5]
+; CHECK-NEXT:    v_writelane_b32 v43, s48, 8
+; CHECK-NEXT:    v_writelane_b32 v43, s49, 9
+; CHECK-NEXT:    s_mov_b64 s[48:49], s[4:5]
 ; CHECK-NEXT:    s_getpc_b64 s[4:5]
 ; CHECK-NEXT:    s_add_u32 s4, s4, _Z4log2d at gotpcrel32@lo+4
 ; CHECK-NEXT:    s_addc_u32 s5, s5, _Z4log2d at gotpcrel32@hi+12
 ; CHECK-NEXT:    s_load_dwordx2 s[16:17], s[4:5], 0x0
-; CHECK-NEXT:    v_writelane_b32 v43, s42, 10
+; CHECK-NEXT:    v_writelane_b32 v43, s50, 10
 ; CHECK-NEXT:    buffer_store_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
 ; CHECK-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
 ; CHECK-NEXT:    buffer_store_dword v42, off, s[0:3], s33 ; 4-byte Folded Spill
-; CHECK-NEXT:    v_writelane_b32 v43, s43, 11
+; CHECK-NEXT:    v_writelane_b32 v43, s51, 11
 ; CHECK-NEXT:    v_mov_b32_e32 v42, v1
-; CHECK-NEXT:    v_writelane_b32 v43, s44, 12
+; CHECK-NEXT:    v_writelane_b32 v43, s52, 12
 ; CHECK-NEXT:    v_and_b32_e32 v1, 0x7fffffff, v42
-; CHECK-NEXT:    s_mov_b64 s[4:5], s[40:41]
-; CHECK-NEXT:    v_writelane_b32 v43, s45, 13
+; CHECK-NEXT:    s_mov_b64 s[4:5], s[48:49]
+; CHECK-NEXT:    v_writelane_b32 v43, s53, 13
 ; CHECK-NEXT:    v_mov_b32_e32 v40, v31
 ; CHECK-NEXT:    v_mov_b32_e32 v41, v2
-; CHECK-NEXT:    s_mov_b32 s42, s15
-; CHECK-NEXT:    s_mov_b32 s43, s14
-; CHECK-NEXT:    s_mov_b32 s44, s13
-; CHECK-NEXT:    s_mov_b32 s45, s12
+; CHECK-NEXT:    s_mov_b32 s50, s15
+; CHECK-NEXT:    s_mov_b32 s51, s14
+; CHECK-NEXT:    s_mov_b32 s52, s13
+; CHECK-NEXT:    s_mov_b32 s53, s12
 ; CHECK-NEXT:    s_mov_b64 s[34:35], s[10:11]
 ; CHECK-NEXT:    s_mov_b64 s[36:37], s[8:9]
 ; CHECK-NEXT:    s_mov_b64 s[38:39], s[6:7]
@@ -160,15 +160,15 @@ define double @test_pow_fast_f64__integral_y(double %x, i32 %y.i) {
 ; CHECK-NEXT:    s_add_u32 s4, s4, _Z4exp2d at gotpcrel32@lo+4
 ; CHECK-NEXT:    s_addc_u32 s5, s5, _Z4exp2d at gotpcrel32@hi+12
 ; CHECK-NEXT:    s_load_dwordx2 s[16:17], s[4:5], 0x0
-; CHECK-NEXT:    s_mov_b64 s[4:5], s[40:41]
+; CHECK-NEXT:    s_mov_b64 s[4:5], s[48:49]
 ; CHECK-NEXT:    v_mul_f64 v[0:1], v[0:1], v[2:3]
 ; CHECK-NEXT:    s_mov_b64 s[6:7], s[38:39]
 ; CHECK-NEXT:    s_mov_b64 s[8:9], s[36:37]
 ; CHECK-NEXT:    s_mov_b64 s[10:11], s[34:35]
-; CHECK-NEXT:    s_mov_b32 s12, s45
-; CHECK-NEXT:    s_mov_b32 s13, s44
-; CHECK-NEXT:    s_mov_b32 s14, s43
-; CHECK-NEXT:    s_mov_b32 s15, s42
+; CHECK-NEXT:    s_mov_b32 s12, s53
+; CHECK-NEXT:    s_mov_b32 s13, s52
+; CHECK-NEXT:    s_mov_b32 s14, s51
+; CHECK-NEXT:    s_mov_b32 s15, s50
 ; CHECK-NEXT:    v_mov_b32_e32 v31, v40
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_swappc_b64 s[30:31], s[16:17]
@@ -178,12 +178,12 @@ define double @test_pow_fast_f64__integral_y(double %x, i32 %y.i) {
 ; CHECK-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
 ; CHECK-NEXT:    buffer_load_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
 ; CHECK-NEXT:    v_or_b32_e32 v1, v2, v1
-; CHECK-NEXT:    v_readlane_b32 s45, v43, 13
-; CHECK-NEXT:    v_readlane_b32 s44, v43, 12
-; CHECK-NEXT:    v_readlane_b32 s43, v43, 11
-; CHECK-NEXT:    v_readlane_b32 s42, v43, 10
-; CHECK-NEXT:    v_readlane_b32 s41, v43, 9
-; CHECK-NEXT:    v_readlane_b32 s40, v43, 8
+; CHECK-NEXT:    v_readlane_b32 s53, v43, 13
+; CHECK-NEXT:    v_readlane_b32 s52, v43, 12
+; CHECK-NEXT:    v_readlane_b32 s51, v43, 11
+; CHECK-NEXT:    v_readlane_b32 s50, v43, 10
+; CHECK-NEXT:    v_readlane_b32 s49, v43, 9
+; CHECK-NEXT:    v_readlane_b32 s48, v43, 8
 ; CHECK-NEXT:    v_readlane_b32 s39, v43, 7
 ; CHECK-NEXT:    v_readlane_b32 s38, v43, 6
 ; CHECK-NEXT:    v_readlane_b32 s37, v43, 5
@@ -267,28 +267,28 @@ define double @test_powr_fast_f64(double %x, double %y) {
 ; CHECK-NEXT:    v_writelane_b32 v43, s38, 6
 ; CHECK-NEXT:    v_writelane_b32 v43, s39, 7
 ; CHECK-NEXT:    s_addk_i32 s32, 0x800
-; CHECK-NEXT:    v_writelane_b32 v43, s40, 8
-; CHECK-NEXT:    v_writelane_b32 v43, s41, 9
-; CHECK-NEXT:    s_mov_b64 s[40:41], s[4:5]
+; CHECK-NEXT:    v_writelane_b32 v43, s48, 8
+; CHECK-NEXT:    v_writelane_b32 v43, s49, 9
+; CHECK-NEXT:    s_mov_b64 s[48:49], s[4:5]
 ; CHECK-NEXT:    s_getpc_b64 s[4:5]
 ; CHECK-NEXT:    s_add_u32 s4, s4, _Z4log2d at gotpcrel32@lo+4
 ; CHECK-NEXT:    s_addc_u32 s5, s5, _Z4log2d at gotpcrel32@hi+12
 ; CHECK-NEXT:    s_load_dwordx2 s[16:17], s[4:5], 0x0
-; CHECK-NEXT:    v_writelane_b32 v43, s42, 10
-; CHECK-NEXT:    v_writelane_b32 v43, s43, 11
-; CHECK-NEXT:    v_writelane_b32 v43, s44, 12
-; CHECK-NEXT:    s_mov_b64 s[4:5], s[40:41]
+; CHECK-NEXT:    v_writelane_b32 v43, s50, 10
+; CHECK-NEXT:    v_writelane_b32 v43, s51, 11
+; CHECK-NEXT:    v_writelane_b32 v43, s52, 12
+; CHECK-NEXT:    s_mov_b64 s[4:5], s[48:49]
 ; CHECK-NEXT:    buffer_store_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
 ; CHECK-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
 ; CHECK-NEXT:    buffer_store_dword v42, off, s[0:3], s33 ; 4-byte Folded Spill
-; CHECK-NEXT:    v_writelane_b32 v43, s45, 13
+; CHECK-NEXT:    v_writelane_b32 v43, s53, 13
 ; CHECK-NEXT:    v_mov_b32_e32 v42, v31
 ; CHECK-NEXT:    v_mov_b32_e32 v41, v3
 ; CHECK-NEXT:    v_mov_b32_e32 v40, v2
-; CHECK-NEXT:    s_mov_b32 s42, s15
-; CHECK-NEXT:    s_mov_b32 s43, s14
-; CHECK-NEXT:    s_mov_b32 s44, s13
-; CHECK-NEXT:    s_mov_b32 s45, s12
+; CHECK-NEXT:    s_mov_b32 s50, s15
+; CHECK-NEXT:    s_mov_b32 s51, s14
+; CHECK-NEXT:    s_mov_b32 s52, s13
+; CHECK-NEXT:    s_mov_b32 s53, s12
 ; CHECK-NEXT:    s_mov_b64 s[34:35], s[10:11]
 ; CHECK-NEXT:    s_mov_b64 s[36:37], s[8:9]
 ; CHECK-NEXT:    s_mov_b64 s[38:39], s[6:7]
@@ -299,26 +299,26 @@ define double @test_powr_fast_f64(double %x, double %y) {
 ; CHECK-NEXT:    s_add_u32 s4, s4, _Z4exp2d at gotpcrel32@lo+4
 ; CHECK-NEXT:    s_addc_u32 s5, s5, _Z4exp2d at gotpcrel32@hi+12
 ; CHECK-NEXT:    s_load_dwordx2 s[16:17], s[4:5], 0x0
-; CHECK-NEXT:    s_mov_b64 s[4:5], s[40:41]
+; CHECK-NEXT:    s_mov_b64 s[4:5], s[48:49]
 ; CHECK-NEXT:    s_mov_b64 s[6:7], s[38:39]
 ; CHECK-NEXT:    s_mov_b64 s[8:9], s[36:37]
 ; CHECK-NEXT:    s_mov_b64 s[10:11], s[34:35]
-; CHECK-NEXT:    s_mov_b32 s12, s45
-; CHECK-NEXT:    s_mov_b32 s13, s44
-; CHECK-NEXT:    s_mov_b32 s14, s43
-; CHECK-NEXT:    s_mov_b32 s15, s42
+; CHECK-NEXT:    s_mov_b32 s12, s53
+; CHECK-NEXT:    s_mov_b32 s13, s52
+; CHECK-NEXT:    s_mov_b32 s14, s51
+; CHECK-NEXT:    s_mov_b32 s15, s50
 ; CHECK-NEXT:    v_mov_b32_e32 v31, v42
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; CHECK-NEXT:    buffer_load_dword v42, off, s[0:3], s33 ; 4-byte Folded Reload
 ; CHECK-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
 ; CHECK-NEXT:    buffer_load_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
-; CHECK-NEXT:    v_readlane_b32 s45, v43, 13
-; CHECK-NEXT:    v_readlane_b32 s44, v43, 12
-; CHECK-NEXT:    v_readlane_b32 s43, v43, 11
-; CHECK-NEXT:    v_readlane_b32 s42, v43, 10
-; CHECK-NEXT:    v_readlane_b32 s41, v43, 9
-; CHECK-NEXT:    v_readlane_b32 s40, v43, 8
+; CHECK-NEXT:    v_readlane_b32 s53, v43, 13
+; CHECK-NEXT:    v_readlane_b32 s52, v43, 12
+; CHECK-NEXT:    v_readlane_b32 s51, v43, 11
+; CHECK-NEXT:    v_readlane_b32 s50, v43, 10
+; CHECK-NEXT:    v_readlane_b32 s49, v43, 9
+; CHECK-NEXT:    v_readlane_b32 s48, v43, 8
 ; CHECK-NEXT:    v_readlane_b32 s39, v43, 7
 ; CHECK-NEXT:    v_readlane_b32 s38, v43, 6
 ; CHECK-NEXT:    v_readlane_b32 s37, v43, 5
@@ -409,29 +409,29 @@ define double @test_pown_fast_f64(double %x, i32 %y) {
 ; CHECK-NEXT:    v_writelane_b32 v43, s38, 6
 ; CHECK-NEXT:    v_writelane_b32 v43, s39, 7
 ; CHECK-NEXT:    s_addk_i32 s32, 0x800
-; CHECK-NEXT:    v_writelane_b32 v43, s40, 8
-; CHECK-NEXT:    v_writelane_b32 v43, s41, 9
-; CHECK-NEXT:    s_mov_b64 s[40:41], s[4:5]
+; CHECK-NEXT:    v_writelane_b32 v43, s48, 8
+; CHECK-NEXT:    v_writelane_b32 v43, s49, 9
+; CHECK-NEXT:    s_mov_b64 s[48:49], s[4:5]
 ; CHECK-NEXT:    s_getpc_b64 s[4:5]
 ; CHECK-NEXT:    s_add_u32 s4, s4, _Z4log2d at gotpcrel32@lo+4
 ; CHECK-NEXT:    s_addc_u32 s5, s5, _Z4log2d at gotpcrel32@hi+12
 ; CHECK-NEXT:    s_load_dwordx2 s[16:17], s[4:5], 0x0
-; CHECK-NEXT:    v_writelane_b32 v43, s42, 10
+; CHECK-NEXT:    v_writelane_b32 v43, s50, 10
 ; CHECK-NEXT:    buffer_store_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
 ; CHECK-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
 ; CHECK-NEXT:    buffer_store_dword v42, off, s[0:3], s33 ; 4-byte Folded Spill
-; CHECK-NEXT:    v_writelane_b32 v43, s43, 11
+; CHECK-NEXT:    v_writelane_b32 v43, s51, 11
 ; CHECK-NEXT:    v_mov_b32_e32 v42, v1
-; CHECK-NEXT:    v_writelane_b32 v43, s44, 12
+; CHECK-NEXT:    v_writelane_b32 v43, s52, 12
 ; CHECK-NEXT:    v_and_b32_e32 v1, 0x7fffffff, v42
-; CHECK-NEXT:    s_mov_b64 s[4:5], s[40:41]
-; CHECK-NEXT:    v_writelane_b32 v43, s45, 13
+; CHECK-NEXT:    s_mov_b64 s[4:5], s[48:49]
+; CHECK-NEXT:    v_writelane_b32 v43, s53, 13
 ; CHECK-NEXT:    v_mov_b32_e32 v40, v31
 ; CHECK-NEXT:    v_mov_b32_e32 v41, v2
-; CHECK-NEXT:    s_mov_b32 s42, s15
-; CHECK-NEXT:    s_mov_b32 s43, s14
-; CHECK-NEXT:    s_mov_b32 s44, s13
-; CHECK-NEXT:    s_mov_b32 s45, s12
+; CHECK-NEXT:    s_mov_b32 s50, s15
+; CHECK-NEXT:    s_mov_b32 s51, s14
+; CHECK-NEXT:    s_mov_b32 s52, s13
+; CHECK-NEXT:    s_mov_b32 s53, s12
 ; CHECK-NEXT:    s_mov_b64 s[34:35], s[10:11]
 ; CHECK-NEXT:    s_mov_b64 s[36:37], s[8:9]
 ; CHECK-NEXT:    s_mov_b64 s[38:39], s[6:7]
@@ -442,15 +442,15 @@ define double @test_pown_fast_f64(double %x, i32 %y) {
 ; CHECK-NEXT:    s_add_u32 s4, s4, _Z4exp2d at gotpcrel32@lo+4
 ; CHECK-NEXT:    s_addc_u32 s5, s5, _Z4exp2d at gotpcrel32@hi+12
 ; CHECK-NEXT:    s_load_dwordx2 s[16:17], s[4:5], 0x0
-; CHECK-NEXT:    s_mov_b64 s[4:5], s[40:41]
+; CHECK-NEXT:    s_mov_b64 s[4:5], s[48:49]
 ; CHECK-NEXT:    v_mul_f64 v[0:1], v[0:1], v[2:3]
 ; CHECK-NEXT:    s_mov_b64 s[6:7], s[38:39]
 ; CHECK-NEXT:    s_mov_b64 s[8:9], s[36:37]
 ; CHECK-NEXT:    s_mov_b64 s[10:11], s[34:35]
-; CHECK-NEXT:    s_mov_b32 s12, s45
-; CHECK-NEXT:    s_mov_b32 s13, s44
-; CHECK-NEXT:    s_mov_b32 s14, s43
-; CHECK-NEXT:    s_mov_b32 s15, s42
+; CHECK-NEXT:    s_mov_b32 s12, s53
+; CHECK-NEXT:    s_mov_b32 s13, s52
+; CHECK-NEXT:    s_mov_b32 s14, s51
+; CHECK-NEXT:    s_mov_b32 s15, s50
 ; CHECK-NEXT:    v_mov_b32_e32 v31, v40
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_swappc_b64 s[30:31], s[16:17]
@@ -460,12 +460,12 @@ define double @test_pown_fast_f64(double %x, i32 %y) {
 ; CHECK-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
 ; CHECK-NEXT:    buffer_load_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
 ; CHECK-NEXT:    v_or_b32_e32 v1, v2, v1
-; CHECK-NEXT:    v_readlane_b32 s45, v43, 13
-; CHECK-NEXT:    v_readlane_b32 s44, v43, 12
-; CHECK-NEXT:    v_readlane_b32 s43, v43, 11
-; CHECK-NEXT:    v_readlane_b32 s42, v43, 10
-; CHECK-NEXT:    v_readlane_b32 s41, v43, 9
-; CHECK-NEXT:    v_readlane_b32 s40, v43, 8
+; CHECK-NEXT:    v_readlane_b32 s53, v43, 13
+; CHECK-NEXT:    v_readlane_b32 s52, v43, 12
+; CHECK-NEXT:    v_readlane_b32 s51, v43, 11
+; CHECK-NEXT:    v_readlane_b32 s50, v43, 10
+; CHECK-NEXT:    v_readlane_b32 s49, v43, 9
+; CHECK-NEXT:    v_readlane_b32 s48, v43, 8
 ; CHECK-NEXT:    v_readlane_b32 s39, v43, 7
 ; CHECK-NEXT:    v_readlane_b32 s38, v43, 6
 ; CHECK-NEXT:    v_readlane_b32 s37, v43, 5
@@ -551,26 +551,26 @@ define double @test_pown_fast_f64_known_even(double %x, i32 %y.arg) {
 ; CHECK-NEXT:    v_writelane_b32 v42, s38, 6
 ; CHECK-NEXT:    v_writelane_b32 v42, s39, 7
 ; CHECK-NEXT:    s_addk_i32 s32, 0x400
-; CHECK-NEXT:    v_writelane_b32 v42, s40, 8
-; CHECK-NEXT:    v_writelane_b32 v42, s41, 9
-; CHECK-NEXT:    s_mov_b64 s[40:41], s[4:5]
+; CHECK-NEXT:    v_writelane_b32 v42, s48, 8
+; CHECK-NEXT:    v_writelane_b32 v42, s49, 9
+; CHECK-NEXT:    s_mov_b64 s[48:49], s[4:5]
 ; CHECK-NEXT:    s_getpc_b64 s[4:5]
 ; CHECK-NEXT:    s_add_u32 s4, s4, _Z4log2d at gotpcrel32@lo+4
 ; CHECK-NEXT:    s_addc_u32 s5, s5, _Z4log2d at gotpcrel32@hi+12
 ; CHECK-NEXT:    s_load_dwordx2 s[16:17], s[4:5], 0x0
-; CHECK-NEXT:    v_writelane_b32 v42, s42, 10
-; CHECK-NEXT:    v_writelane_b32 v42, s43, 11
-; CHECK-NEXT:    v_writelane_b32 v42, s44, 12
+; CHECK-NEXT:    v_writelane_b32 v42, s50, 10
+; CHECK-NEXT:    v_writelane_b32 v42, s51, 11
+; CHECK-NEXT:    v_writelane_b32 v42, s52, 12
 ; CHECK-NEXT:    v_and_b32_e32 v1, 0x7fffffff, v1
-; CHECK-NEXT:    s_mov_b64 s[4:5], s[40:41]
+; CHECK-NEXT:    s_mov_b64 s[4:5], s[48:49]
 ; CHECK-NEXT:    buffer_store_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
 ; CHECK-NEXT:    buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill
-; CHECK-NEXT:    v_writelane_b32 v42, s45, 13
+; CHECK-NEXT:    v_writelane_b32 v42, s53, 13
 ; CHECK-NEXT:    v_mov_b32_e32 v40, v31
-; CHECK-NEXT:    s_mov_b32 s42, s15
-; CHECK-NEXT:    s_mov_b32 s43, s14
-; CHECK-NEXT:    s_mov_b32 s44, s13
-; CHECK-NEXT:    s_mov_b32 s45, s12
+; CHECK-NEXT:    s_mov_b32 s50, s15
+; CHECK-NEXT:    s_mov_b32 s51, s14
+; CHECK-NEXT:    s_mov_b32 s52, s13
+; CHECK-NEXT:    s_mov_b32 s53, s12
 ; CHECK-NEXT:    s_mov_b64 s[34:35], s[10:11]
 ; CHECK-NEXT:    s_mov_b64 s[36:37], s[8:9]
 ; CHECK-NEXT:    s_mov_b64 s[38:39], s[6:7]
@@ -582,26 +582,26 @@ define double @test_pown_fast_f64_known_even(double %x, i32 %y.arg) {
 ; CHECK-NEXT:    s_add_u32 s4, s4, _Z4exp2d at gotpcrel32@lo+4
 ; CHECK-NEXT:    s_addc_u32 s5, s5, _Z4exp2d at gotpcrel32@hi+12
 ; CHECK-NEXT:    s_load_dwordx2 s[16:17], s[4:5], 0x0
-; CHECK-NEXT:    s_mov_b64 s[4:5], s[40:41]
+; CHECK-NEXT:    s_mov_b64 s[4:5], s[48:49]
 ; CHECK-NEXT:    v_mul_f64 v[0:1], v[0:1], v[2:3]
 ; CHECK-NEXT:    s_mov_b64 s[6:7], s[38:39]
 ; CHECK-NEXT:    s_mov_b64 s[8:9], s[36:37]
 ; CHECK-NEXT:    s_mov_b64 s[10:11], s[34:35]
-; CHECK-NEXT:    s_mov_b32 s12, s45
-; CHECK-NEXT:    s_mov_b32 s13, s44
-; CHECK-NEXT:    s_mov_b32 s14, s43
-; CHECK-NEXT:    s_mov_b32 s15, s42
+; CHECK-NEXT:    s_mov_b32 s12, s53
+; CHECK-NEXT:    s_mov_b32 s13, s52
+; CHECK-NEXT:    s_mov_b32 s14, s51
+; CHECK-NEXT:    s_mov_b32 s15, s50
 ; CHECK-NEXT:    v_mov_b32_e32 v31, v40
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; CHECK-NEXT:    buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload
 ; CHECK-NEXT:    buffer_load_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
-; CHECK-NEXT:    v_readlane_b32 s45, v42, 13
-; CHECK-NEXT:    v_readlane_b32 s44, v42, 12
-; CHECK-NEXT:    v_readlane_b32 s43, v42, 11
-; CHECK-NEXT:    v_readlane_b32 s42, v42, 10
-; CHECK-NEXT:    v_readlane_b32 s41, v42, 9
-; CHECK-NEXT:    v_readlane_b32 s40, v42, 8
+; CHECK-NEXT:    v_readlane_b32 s53, v42, 13
+; CHECK-NEXT:    v_readlane_b32 s52, v42, 12
+; CHECK-NEXT:    v_readlane_b32 s51, v42, 11
+; CHECK-NEXT:    v_readlane_b32 s50, v42, 10
+; CHECK-NEXT:    v_readlane_b32 s49, v42, 9
+; CHECK-NEXT:    v_readlane_b32 s48, v42, 8
 ; CHECK-NEXT:    v_readlane_b32 s39, v42, 7
 ; CHECK-NEXT:    v_readlane_b32 s38, v42, 6
 ; CHECK-NEXT:    v_readlane_b32 s37, v42, 5
@@ -692,28 +692,28 @@ define double @test_pown_fast_f64_known_odd(double %x, i32 %y.arg) {
 ; CHECK-NEXT:    v_writelane_b32 v43, s38, 6
 ; CHECK-NEXT:    v_writelane_b32 v43, s39, 7
 ; CHECK-NEXT:    s_addk_i32 s32, 0x800
-; CHECK-NEXT:    v_writelane_b32 v43, s40, 8
-; CHECK-NEXT:    v_writelane_b32 v43, s41, 9
-; CHECK-NEXT:    s_mov_b64 s[40:41], s[4:5]
+; CHECK-NEXT:    v_writelane_b32 v43, s48, 8
+; CHECK-NEXT:    v_writelane_b32 v43, s49, 9
+; CHECK-NEXT:    s_mov_b64 s[48:49], s[4:5]
 ; CHECK-NEXT:    s_getpc_b64 s[4:5]
 ; CHECK-NEXT:    s_add_u32 s4, s4, _Z4log2d at gotpcrel32@lo+4
 ; CHECK-NEXT:    s_addc_u32 s5, s5, _Z4log2d at gotpcrel32@hi+12
 ; CHECK-NEXT:    s_load_dwordx2 s[16:17], s[4:5], 0x0
-; CHECK-NEXT:    v_writelane_b32 v43, s42, 10
+; CHECK-NEXT:    v_writelane_b32 v43, s50, 10
 ; CHECK-NEXT:    buffer_store_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
 ; CHECK-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
 ; CHECK-NEXT:    buffer_store_dword v42, off, s[0:3], s33 ; 4-byte Folded Spill
-; CHECK-NEXT:    v_writelane_b32 v43, s43, 11
+; CHECK-NEXT:    v_writelane_b32 v43, s51, 11
 ; CHECK-NEXT:    v_mov_b32_e32 v41, v1
-; CHECK-NEXT:    v_writelane_b32 v43, s44, 12
+; CHECK-NEXT:    v_writelane_b32 v43, s52, 12
 ; CHECK-NEXT:    v_and_b32_e32 v1, 0x7fffffff, v41
-; CHECK-NEXT:    s_mov_b64 s[4:5], s[40:41]
-; CHECK-NEXT:    v_writelane_b32 v43, s45, 13
+; CHECK-NEXT:    s_mov_b64 s[4:5], s[48:49]
+; CHECK-NEXT:    v_writelane_b32 v43, s53, 13
 ; CHECK-NEXT:    v_mov_b32_e32 v40, v31
-; CHECK-NEXT:    s_mov_b32 s42, s15
-; CHECK-NEXT:    s_mov_b32 s43, s14
-; CHECK-NEXT:    s_mov_b32 s44, s13
-; CHECK-NEXT:    s_mov_b32 s45, s12
+; CHECK-NEXT:    s_mov_b32 s50, s15
+; CHECK-NEXT:    s_mov_b32 s51, s14
+; CHECK-NEXT:    s_mov_b32 s52, s13
+; CHECK-NEXT:    s_mov_b32 s53, s12
 ; CHECK-NEXT:    s_mov_b64 s[34:35], s[10:11]
 ; CHECK-NEXT:    s_mov_b64 s[36:37], s[8:9]
 ; CHECK-NEXT:    s_mov_b64 s[38:39], s[6:7]
@@ -725,15 +725,15 @@ define double @test_pown_fast_f64_known_odd(double %x, i32 %y.arg) {
 ; CHECK-NEXT:    s_add_u32 s4, s4, _Z4exp2d at gotpcrel32@lo+4
 ; CHECK-NEXT:    s_addc_u32 s5, s5, _Z4exp2d at gotpcrel32@hi+12
 ; CHECK-NEXT:    s_load_dwordx2 s[16:17], s[4:5], 0x0
-; CHECK-NEXT:    s_mov_b64 s[4:5], s[40:41]
+; CHECK-NEXT:    s_mov_b64 s[4:5], s[48:49]
 ; CHECK-NEXT:    v_mul_f64 v[0:1], v[0:1], v[2:3]
 ; CHECK-NEXT:    s_mov_b64 s[6:7], s[38:39]
 ; CHECK-NEXT:    s_mov_b64 s[8:9], s[36:37]
 ; CHECK-NEXT:    s_mov_b64 s[10:11], s[34:35]
-; CHECK-NEXT:    s_mov_b32 s12, s45
-; CHECK-NEXT:    s_mov_b32 s13, s44
-; CHECK-NEXT:    s_mov_b32 s14, s43
-; CHECK-NEXT:    s_mov_b32 s15, s42
+; CHECK-NEXT:    s_mov_b32 s12, s53
+; CHECK-NEXT:    s_mov_b32 s13, s52
+; CHECK-NEXT:    s_mov_b32 s14, s51
+; CHECK-NEXT:    s_mov_b32 s15, s50
 ; CHECK-NEXT:    v_mov_b32_e32 v31, v40
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_swappc_b64 s[30:31], s[16:17]
@@ -742,12 +742,12 @@ define double @test_pown_fast_f64_known_odd(double %x, i32 %y.arg) {
 ; CHECK-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
 ; CHECK-NEXT:    buffer_load_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
 ; CHECK-NEXT:    v_or_b32_e32 v1, v2, v1
-; CHECK-NEXT:    v_readlane_b32 s45, v43, 13
-; CHECK-NEXT:    v_readlane_b32 s44, v43, 12
-; CHECK-NEXT:    v_readlane_b32 s43, v43, 11
-; CHECK-NEXT:    v_readlane_b32 s42, v43, 10
-; CHECK-NEXT:    v_readlane_b32 s41, v43, 9
-; CHECK-NEXT:    v_readlane_b32 s40, v43, 8
+; CHECK-NEXT:    v_readlane_b32 s53, v43, 13
+; CHECK-NEXT:    v_readlane_b32 s52, v43, 12
+; CHECK-NEXT:    v_readlane_b32 s51, v43, 11
+; CHECK-NEXT:    v_readlane_b32 s50, v43, 10
+; CHECK-NEXT:    v_readlane_b32 s49, v43, 9
+; CHECK-NEXT:    v_readlane_b32 s48, v43, 8
 ; CHECK-NEXT:    v_readlane_b32 s39, v43, 7
 ; CHECK-NEXT:    v_readlane_b32 s38, v43, 6
 ; CHECK-NEXT:    v_readlane_b32 s37, v43, 5

diff  --git a/llvm/test/CodeGen/AMDGPU/bf16.ll b/llvm/test/CodeGen/AMDGPU/bf16.ll
index aafdb1c8cc36f..91598496eb984 100644
--- a/llvm/test/CodeGen/AMDGPU/bf16.ll
+++ b/llvm/test/CodeGen/AMDGPU/bf16.ll
@@ -40671,14 +40671,6 @@ define <16 x bfloat> @v_vselect_v16bf16(<16 x i1> %cond, <16 x bfloat> %a, <16 x
 ; GCN-LABEL: v_vselect_v16bf16:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    s_xor_saveexec_b64 s[4:5], -1
-; GCN-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
-; GCN-NEXT:    s_mov_b64 exec, s[4:5]
-; GCN-NEXT:    s_waitcnt expcnt(0)
-; GCN-NEXT:    v_writelane_b32 v31, s30, 0
-; GCN-NEXT:    v_writelane_b32 v31, s31, 1
-; GCN-NEXT:    v_writelane_b32 v31, s34, 2
-; GCN-NEXT:    v_writelane_b32 v31, s35, 3
 ; GCN-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
 ; GCN-NEXT:    v_and_b32_e32 v0, 1, v1
@@ -40716,21 +40708,21 @@ define <16 x bfloat> @v_vselect_v16bf16(<16 x i1> %cond, <16 x bfloat> %a, <16 x
 ; GCN-NEXT:    v_and_b32_e32 v8, 1, v14
 ; GCN-NEXT:    v_cmp_eq_u32_e64 s[28:29], 1, v7
 ; GCN-NEXT:    buffer_load_dword v7, off, s[0:3], s32
-; GCN-NEXT:    v_cmp_eq_u32_e64 s[30:31], 1, v8
+; GCN-NEXT:    v_cmp_eq_u32_e64 s[40:41], 1, v8
 ; GCN-NEXT:    buffer_load_dword v8, off, s[0:3], s32 offset:64
 ; GCN-NEXT:    v_and_b32_e32 v9, 1, v15
-; GCN-NEXT:    v_cmp_eq_u32_e64 s[34:35], 1, v9
+; GCN-NEXT:    v_cmp_eq_u32_e64 s[42:43], 1, v9
 ; GCN-NEXT:    buffer_load_dword v9, off, s[0:3], s32 offset:60
 ; GCN-NEXT:    s_waitcnt vmcnt(2)
 ; GCN-NEXT:    v_mul_f32_e32 v7, 1.0, v7
 ; GCN-NEXT:    s_waitcnt vmcnt(1)
 ; GCN-NEXT:    v_mul_f32_e32 v8, 1.0, v8
-; GCN-NEXT:    v_cndmask_b32_e64 v15, v8, v7, s[34:35]
+; GCN-NEXT:    v_cndmask_b32_e64 v15, v8, v7, s[42:43]
 ; GCN-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:56
 ; GCN-NEXT:    v_mul_f32_e32 v8, 1.0, v30
 ; GCN-NEXT:    s_waitcnt vmcnt(1)
 ; GCN-NEXT:    v_mul_f32_e32 v9, 1.0, v9
-; GCN-NEXT:    v_cndmask_b32_e64 v14, v9, v8, s[30:31]
+; GCN-NEXT:    v_cndmask_b32_e64 v14, v9, v8, s[40:41]
 ; GCN-NEXT:    buffer_load_dword v8, off, s[0:3], s32 offset:52
 ; GCN-NEXT:    v_mul_f32_e32 v9, 1.0, v29
 ; GCN-NEXT:    s_waitcnt vmcnt(1)
@@ -40806,14 +40798,6 @@ define <16 x bfloat> @v_vselect_v16bf16(<16 x i1> %cond, <16 x bfloat> %a, <16 x
 ; GCN-NEXT:    v_and_b32_e32 v13, 0xffff0000, v13
 ; GCN-NEXT:    v_and_b32_e32 v14, 0xffff0000, v14
 ; GCN-NEXT:    v_and_b32_e32 v15, 0xffff0000, v15
-; GCN-NEXT:    v_readlane_b32 s35, v31, 3
-; GCN-NEXT:    v_readlane_b32 s34, v31, 2
-; GCN-NEXT:    v_readlane_b32 s31, v31, 1
-; GCN-NEXT:    v_readlane_b32 s30, v31, 0
-; GCN-NEXT:    s_xor_saveexec_b64 s[4:5], -1
-; GCN-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
-; GCN-NEXT:    s_mov_b64 exec, s[4:5]
-; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7-LABEL: v_vselect_v16bf16:
@@ -40954,9 +40938,6 @@ define <16 x bfloat> @v_vselect_v16bf16(<16 x i1> %cond, <16 x bfloat> %a, <16 x
 ; GFX8-LABEL: v_vselect_v16bf16:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    s_xor_saveexec_b64 s[4:5], -1
-; GFX8-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
-; GFX8-NEXT:    s_mov_b64 exec, s[4:5]
 ; GFX8-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
 ; GFX8-NEXT:    v_and_b32_e32 v0, 1, v1
@@ -40982,17 +40963,13 @@ define <16 x bfloat> @v_vselect_v16bf16(<16 x i1> %cond, <16 x bfloat> %a, <16 x
 ; GFX8-NEXT:    v_and_b32_e32 v0, 1, v11
 ; GFX8-NEXT:    v_cmp_eq_u32_e64 s[24:25], 1, v0
 ; GFX8-NEXT:    v_and_b32_e32 v0, 1, v12
-; GFX8-NEXT:    v_writelane_b32 v31, s30, 0
 ; GFX8-NEXT:    v_cmp_eq_u32_e64 s[26:27], 1, v0
 ; GFX8-NEXT:    v_and_b32_e32 v0, 1, v13
-; GFX8-NEXT:    v_writelane_b32 v31, s31, 1
 ; GFX8-NEXT:    v_cmp_eq_u32_e64 s[28:29], 1, v0
 ; GFX8-NEXT:    v_and_b32_e32 v0, 1, v14
-; GFX8-NEXT:    v_writelane_b32 v31, s34, 2
-; GFX8-NEXT:    v_cmp_eq_u32_e64 s[30:31], 1, v0
+; GFX8-NEXT:    v_cmp_eq_u32_e64 s[40:41], 1, v0
 ; GFX8-NEXT:    v_and_b32_e32 v0, 1, v15
-; GFX8-NEXT:    v_writelane_b32 v31, s35, 3
-; GFX8-NEXT:    v_cmp_eq_u32_e64 s[34:35], 1, v0
+; GFX8-NEXT:    v_cmp_eq_u32_e64 s[42:43], 1, v0
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v22
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 16, v30
 ; GFX8-NEXT:    v_cndmask_b32_e64 v6, v1, v0, s[28:29]
@@ -41018,9 +40995,9 @@ define <16 x bfloat> @v_vselect_v16bf16(<16 x i1> %cond, <16 x bfloat> %a, <16 x
 ; GFX8-NEXT:    v_or_b32_sdwa v4, v9, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; GFX8-NEXT:    v_or_b32_sdwa v5, v8, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    v_cndmask_b32_e64 v10, v0, v23, s[30:31]
+; GFX8-NEXT:    v_cndmask_b32_e64 v10, v0, v23, s[40:41]
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
-; GFX8-NEXT:    v_cndmask_b32_e64 v11, v0, v1, s[34:35]
+; GFX8-NEXT:    v_cndmask_b32_e64 v11, v0, v1, s[42:43]
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v19
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 16, v27
 ; GFX8-NEXT:    v_cndmask_b32_e64 v3, v1, v0, s[16:17]
@@ -41043,14 +41020,6 @@ define <16 x bfloat> @v_vselect_v16bf16(<16 x i1> %cond, <16 x bfloat> %a, <16 x
 ; GFX8-NEXT:    v_or_b32_sdwa v2, v13, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; GFX8-NEXT:    v_or_b32_sdwa v3, v12, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; GFX8-NEXT:    v_or_b32_sdwa v7, v10, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX8-NEXT:    v_readlane_b32 s35, v31, 3
-; GFX8-NEXT:    v_readlane_b32 s34, v31, 2
-; GFX8-NEXT:    v_readlane_b32 s31, v31, 1
-; GFX8-NEXT:    v_readlane_b32 s30, v31, 0
-; GFX8-NEXT:    s_xor_saveexec_b64 s[4:5], -1
-; GFX8-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
-; GFX8-NEXT:    s_mov_b64 exec, s[4:5]
-; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_vselect_v16bf16:
@@ -42030,108 +41999,80 @@ define <32 x bfloat> @v_vselect_v32bf16(<32 x i1> %cond, <32 x bfloat> %a, <32 x
 ; GFX8-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX8-NEXT:    buffer_store_dword v34, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
 ; GFX8-NEXT:    s_mov_b64 exec, s[4:5]
-; GFX8-NEXT:    v_writelane_b32 v34, s30, 0
-; GFX8-NEXT:    v_writelane_b32 v34, s31, 1
-; GFX8-NEXT:    v_writelane_b32 v34, s34, 2
 ; GFX8-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX8-NEXT:    v_writelane_b32 v34, s35, 3
 ; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
 ; GFX8-NEXT:    v_and_b32_e32 v0, 1, v1
-; GFX8-NEXT:    v_writelane_b32 v34, s36, 4
 ; GFX8-NEXT:    v_cmp_eq_u32_e64 s[4:5], 1, v0
 ; GFX8-NEXT:    v_and_b32_e32 v0, 1, v2
-; GFX8-NEXT:    v_writelane_b32 v34, s37, 5
 ; GFX8-NEXT:    v_cmp_eq_u32_e64 s[6:7], 1, v0
 ; GFX8-NEXT:    v_and_b32_e32 v0, 1, v3
-; GFX8-NEXT:    v_writelane_b32 v34, s38, 6
 ; GFX8-NEXT:    v_cmp_eq_u32_e64 s[8:9], 1, v0
 ; GFX8-NEXT:    v_and_b32_e32 v0, 1, v4
-; GFX8-NEXT:    v_writelane_b32 v34, s39, 7
 ; GFX8-NEXT:    v_cmp_eq_u32_e64 s[10:11], 1, v0
 ; GFX8-NEXT:    v_and_b32_e32 v0, 1, v5
-; GFX8-NEXT:    v_writelane_b32 v34, s40, 8
 ; GFX8-NEXT:    v_cmp_eq_u32_e64 s[12:13], 1, v0
 ; GFX8-NEXT:    v_and_b32_e32 v0, 1, v6
-; GFX8-NEXT:    v_writelane_b32 v34, s41, 9
 ; GFX8-NEXT:    v_cmp_eq_u32_e64 s[14:15], 1, v0
 ; GFX8-NEXT:    v_and_b32_e32 v0, 1, v7
-; GFX8-NEXT:    v_writelane_b32 v34, s42, 10
 ; GFX8-NEXT:    v_cmp_eq_u32_e64 s[16:17], 1, v0
 ; GFX8-NEXT:    v_and_b32_e32 v0, 1, v8
-; GFX8-NEXT:    v_writelane_b32 v34, s43, 11
 ; GFX8-NEXT:    v_cmp_eq_u32_e64 s[18:19], 1, v0
 ; GFX8-NEXT:    v_and_b32_e32 v0, 1, v9
-; GFX8-NEXT:    v_writelane_b32 v34, s44, 12
 ; GFX8-NEXT:    v_cmp_eq_u32_e64 s[20:21], 1, v0
 ; GFX8-NEXT:    v_and_b32_e32 v0, 1, v10
-; GFX8-NEXT:    v_writelane_b32 v34, s45, 13
 ; GFX8-NEXT:    v_cmp_eq_u32_e64 s[22:23], 1, v0
 ; GFX8-NEXT:    v_and_b32_e32 v0, 1, v11
-; GFX8-NEXT:    v_writelane_b32 v34, s46, 14
 ; GFX8-NEXT:    v_cmp_eq_u32_e64 s[24:25], 1, v0
 ; GFX8-NEXT:    v_and_b32_e32 v0, 1, v12
-; GFX8-NEXT:    v_writelane_b32 v34, s47, 15
 ; GFX8-NEXT:    v_cmp_eq_u32_e64 s[26:27], 1, v0
 ; GFX8-NEXT:    v_and_b32_e32 v0, 1, v13
-; GFX8-NEXT:    v_writelane_b32 v34, s48, 16
 ; GFX8-NEXT:    v_cmp_eq_u32_e64 s[28:29], 1, v0
 ; GFX8-NEXT:    v_and_b32_e32 v0, 1, v14
-; GFX8-NEXT:    v_writelane_b32 v34, s49, 17
-; GFX8-NEXT:    v_cmp_eq_u32_e64 s[30:31], 1, v0
+; GFX8-NEXT:    v_cmp_eq_u32_e64 s[40:41], 1, v0
 ; GFX8-NEXT:    v_and_b32_e32 v0, 1, v15
-; GFX8-NEXT:    v_writelane_b32 v34, s50, 18
-; GFX8-NEXT:    v_cmp_eq_u32_e64 s[34:35], 1, v0
+; GFX8-NEXT:    v_cmp_eq_u32_e64 s[42:43], 1, v0
 ; GFX8-NEXT:    v_and_b32_e32 v0, 1, v16
-; GFX8-NEXT:    v_writelane_b32 v34, s51, 19
-; GFX8-NEXT:    v_cmp_eq_u32_e64 s[36:37], 1, v0
+; GFX8-NEXT:    v_cmp_eq_u32_e64 s[44:45], 1, v0
 ; GFX8-NEXT:    v_and_b32_e32 v0, 1, v17
-; GFX8-NEXT:    v_writelane_b32 v34, s52, 20
-; GFX8-NEXT:    v_cmp_eq_u32_e64 s[38:39], 1, v0
+; GFX8-NEXT:    v_cmp_eq_u32_e64 s[46:47], 1, v0
 ; GFX8-NEXT:    v_and_b32_e32 v0, 1, v18
-; GFX8-NEXT:    v_writelane_b32 v34, s53, 21
-; GFX8-NEXT:    v_cmp_eq_u32_e64 s[40:41], 1, v0
+; GFX8-NEXT:    v_cmp_eq_u32_e64 s[56:57], 1, v0
 ; GFX8-NEXT:    v_and_b32_e32 v0, 1, v19
-; GFX8-NEXT:    v_writelane_b32 v34, s54, 22
-; GFX8-NEXT:    v_cmp_eq_u32_e64 s[42:43], 1, v0
+; GFX8-NEXT:    v_cmp_eq_u32_e64 s[58:59], 1, v0
 ; GFX8-NEXT:    v_and_b32_e32 v0, 1, v20
-; GFX8-NEXT:    v_writelane_b32 v34, s55, 23
-; GFX8-NEXT:    v_cmp_eq_u32_e64 s[44:45], 1, v0
+; GFX8-NEXT:    v_cmp_eq_u32_e64 s[60:61], 1, v0
 ; GFX8-NEXT:    v_and_b32_e32 v0, 1, v21
-; GFX8-NEXT:    v_writelane_b32 v34, s56, 24
-; GFX8-NEXT:    v_cmp_eq_u32_e64 s[46:47], 1, v0
+; GFX8-NEXT:    v_cmp_eq_u32_e64 s[62:63], 1, v0
 ; GFX8-NEXT:    v_and_b32_e32 v0, 1, v22
-; GFX8-NEXT:    v_writelane_b32 v34, s57, 25
-; GFX8-NEXT:    v_cmp_eq_u32_e64 s[48:49], 1, v0
+; GFX8-NEXT:    v_cmp_eq_u32_e64 s[72:73], 1, v0
 ; GFX8-NEXT:    v_and_b32_e32 v0, 1, v23
-; GFX8-NEXT:    v_writelane_b32 v34, s58, 26
-; GFX8-NEXT:    v_cmp_eq_u32_e64 s[50:51], 1, v0
+; GFX8-NEXT:    v_cmp_eq_u32_e64 s[74:75], 1, v0
 ; GFX8-NEXT:    v_and_b32_e32 v0, 1, v24
-; GFX8-NEXT:    v_writelane_b32 v34, s59, 27
-; GFX8-NEXT:    v_cmp_eq_u32_e64 s[52:53], 1, v0
+; GFX8-NEXT:    v_cmp_eq_u32_e64 s[76:77], 1, v0
 ; GFX8-NEXT:    v_and_b32_e32 v0, 1, v25
-; GFX8-NEXT:    v_writelane_b32 v34, s60, 28
-; GFX8-NEXT:    v_cmp_eq_u32_e64 s[54:55], 1, v0
+; GFX8-NEXT:    v_writelane_b32 v34, s30, 0
+; GFX8-NEXT:    v_cmp_eq_u32_e64 s[78:79], 1, v0
 ; GFX8-NEXT:    v_and_b32_e32 v0, 1, v26
-; GFX8-NEXT:    v_writelane_b32 v34, s61, 29
-; GFX8-NEXT:    v_cmp_eq_u32_e64 s[56:57], 1, v0
+; GFX8-NEXT:    v_writelane_b32 v34, s31, 1
+; GFX8-NEXT:    v_cmp_eq_u32_e64 s[88:89], 1, v0
 ; GFX8-NEXT:    v_and_b32_e32 v0, 1, v27
-; GFX8-NEXT:    v_writelane_b32 v34, s62, 30
-; GFX8-NEXT:    v_cmp_eq_u32_e64 s[58:59], 1, v0
+; GFX8-NEXT:    v_writelane_b32 v34, s34, 2
+; GFX8-NEXT:    v_cmp_eq_u32_e64 s[90:91], 1, v0
 ; GFX8-NEXT:    v_and_b32_e32 v0, 1, v28
-; GFX8-NEXT:    v_writelane_b32 v34, s63, 31
-; GFX8-NEXT:    v_cmp_eq_u32_e64 s[60:61], 1, v0
+; GFX8-NEXT:    v_writelane_b32 v34, s35, 3
+; GFX8-NEXT:    v_cmp_eq_u32_e64 s[30:31], 1, v0
 ; GFX8-NEXT:    v_and_b32_e32 v0, 1, v29
-; GFX8-NEXT:    v_writelane_b32 v34, s64, 32
-; GFX8-NEXT:    v_cmp_eq_u32_e64 s[62:63], 1, v0
+; GFX8-NEXT:    v_writelane_b32 v34, s36, 4
+; GFX8-NEXT:    v_cmp_eq_u32_e64 s[34:35], 1, v0
 ; GFX8-NEXT:    v_and_b32_e32 v0, 1, v30
-; GFX8-NEXT:    v_writelane_b32 v34, s65, 33
-; GFX8-NEXT:    v_cmp_eq_u32_e64 s[64:65], 1, v0
+; GFX8-NEXT:    v_writelane_b32 v34, s37, 5
+; GFX8-NEXT:    v_cmp_eq_u32_e64 s[36:37], 1, v0
 ; GFX8-NEXT:    buffer_load_ushort v0, off, s[0:3], s32
-; GFX8-NEXT:    v_writelane_b32 v34, s66, 34
-; GFX8-NEXT:    v_writelane_b32 v34, s67, 35
+; GFX8-NEXT:    v_writelane_b32 v34, s38, 6
+; GFX8-NEXT:    v_writelane_b32 v34, s39, 7
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX8-NEXT:    v_cmp_eq_u32_e64 s[66:67], 1, v0
+; GFX8-NEXT:    v_cmp_eq_u32_e64 s[38:39], 1, v0
 ; GFX8-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:68
 ; GFX8-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:4
 ; GFX8-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:72
@@ -42168,40 +42109,40 @@ define <32 x bfloat> @v_vselect_v32bf16(<32 x i1> %cond, <32 x bfloat> %a, <32 x
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v33, 16, v29
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v28, 16, v32
-; GFX8-NEXT:    v_cndmask_b32_e64 v28, v33, v28, s[66:67]
-; GFX8-NEXT:    v_cndmask_b32_e64 v29, v29, v32, s[64:65]
+; GFX8-NEXT:    v_cndmask_b32_e64 v28, v33, v28, s[38:39]
+; GFX8-NEXT:    v_cndmask_b32_e64 v29, v29, v32, s[36:37]
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v32, 16, v31
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v33, 16, v30
-; GFX8-NEXT:    v_cndmask_b32_e64 v32, v33, v32, s[62:63]
-; GFX8-NEXT:    v_cndmask_b32_e64 v30, v30, v31, s[60:61]
+; GFX8-NEXT:    v_cndmask_b32_e64 v32, v33, v32, s[34:35]
+; GFX8-NEXT:    v_cndmask_b32_e64 v30, v30, v31, s[30:31]
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v31, 16, v27
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v33, 16, v26
-; GFX8-NEXT:    v_cndmask_b32_e64 v31, v33, v31, s[58:59]
-; GFX8-NEXT:    v_cndmask_b32_e64 v26, v26, v27, s[56:57]
+; GFX8-NEXT:    v_cndmask_b32_e64 v31, v33, v31, s[90:91]
+; GFX8-NEXT:    v_cndmask_b32_e64 v26, v26, v27, s[88:89]
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v27, 16, v25
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v33, 16, v24
-; GFX8-NEXT:    v_cndmask_b32_e64 v27, v33, v27, s[54:55]
-; GFX8-NEXT:    v_cndmask_b32_e64 v24, v24, v25, s[52:53]
+; GFX8-NEXT:    v_cndmask_b32_e64 v27, v33, v27, s[78:79]
+; GFX8-NEXT:    v_cndmask_b32_e64 v24, v24, v25, s[76:77]
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v25, 16, v23
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v33, 16, v22
-; GFX8-NEXT:    v_cndmask_b32_e64 v25, v33, v25, s[50:51]
-; GFX8-NEXT:    v_cndmask_b32_e64 v22, v22, v23, s[48:49]
+; GFX8-NEXT:    v_cndmask_b32_e64 v25, v33, v25, s[74:75]
+; GFX8-NEXT:    v_cndmask_b32_e64 v22, v22, v23, s[72:73]
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v23, 16, v21
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v33, 16, v20
-; GFX8-NEXT:    v_cndmask_b32_e64 v23, v33, v23, s[46:47]
-; GFX8-NEXT:    v_cndmask_b32_e64 v20, v20, v21, s[44:45]
+; GFX8-NEXT:    v_cndmask_b32_e64 v23, v33, v23, s[62:63]
+; GFX8-NEXT:    v_cndmask_b32_e64 v20, v20, v21, s[60:61]
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v21, 16, v19
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v33, 16, v18
-; GFX8-NEXT:    v_cndmask_b32_e64 v21, v33, v21, s[42:43]
-; GFX8-NEXT:    v_cndmask_b32_e64 v18, v18, v19, s[40:41]
+; GFX8-NEXT:    v_cndmask_b32_e64 v21, v33, v21, s[58:59]
+; GFX8-NEXT:    v_cndmask_b32_e64 v18, v18, v19, s[56:57]
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v19, 16, v17
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v33, 16, v16
-; GFX8-NEXT:    v_cndmask_b32_e64 v19, v33, v19, s[38:39]
-; GFX8-NEXT:    v_cndmask_b32_e64 v16, v16, v17, s[36:37]
+; GFX8-NEXT:    v_cndmask_b32_e64 v19, v33, v19, s[46:47]
+; GFX8-NEXT:    v_cndmask_b32_e64 v16, v16, v17, s[44:45]
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v17, 16, v15
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v33, 16, v14
-; GFX8-NEXT:    v_cndmask_b32_e64 v17, v33, v17, s[34:35]
-; GFX8-NEXT:    v_cndmask_b32_e64 v14, v14, v15, s[30:31]
+; GFX8-NEXT:    v_cndmask_b32_e64 v17, v33, v17, s[42:43]
+; GFX8-NEXT:    v_cndmask_b32_e64 v14, v14, v15, s[40:41]
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v15, 16, v13
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v33, 16, v12
 ; GFX8-NEXT:    v_cndmask_b32_e64 v15, v33, v15, s[28:29]
@@ -42262,34 +42203,6 @@ define <32 x bfloat> @v_vselect_v32bf16(<32 x i1> %cond, <32 x bfloat> %a, <32 x
 ; GFX8-NEXT:    v_or_b32_sdwa v13, v26, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; GFX8-NEXT:    v_or_b32_sdwa v14, v30, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; GFX8-NEXT:    v_or_b32_sdwa v15, v29, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX8-NEXT:    v_readlane_b32 s67, v34, 35
-; GFX8-NEXT:    v_readlane_b32 s66, v34, 34
-; GFX8-NEXT:    v_readlane_b32 s65, v34, 33
-; GFX8-NEXT:    v_readlane_b32 s64, v34, 32
-; GFX8-NEXT:    v_readlane_b32 s63, v34, 31
-; GFX8-NEXT:    v_readlane_b32 s62, v34, 30
-; GFX8-NEXT:    v_readlane_b32 s61, v34, 29
-; GFX8-NEXT:    v_readlane_b32 s60, v34, 28
-; GFX8-NEXT:    v_readlane_b32 s59, v34, 27
-; GFX8-NEXT:    v_readlane_b32 s58, v34, 26
-; GFX8-NEXT:    v_readlane_b32 s57, v34, 25
-; GFX8-NEXT:    v_readlane_b32 s56, v34, 24
-; GFX8-NEXT:    v_readlane_b32 s55, v34, 23
-; GFX8-NEXT:    v_readlane_b32 s54, v34, 22
-; GFX8-NEXT:    v_readlane_b32 s53, v34, 21
-; GFX8-NEXT:    v_readlane_b32 s52, v34, 20
-; GFX8-NEXT:    v_readlane_b32 s51, v34, 19
-; GFX8-NEXT:    v_readlane_b32 s50, v34, 18
-; GFX8-NEXT:    v_readlane_b32 s49, v34, 17
-; GFX8-NEXT:    v_readlane_b32 s48, v34, 16
-; GFX8-NEXT:    v_readlane_b32 s47, v34, 15
-; GFX8-NEXT:    v_readlane_b32 s46, v34, 14
-; GFX8-NEXT:    v_readlane_b32 s45, v34, 13
-; GFX8-NEXT:    v_readlane_b32 s44, v34, 12
-; GFX8-NEXT:    v_readlane_b32 s43, v34, 11
-; GFX8-NEXT:    v_readlane_b32 s42, v34, 10
-; GFX8-NEXT:    v_readlane_b32 s41, v34, 9
-; GFX8-NEXT:    v_readlane_b32 s40, v34, 8
 ; GFX8-NEXT:    v_readlane_b32 s39, v34, 7
 ; GFX8-NEXT:    v_readlane_b32 s38, v34, 6
 ; GFX8-NEXT:    v_readlane_b32 s37, v34, 5
@@ -42310,108 +42223,76 @@ define <32 x bfloat> @v_vselect_v32bf16(<32 x i1> %cond, <32 x bfloat> %a, <32 x
 ; GFX9-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX9-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
-; GFX9-NEXT:    v_writelane_b32 v33, s30, 0
-; GFX9-NEXT:    v_writelane_b32 v33, s31, 1
-; GFX9-NEXT:    v_writelane_b32 v33, s34, 2
 ; GFX9-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX9-NEXT:    v_writelane_b32 v33, s35, 3
 ; GFX9-NEXT:    v_cmp_eq_u32_e64 s[4:5], 1, v0
 ; GFX9-NEXT:    v_and_b32_e32 v0, 1, v3
-; GFX9-NEXT:    v_writelane_b32 v33, s36, 4
 ; GFX9-NEXT:    v_cmp_eq_u32_e64 s[6:7], 1, v0
 ; GFX9-NEXT:    v_and_b32_e32 v0, 1, v2
-; GFX9-NEXT:    v_writelane_b32 v33, s37, 5
 ; GFX9-NEXT:    v_cmp_eq_u32_e64 s[8:9], 1, v0
 ; GFX9-NEXT:    v_and_b32_e32 v0, 1, v5
-; GFX9-NEXT:    v_writelane_b32 v33, s38, 6
 ; GFX9-NEXT:    v_cmp_eq_u32_e64 s[10:11], 1, v0
 ; GFX9-NEXT:    v_and_b32_e32 v0, 1, v4
-; GFX9-NEXT:    v_writelane_b32 v33, s39, 7
 ; GFX9-NEXT:    v_cmp_eq_u32_e64 s[12:13], 1, v0
 ; GFX9-NEXT:    v_and_b32_e32 v0, 1, v7
-; GFX9-NEXT:    v_writelane_b32 v33, s40, 8
 ; GFX9-NEXT:    v_cmp_eq_u32_e64 s[14:15], 1, v0
 ; GFX9-NEXT:    v_and_b32_e32 v0, 1, v6
-; GFX9-NEXT:    v_writelane_b32 v33, s41, 9
 ; GFX9-NEXT:    v_cmp_eq_u32_e64 s[16:17], 1, v0
 ; GFX9-NEXT:    v_and_b32_e32 v0, 1, v9
-; GFX9-NEXT:    v_writelane_b32 v33, s42, 10
 ; GFX9-NEXT:    v_cmp_eq_u32_e64 s[18:19], 1, v0
 ; GFX9-NEXT:    v_and_b32_e32 v0, 1, v8
-; GFX9-NEXT:    v_writelane_b32 v33, s43, 11
 ; GFX9-NEXT:    v_cmp_eq_u32_e64 s[20:21], 1, v0
 ; GFX9-NEXT:    v_and_b32_e32 v0, 1, v11
-; GFX9-NEXT:    v_writelane_b32 v33, s44, 12
 ; GFX9-NEXT:    v_cmp_eq_u32_e64 s[22:23], 1, v0
 ; GFX9-NEXT:    v_and_b32_e32 v0, 1, v10
-; GFX9-NEXT:    v_writelane_b32 v33, s45, 13
 ; GFX9-NEXT:    v_cmp_eq_u32_e64 s[24:25], 1, v0
 ; GFX9-NEXT:    v_and_b32_e32 v0, 1, v13
-; GFX9-NEXT:    v_writelane_b32 v33, s46, 14
 ; GFX9-NEXT:    v_cmp_eq_u32_e64 s[26:27], 1, v0
 ; GFX9-NEXT:    v_and_b32_e32 v0, 1, v12
-; GFX9-NEXT:    v_writelane_b32 v33, s47, 15
 ; GFX9-NEXT:    v_cmp_eq_u32_e64 s[28:29], 1, v0
 ; GFX9-NEXT:    v_and_b32_e32 v0, 1, v15
-; GFX9-NEXT:    v_writelane_b32 v33, s48, 16
-; GFX9-NEXT:    v_cmp_eq_u32_e64 s[30:31], 1, v0
+; GFX9-NEXT:    v_cmp_eq_u32_e64 s[40:41], 1, v0
 ; GFX9-NEXT:    v_and_b32_e32 v0, 1, v14
-; GFX9-NEXT:    v_writelane_b32 v33, s49, 17
-; GFX9-NEXT:    v_cmp_eq_u32_e64 s[34:35], 1, v0
+; GFX9-NEXT:    v_cmp_eq_u32_e64 s[42:43], 1, v0
 ; GFX9-NEXT:    v_and_b32_e32 v0, 1, v17
-; GFX9-NEXT:    v_writelane_b32 v33, s50, 18
-; GFX9-NEXT:    v_cmp_eq_u32_e64 s[36:37], 1, v0
+; GFX9-NEXT:    v_cmp_eq_u32_e64 s[44:45], 1, v0
 ; GFX9-NEXT:    v_and_b32_e32 v0, 1, v16
-; GFX9-NEXT:    v_writelane_b32 v33, s51, 19
-; GFX9-NEXT:    v_cmp_eq_u32_e64 s[38:39], 1, v0
+; GFX9-NEXT:    v_cmp_eq_u32_e64 s[46:47], 1, v0
 ; GFX9-NEXT:    v_and_b32_e32 v0, 1, v19
-; GFX9-NEXT:    v_writelane_b32 v33, s52, 20
-; GFX9-NEXT:    v_cmp_eq_u32_e64 s[40:41], 1, v0
+; GFX9-NEXT:    v_cmp_eq_u32_e64 s[56:57], 1, v0
 ; GFX9-NEXT:    v_and_b32_e32 v0, 1, v18
-; GFX9-NEXT:    v_writelane_b32 v33, s53, 21
-; GFX9-NEXT:    v_cmp_eq_u32_e64 s[42:43], 1, v0
+; GFX9-NEXT:    v_cmp_eq_u32_e64 s[58:59], 1, v0
 ; GFX9-NEXT:    v_and_b32_e32 v0, 1, v21
-; GFX9-NEXT:    v_writelane_b32 v33, s54, 22
-; GFX9-NEXT:    v_cmp_eq_u32_e64 s[44:45], 1, v0
+; GFX9-NEXT:    v_cmp_eq_u32_e64 s[60:61], 1, v0
 ; GFX9-NEXT:    v_and_b32_e32 v0, 1, v20
-; GFX9-NEXT:    v_writelane_b32 v33, s55, 23
-; GFX9-NEXT:    v_cmp_eq_u32_e64 s[46:47], 1, v0
+; GFX9-NEXT:    v_cmp_eq_u32_e64 s[62:63], 1, v0
 ; GFX9-NEXT:    v_and_b32_e32 v0, 1, v23
-; GFX9-NEXT:    v_writelane_b32 v33, s56, 24
-; GFX9-NEXT:    v_cmp_eq_u32_e64 s[48:49], 1, v0
+; GFX9-NEXT:    v_cmp_eq_u32_e64 s[72:73], 1, v0
 ; GFX9-NEXT:    v_and_b32_e32 v0, 1, v22
-; GFX9-NEXT:    v_writelane_b32 v33, s57, 25
-; GFX9-NEXT:    v_cmp_eq_u32_e64 s[50:51], 1, v0
+; GFX9-NEXT:    v_cmp_eq_u32_e64 s[74:75], 1, v0
 ; GFX9-NEXT:    v_and_b32_e32 v0, 1, v25
-; GFX9-NEXT:    v_writelane_b32 v33, s58, 26
-; GFX9-NEXT:    v_cmp_eq_u32_e64 s[52:53], 1, v0
+; GFX9-NEXT:    v_cmp_eq_u32_e64 s[76:77], 1, v0
 ; GFX9-NEXT:    v_and_b32_e32 v0, 1, v24
-; GFX9-NEXT:    v_writelane_b32 v33, s59, 27
-; GFX9-NEXT:    v_cmp_eq_u32_e64 s[54:55], 1, v0
+; GFX9-NEXT:    v_cmp_eq_u32_e64 s[78:79], 1, v0
 ; GFX9-NEXT:    v_and_b32_e32 v0, 1, v27
-; GFX9-NEXT:    v_writelane_b32 v33, s60, 28
-; GFX9-NEXT:    v_cmp_eq_u32_e64 s[56:57], 1, v0
+; GFX9-NEXT:    v_cmp_eq_u32_e64 s[88:89], 1, v0
 ; GFX9-NEXT:    v_and_b32_e32 v0, 1, v26
-; GFX9-NEXT:    v_writelane_b32 v33, s61, 29
-; GFX9-NEXT:    v_cmp_eq_u32_e64 s[58:59], 1, v0
+; GFX9-NEXT:    v_cmp_eq_u32_e64 s[90:91], 1, v0
 ; GFX9-NEXT:    v_and_b32_e32 v0, 1, v29
-; GFX9-NEXT:    v_writelane_b32 v33, s62, 30
-; GFX9-NEXT:    v_cmp_eq_u32_e64 s[60:61], 1, v0
+; GFX9-NEXT:    v_cmp_eq_u32_e64 s[92:93], 1, v0
 ; GFX9-NEXT:    v_and_b32_e32 v0, 1, v28
-; GFX9-NEXT:    v_writelane_b32 v33, s63, 31
-; GFX9-NEXT:    v_cmp_eq_u32_e64 s[62:63], 1, v0
+; GFX9-NEXT:    v_cmp_eq_u32_e64 s[94:95], 1, v0
 ; GFX9-NEXT:    buffer_load_ushort v0, off, s[0:3], s32
-; GFX9-NEXT:    v_writelane_b32 v33, s64, 32
-; GFX9-NEXT:    v_writelane_b32 v33, s65, 33
-; GFX9-NEXT:    v_writelane_b32 v33, s66, 34
+; GFX9-NEXT:    v_writelane_b32 v33, s30, 0
+; GFX9-NEXT:    v_writelane_b32 v33, s31, 1
+; GFX9-NEXT:    v_writelane_b32 v33, s34, 2
 ; GFX9-NEXT:    v_and_b32_e32 v1, 1, v1
-; GFX9-NEXT:    v_writelane_b32 v33, s67, 35
+; GFX9-NEXT:    v_writelane_b32 v33, s35, 3
 ; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v1
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX9-NEXT:    v_cmp_eq_u32_e64 s[64:65], 1, v0
+; GFX9-NEXT:    v_cmp_eq_u32_e64 s[30:31], 1, v0
 ; GFX9-NEXT:    v_and_b32_e32 v0, 1, v30
-; GFX9-NEXT:    v_cmp_eq_u32_e64 s[66:67], 1, v0
+; GFX9-NEXT:    v_cmp_eq_u32_e64 s[34:35], 1, v0
 ; GFX9-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:68
 ; GFX9-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:4
 ; GFX9-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:72
@@ -42445,42 +42326,42 @@ define <32 x bfloat> @v_vselect_v32bf16(<32 x i1> %cond, <32 x bfloat> %a, <32 x
 ; GFX9-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:128
 ; GFX9-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:64
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_cndmask_b32_e64 v29, v31, v32, s[66:67]
+; GFX9-NEXT:    v_cndmask_b32_e64 v29, v31, v32, s[34:35]
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v32, 16, v32
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v31, 16, v31
-; GFX9-NEXT:    v_cndmask_b32_e64 v31, v31, v32, s[64:65]
-; GFX9-NEXT:    v_cndmask_b32_e64 v32, v28, v30, s[62:63]
+; GFX9-NEXT:    v_cndmask_b32_e64 v31, v31, v32, s[30:31]
+; GFX9-NEXT:    v_cndmask_b32_e64 v32, v28, v30, s[94:95]
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v30, 16, v30
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v28, 16, v28
-; GFX9-NEXT:    v_cndmask_b32_e64 v28, v28, v30, s[60:61]
-; GFX9-NEXT:    v_cndmask_b32_e64 v30, v26, v27, s[58:59]
+; GFX9-NEXT:    v_cndmask_b32_e64 v28, v28, v30, s[92:93]
+; GFX9-NEXT:    v_cndmask_b32_e64 v30, v26, v27, s[90:91]
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v27, 16, v27
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v26, 16, v26
-; GFX9-NEXT:    v_cndmask_b32_e64 v26, v26, v27, s[56:57]
-; GFX9-NEXT:    v_cndmask_b32_e64 v27, v24, v25, s[54:55]
+; GFX9-NEXT:    v_cndmask_b32_e64 v26, v26, v27, s[88:89]
+; GFX9-NEXT:    v_cndmask_b32_e64 v27, v24, v25, s[78:79]
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v25, 16, v25
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v24, 16, v24
-; GFX9-NEXT:    v_cndmask_b32_e64 v24, v24, v25, s[52:53]
-; GFX9-NEXT:    v_cndmask_b32_e64 v25, v22, v23, s[50:51]
+; GFX9-NEXT:    v_cndmask_b32_e64 v24, v24, v25, s[76:77]
+; GFX9-NEXT:    v_cndmask_b32_e64 v25, v22, v23, s[74:75]
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v23, 16, v23
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v22, 16, v22
-; GFX9-NEXT:    v_cndmask_b32_e64 v22, v22, v23, s[48:49]
-; GFX9-NEXT:    v_cndmask_b32_e64 v23, v20, v21, s[46:47]
+; GFX9-NEXT:    v_cndmask_b32_e64 v22, v22, v23, s[72:73]
+; GFX9-NEXT:    v_cndmask_b32_e64 v23, v20, v21, s[62:63]
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v21, 16, v21
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v20, 16, v20
-; GFX9-NEXT:    v_cndmask_b32_e64 v20, v20, v21, s[44:45]
-; GFX9-NEXT:    v_cndmask_b32_e64 v21, v18, v19, s[42:43]
+; GFX9-NEXT:    v_cndmask_b32_e64 v20, v20, v21, s[60:61]
+; GFX9-NEXT:    v_cndmask_b32_e64 v21, v18, v19, s[58:59]
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v19, 16, v19
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v18, 16, v18
-; GFX9-NEXT:    v_cndmask_b32_e64 v18, v18, v19, s[40:41]
-; GFX9-NEXT:    v_cndmask_b32_e64 v19, v16, v17, s[38:39]
+; GFX9-NEXT:    v_cndmask_b32_e64 v18, v18, v19, s[56:57]
+; GFX9-NEXT:    v_cndmask_b32_e64 v19, v16, v17, s[46:47]
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v17, 16, v17
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v16, 16, v16
-; GFX9-NEXT:    v_cndmask_b32_e64 v16, v16, v17, s[36:37]
-; GFX9-NEXT:    v_cndmask_b32_e64 v17, v14, v15, s[34:35]
+; GFX9-NEXT:    v_cndmask_b32_e64 v16, v16, v17, s[44:45]
+; GFX9-NEXT:    v_cndmask_b32_e64 v17, v14, v15, s[42:43]
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v15, 16, v15
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v14, 16, v14
-; GFX9-NEXT:    v_cndmask_b32_e64 v14, v14, v15, s[30:31]
+; GFX9-NEXT:    v_cndmask_b32_e64 v14, v14, v15, s[40:41]
 ; GFX9-NEXT:    v_cndmask_b32_e64 v15, v12, v13, s[28:29]
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v13, 16, v13
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v12, 16, v12
@@ -42526,38 +42407,6 @@ define <32 x bfloat> @v_vselect_v32bf16(<32 x i1> %cond, <32 x bfloat> %a, <32 x
 ; GFX9-NEXT:    v_perm_b32 v13, v26, v30, s4
 ; GFX9-NEXT:    v_perm_b32 v14, v28, v32, s4
 ; GFX9-NEXT:    v_perm_b32 v15, v31, v29, s4
-; GFX9-NEXT:    v_readlane_b32 s67, v33, 35
-; GFX9-NEXT:    v_readlane_b32 s66, v33, 34
-; GFX9-NEXT:    v_readlane_b32 s65, v33, 33
-; GFX9-NEXT:    v_readlane_b32 s64, v33, 32
-; GFX9-NEXT:    v_readlane_b32 s63, v33, 31
-; GFX9-NEXT:    v_readlane_b32 s62, v33, 30
-; GFX9-NEXT:    v_readlane_b32 s61, v33, 29
-; GFX9-NEXT:    v_readlane_b32 s60, v33, 28
-; GFX9-NEXT:    v_readlane_b32 s59, v33, 27
-; GFX9-NEXT:    v_readlane_b32 s58, v33, 26
-; GFX9-NEXT:    v_readlane_b32 s57, v33, 25
-; GFX9-NEXT:    v_readlane_b32 s56, v33, 24
-; GFX9-NEXT:    v_readlane_b32 s55, v33, 23
-; GFX9-NEXT:    v_readlane_b32 s54, v33, 22
-; GFX9-NEXT:    v_readlane_b32 s53, v33, 21
-; GFX9-NEXT:    v_readlane_b32 s52, v33, 20
-; GFX9-NEXT:    v_readlane_b32 s51, v33, 19
-; GFX9-NEXT:    v_readlane_b32 s50, v33, 18
-; GFX9-NEXT:    v_readlane_b32 s49, v33, 17
-; GFX9-NEXT:    v_readlane_b32 s48, v33, 16
-; GFX9-NEXT:    v_readlane_b32 s47, v33, 15
-; GFX9-NEXT:    v_readlane_b32 s46, v33, 14
-; GFX9-NEXT:    v_readlane_b32 s45, v33, 13
-; GFX9-NEXT:    v_readlane_b32 s44, v33, 12
-; GFX9-NEXT:    v_readlane_b32 s43, v33, 11
-; GFX9-NEXT:    v_readlane_b32 s42, v33, 10
-; GFX9-NEXT:    v_readlane_b32 s41, v33, 9
-; GFX9-NEXT:    v_readlane_b32 s40, v33, 8
-; GFX9-NEXT:    v_readlane_b32 s39, v33, 7
-; GFX9-NEXT:    v_readlane_b32 s38, v33, 6
-; GFX9-NEXT:    v_readlane_b32 s37, v33, 5
-; GFX9-NEXT:    v_readlane_b32 s36, v33, 4
 ; GFX9-NEXT:    v_readlane_b32 s35, v33, 3
 ; GFX9-NEXT:    v_readlane_b32 s34, v33, 2
 ; GFX9-NEXT:    v_readlane_b32 s31, v33, 1

diff  --git a/llvm/test/CodeGen/AMDGPU/blender-no-live-segment-at-def-implicit-def.ll b/llvm/test/CodeGen/AMDGPU/blender-no-live-segment-at-def-implicit-def.ll
index f9ffa5ae57f3e..36fa7b97b3c77 100644
--- a/llvm/test/CodeGen/AMDGPU/blender-no-live-segment-at-def-implicit-def.ll
+++ b/llvm/test/CodeGen/AMDGPU/blender-no-live-segment-at-def-implicit-def.ll
@@ -9,24 +9,24 @@ define amdgpu_kernel void @blender_no_live_segment_at_def_error(<4 x float> %ext
 ; CHECK-NEXT:    s_addc_u32 s13, s13, 0
 ; CHECK-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
 ; CHECK-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
-; CHECK-NEXT:    s_load_dwordx8 s[36:43], s[8:9], 0x0
+; CHECK-NEXT:    s_load_dwordx8 s[48:55], s[8:9], 0x0
 ; CHECK-NEXT:    s_add_u32 s0, s0, s17
 ; CHECK-NEXT:    s_addc_u32 s1, s1, 0
 ; CHECK-NEXT:    s_mov_b32 s12, 0
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
-; CHECK-NEXT:    s_cmp_lg_u32 s40, 0
+; CHECK-NEXT:    s_cmp_lg_u32 s52, 0
 ; CHECK-NEXT:    s_cbranch_scc1 .LBB0_8
 ; CHECK-NEXT:  ; %bb.1: ; %if.end13.i.i
-; CHECK-NEXT:    s_cmp_eq_u32 s42, 0
+; CHECK-NEXT:    s_cmp_eq_u32 s54, 0
 ; CHECK-NEXT:    s_cbranch_scc1 .LBB0_4
 ; CHECK-NEXT:  ; %bb.2: ; %if.else251.i.i
-; CHECK-NEXT:    s_cmp_lg_u32 s43, 0
+; CHECK-NEXT:    s_cmp_lg_u32 s55, 0
 ; CHECK-NEXT:    s_mov_b32 s17, 0
 ; CHECK-NEXT:    s_cselect_b32 s12, -1, 0
 ; CHECK-NEXT:    s_and_b32 vcc_lo, exec_lo, s12
 ; CHECK-NEXT:    s_cbranch_vccz .LBB0_5
 ; CHECK-NEXT:  ; %bb.3:
-; CHECK-NEXT:    s_mov_b32 s36, 0
+; CHECK-NEXT:    s_mov_b32 s48, 0
 ; CHECK-NEXT:    s_andn2_b32 vcc_lo, exec_lo, s12
 ; CHECK-NEXT:    s_cbranch_vccz .LBB0_6
 ; CHECK-NEXT:    s_branch .LBB0_7
@@ -34,16 +34,16 @@ define amdgpu_kernel void @blender_no_live_segment_at_def_error(<4 x float> %ext
 ; CHECK-NEXT:    s_mov_b32 s14, s12
 ; CHECK-NEXT:    s_mov_b32 s15, s12
 ; CHECK-NEXT:    s_mov_b32 s13, s12
-; CHECK-NEXT:    s_mov_b64 s[38:39], s[14:15]
-; CHECK-NEXT:    s_mov_b64 s[36:37], s[12:13]
+; CHECK-NEXT:    s_mov_b64 s[50:51], s[14:15]
+; CHECK-NEXT:    s_mov_b64 s[48:49], s[12:13]
 ; CHECK-NEXT:    s_branch .LBB0_7
 ; CHECK-NEXT:  .LBB0_5: ; %if.then263.i.i
-; CHECK-NEXT:    v_cmp_lt_f32_e64 s12, s41, 0
-; CHECK-NEXT:    s_mov_b32 s36, 1.0
+; CHECK-NEXT:    v_cmp_lt_f32_e64 s12, s53, 0
+; CHECK-NEXT:    s_mov_b32 s48, 1.0
 ; CHECK-NEXT:    s_mov_b32 s17, 0x7fc00000
-; CHECK-NEXT:    s_mov_b32 s37, s36
-; CHECK-NEXT:    s_mov_b32 s38, s36
-; CHECK-NEXT:    s_mov_b32 s39, s36
+; CHECK-NEXT:    s_mov_b32 s49, s48
+; CHECK-NEXT:    s_mov_b32 s50, s48
+; CHECK-NEXT:    s_mov_b32 s51, s48
 ; CHECK-NEXT:    s_andn2_b32 vcc_lo, exec_lo, s12
 ; CHECK-NEXT:    s_cbranch_vccnz .LBB0_7
 ; CHECK-NEXT:  .LBB0_6: ; %if.end273.i.i
@@ -55,7 +55,7 @@ define amdgpu_kernel void @blender_no_live_segment_at_def_error(<4 x float> %ext
 ; CHECK-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
 ; CHECK-NEXT:    s_load_dwordx2 s[18:19], s[18:19], 0x0
 ; CHECK-NEXT:    v_lshlrev_b32_e32 v3, 10, v1
-; CHECK-NEXT:    v_add_f32_e64 v1, s17, s36
+; CHECK-NEXT:    v_add_f32_e64 v1, s17, s48
 ; CHECK-NEXT:    s_mov_b64 s[34:35], s[8:9]
 ; CHECK-NEXT:    s_mov_b64 s[8:9], s[12:13]
 ; CHECK-NEXT:    s_mov_b32 s12, s14
@@ -65,13 +65,13 @@ define amdgpu_kernel void @blender_no_live_segment_at_def_error(<4 x float> %ext
 ; CHECK-NEXT:    v_mov_b32_e32 v2, 0
 ; CHECK-NEXT:    s_mov_b32 s13, s15
 ; CHECK-NEXT:    s_mov_b32 s14, s16
-; CHECK-NEXT:    s_mov_b32 s36, 0
+; CHECK-NEXT:    s_mov_b32 s48, 0
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_swappc_b64 s[30:31], s[18:19]
 ; CHECK-NEXT:    s_mov_b64 s[8:9], s[34:35]
-; CHECK-NEXT:    s_mov_b32 s37, s36
-; CHECK-NEXT:    s_mov_b32 s38, s36
-; CHECK-NEXT:    s_mov_b32 s39, s36
+; CHECK-NEXT:    s_mov_b32 s49, s48
+; CHECK-NEXT:    s_mov_b32 s50, s48
+; CHECK-NEXT:    s_mov_b32 s51, s48
 ; CHECK-NEXT:  .LBB0_7: ; %if.end294.i.i
 ; CHECK-NEXT:    v_mov_b32_e32 v0, 0
 ; CHECK-NEXT:    buffer_store_dword v0, off, s[0:3], 0 offset:12
@@ -80,11 +80,11 @@ define amdgpu_kernel void @blender_no_live_segment_at_def_error(<4 x float> %ext
 ; CHECK-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; CHECK-NEXT:  .LBB0_8: ; %kernel_direct_lighting.exit
 ; CHECK-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x20
-; CHECK-NEXT:    v_mov_b32_e32 v0, s36
+; CHECK-NEXT:    v_mov_b32_e32 v0, s48
 ; CHECK-NEXT:    v_mov_b32_e32 v4, 0
-; CHECK-NEXT:    v_mov_b32_e32 v1, s37
-; CHECK-NEXT:    v_mov_b32_e32 v2, s38
-; CHECK-NEXT:    v_mov_b32_e32 v3, s39
+; CHECK-NEXT:    v_mov_b32_e32 v1, s49
+; CHECK-NEXT:    v_mov_b32_e32 v2, s50
+; CHECK-NEXT:    v_mov_b32_e32 v3, s51
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    global_store_dwordx4 v4, v[0:3], s[4:5]
 ; CHECK-NEXT:    s_endpgm

diff  --git a/llvm/test/CodeGen/AMDGPU/branch-folding-implicit-def-subreg.ll b/llvm/test/CodeGen/AMDGPU/branch-folding-implicit-def-subreg.ll
index 98136347ab702..e43a021802644 100644
--- a/llvm/test/CodeGen/AMDGPU/branch-folding-implicit-def-subreg.ll
+++ b/llvm/test/CodeGen/AMDGPU/branch-folding-implicit-def-subreg.ll
@@ -17,7 +17,7 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64
   ; GFX90A-NEXT:   early-clobber renamable $sgpr20_sgpr21_sgpr22_sgpr23 = S_LOAD_DWORDX4_IMM_ec renamable $sgpr8_sgpr9, 24, 0 :: (dereferenceable invariant load (s128) from %ir.arg6.kernarg.offset.align.down, align 8, addrspace 4)
   ; GFX90A-NEXT:   renamable $sgpr17 = S_LOAD_DWORD_IMM renamable $sgpr8_sgpr9, 40, 0 :: (dereferenceable invariant load (s32) from %ir.arg6.kernarg.offset.align.down + 16, align 8, addrspace 4)
   ; GFX90A-NEXT:   renamable $sgpr24_sgpr25_sgpr26_sgpr27 = S_LOAD_DWORDX4_IMM renamable $sgpr8_sgpr9, 0, 0 :: (dereferenceable invariant load (s128) from %ir.arg.kernarg.offset1, addrspace 4)
-  ; GFX90A-NEXT:   renamable $sgpr56_sgpr57 = S_LOAD_DWORDX2_IMM renamable $sgpr8_sgpr9, 16, 0 :: (dereferenceable invariant load (s64) from %ir.arg.kernarg.offset1 + 16, align 16, addrspace 4)
+  ; GFX90A-NEXT:   renamable $sgpr44_sgpr45 = S_LOAD_DWORDX2_IMM renamable $sgpr8_sgpr9, 16, 0 :: (dereferenceable invariant load (s64) from %ir.arg.kernarg.offset1 + 16, align 16, addrspace 4)
   ; GFX90A-NEXT:   S_BITCMP1_B32 renamable $sgpr33, 0, implicit-def $scc
   ; GFX90A-NEXT:   renamable $sgpr12_sgpr13 = S_CSELECT_B64 -1, 0, implicit killed $scc
   ; GFX90A-NEXT:   renamable $sgpr34_sgpr35 = S_MOV_B64 -1
@@ -33,7 +33,7 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT: bb.1.bb103:
   ; GFX90A-NEXT:   successors: %bb.58(0x40000000), %bb.2(0x40000000)
-  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $sgpr33, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr56_sgpr57:0x000000000000000F, $sgpr20_sgpr21_sgpr22_sgpr23:0x00000000000000FF, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000FF, $vgpr2_vgpr3:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
+  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $sgpr33, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr44_sgpr45:0x000000000000000F, $sgpr20_sgpr21_sgpr22_sgpr23:0x00000000000000FF, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000FF, $vgpr2_vgpr3:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT:   renamable $sgpr34_sgpr35 = S_MOV_B64 0
   ; GFX90A-NEXT:   renamable $vcc = S_AND_B64 $exec, renamable $sgpr30_sgpr31, implicit-def dead $scc
@@ -41,7 +41,7 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT: bb.2:
   ; GFX90A-NEXT:   successors: %bb.3(0x80000000)
-  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr33, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8, $sgpr9, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr56, $sgpr57, $sgpr20_sgpr21_sgpr22, $sgpr24_sgpr25_sgpr26, $sgpr26_sgpr27, $vgpr2, $vgpr3
+  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr33, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8, $sgpr9, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr44, $sgpr45, $sgpr20_sgpr21_sgpr22, $sgpr24_sgpr25_sgpr26, $sgpr26_sgpr27, $vgpr2, $vgpr3
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT:   renamable $sgpr17 = IMPLICIT_DEF
   ; GFX90A-NEXT:   renamable $sgpr23 = IMPLICIT_DEF
@@ -54,7 +54,7 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT: bb.3.Flow17:
   ; GFX90A-NEXT:   successors: %bb.4(0x40000000), %bb.57(0x40000000)
-  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $sgpr23, $sgpr33, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr56_sgpr57:0x000000000000000F, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000FF, $vgpr2_vgpr3:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr18_vgpr19:0x000000000000000F, $vgpr20_vgpr21:0x000000000000000F, $vgpr22_vgpr23:0x000000000000000F, $vgpr24_vgpr25:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
+  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $sgpr23, $sgpr33, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr44_sgpr45:0x000000000000000F, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000FF, $vgpr2_vgpr3:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr18_vgpr19:0x000000000000000F, $vgpr20_vgpr21:0x000000000000000F, $vgpr22_vgpr23:0x000000000000000F, $vgpr24_vgpr25:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT:   renamable $vgpr30 = V_AND_B32_e32 1023, $vgpr31, implicit $exec
   ; GFX90A-NEXT:   renamable $vcc = S_AND_B64 $exec, killed renamable $sgpr34_sgpr35, implicit-def dead $scc
@@ -62,7 +62,7 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT: bb.4.bb15:
   ; GFX90A-NEXT:   successors: %bb.35(0x40000000), %bb.5(0x40000000)
-  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr33, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr36_sgpr37, $sgpr56_sgpr57:0x000000000000000F, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000FF, $vgpr2_vgpr3:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr18_sgpr19
+  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr33, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr36_sgpr37, $sgpr44_sgpr45:0x000000000000000F, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000FF, $vgpr2_vgpr3:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr18_sgpr19
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT:   renamable $vgpr0_vgpr1 = V_LSHLREV_B64_e64 2, $vgpr2_vgpr3, implicit $exec
   ; GFX90A-NEXT:   renamable $vgpr4 = COPY renamable $sgpr25, implicit $exec
@@ -79,15 +79,15 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64
   ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr36_sgpr37, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr18_sgpr19
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT:   renamable $sgpr34_sgpr35 = S_MOV_B64 -1
-  ; GFX90A-NEXT:   renamable $sgpr58_sgpr59 = S_MOV_B64 0
+  ; GFX90A-NEXT:   renamable $sgpr46_sgpr47 = S_MOV_B64 0
+  ; GFX90A-NEXT:   renamable $sgpr42_sgpr43 = S_MOV_B64 0
+  ; GFX90A-NEXT:   renamable $sgpr40_sgpr41 = S_MOV_B64 0
+  ; GFX90A-NEXT:   renamable $sgpr66_sgpr67 = S_MOV_B64 0
+  ; GFX90A-NEXT:   renamable $sgpr64_sgpr65 = S_MOV_B64 0
   ; GFX90A-NEXT:   renamable $sgpr54_sgpr55 = S_MOV_B64 0
   ; GFX90A-NEXT:   renamable $sgpr52_sgpr53 = S_MOV_B64 0
   ; GFX90A-NEXT:   renamable $sgpr50_sgpr51 = S_MOV_B64 0
   ; GFX90A-NEXT:   renamable $sgpr48_sgpr49 = S_MOV_B64 0
-  ; GFX90A-NEXT:   renamable $sgpr46_sgpr47 = S_MOV_B64 0
-  ; GFX90A-NEXT:   renamable $sgpr44_sgpr45 = S_MOV_B64 0
-  ; GFX90A-NEXT:   renamable $sgpr42_sgpr43 = S_MOV_B64 0
-  ; GFX90A-NEXT:   renamable $sgpr40_sgpr41 = S_MOV_B64 0
   ; GFX90A-NEXT:   renamable $sgpr38_sgpr39 = S_MOV_B64 0
   ; GFX90A-NEXT:   renamable $vgpr8_vgpr9 = IMPLICIT_DEF
   ; GFX90A-NEXT:   renamable $vgpr6_vgpr7 = IMPLICIT_DEF
@@ -111,7 +111,7 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT: bb.6.Flow20:
   ; GFX90A-NEXT:   successors: %bb.7(0x80000000)
-  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $vgpr15, $vgpr17, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr58_sgpr59, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
+  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $vgpr15, $vgpr17, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr64_sgpr65, $sgpr66_sgpr67, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT:   renamable $vgpr19 = COPY renamable $sgpr17, implicit $exec
   ; GFX90A-NEXT:   renamable $vgpr18 = COPY $sgpr17, implicit $exec
@@ -124,15 +124,15 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT: bb.7.Flow19:
   ; GFX90A-NEXT:   successors: %bb.62(0x40000000), %bb.8(0x40000000)
-  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr15, $vgpr17, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr58_sgpr59, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr18_vgpr19:0x000000000000000F, $vgpr20_vgpr21:0x000000000000000F, $vgpr22_vgpr23:0x000000000000000F, $vgpr24_vgpr25:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
+  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr15, $vgpr17, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr64_sgpr65, $sgpr66_sgpr67, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr18_vgpr19:0x000000000000000F, $vgpr20_vgpr21:0x000000000000000F, $vgpr22_vgpr23:0x000000000000000F, $vgpr24_vgpr25:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
   ; GFX90A-NEXT: {{  $}}
-  ; GFX90A-NEXT:   renamable $sgpr56_sgpr57 = S_MOV_B64 0
+  ; GFX90A-NEXT:   renamable $sgpr68_sgpr69 = S_MOV_B64 0
   ; GFX90A-NEXT:   $sgpr24_sgpr25 = S_AND_SAVEEXEC_B64 $sgpr36_sgpr37, implicit-def $exec, implicit-def $scc, implicit $exec
   ; GFX90A-NEXT:   S_CBRANCH_EXECNZ %bb.62, implicit $exec
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT: bb.8.Flow32:
   ; GFX90A-NEXT:   successors: %bb.9(0x40000000), %bb.10(0x40000000)
-  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr56_sgpr57, $sgpr58_sgpr59, $vgpr0_vgpr1:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
+  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr64_sgpr65, $sgpr66_sgpr67, $sgpr68_sgpr69, $vgpr0_vgpr1:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT:   $exec = S_OR_B64 $exec, killed renamable $sgpr24_sgpr25, implicit-def $scc
   ; GFX90A-NEXT:   $sgpr12_sgpr13 = S_AND_SAVEEXEC_B64 $sgpr18_sgpr19, implicit-def $exec, implicit-def $scc, implicit $exec
@@ -141,58 +141,58 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT: bb.9.bb89:
   ; GFX90A-NEXT:   successors: %bb.10(0x80000000)
-  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr56_sgpr57, $sgpr58_sgpr59, $vgpr0_vgpr1:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
+  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr64_sgpr65, $sgpr66_sgpr67, $sgpr68_sgpr69, $vgpr0_vgpr1:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT:   BUFFER_STORE_DWORD_OFFSET renamable $vgpr9, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 4, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null` + 4, basealign 8, addrspace 5)
   ; GFX90A-NEXT:   BUFFER_STORE_DWORD_OFFSET killed renamable $vgpr8, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null`, align 8, addrspace 5)
-  ; GFX90A-NEXT:   renamable $sgpr56_sgpr57 = S_OR_B64 killed renamable $sgpr56_sgpr57, $exec, implicit-def dead $scc
+  ; GFX90A-NEXT:   renamable $sgpr68_sgpr69 = S_OR_B64 killed renamable $sgpr68_sgpr69, $exec, implicit-def dead $scc
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT: bb.10.Flow33:
   ; GFX90A-NEXT:   successors: %bb.11(0x40000000), %bb.12(0x40000000)
-  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr56_sgpr57, $sgpr58_sgpr59, $vgpr0_vgpr1:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
+  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr64_sgpr65, $sgpr66_sgpr67, $sgpr68_sgpr69, $vgpr0_vgpr1:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT:   $exec = S_OR_B64 $exec, killed renamable $sgpr12_sgpr13, implicit-def $scc
-  ; GFX90A-NEXT:   $sgpr12_sgpr13 = S_AND_SAVEEXEC_B64 $sgpr58_sgpr59, implicit-def $exec, implicit-def $scc, implicit $exec
+  ; GFX90A-NEXT:   $sgpr12_sgpr13 = S_AND_SAVEEXEC_B64 $sgpr46_sgpr47, implicit-def $exec, implicit-def $scc, implicit $exec
   ; GFX90A-NEXT:   renamable $sgpr12_sgpr13 = S_XOR_B64 $exec, killed renamable $sgpr12_sgpr13, implicit-def dead $scc
   ; GFX90A-NEXT:   S_CBRANCH_EXECZ %bb.12, implicit $exec
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT: bb.11.bb84:
   ; GFX90A-NEXT:   successors: %bb.12(0x80000000)
-  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr56_sgpr57, $vgpr0_vgpr1:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
+  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr64_sgpr65, $sgpr66_sgpr67, $sgpr68_sgpr69, $vgpr0_vgpr1:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT:   BUFFER_STORE_DWORD_OFFSET renamable $vgpr7, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 4, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null` + 4, basealign 8, addrspace 5)
   ; GFX90A-NEXT:   BUFFER_STORE_DWORD_OFFSET killed renamable $vgpr6, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null`, align 8, addrspace 5)
-  ; GFX90A-NEXT:   renamable $sgpr56_sgpr57 = S_OR_B64 killed renamable $sgpr56_sgpr57, $exec, implicit-def dead $scc
+  ; GFX90A-NEXT:   renamable $sgpr68_sgpr69 = S_OR_B64 killed renamable $sgpr68_sgpr69, $exec, implicit-def dead $scc
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT: bb.12.Flow34:
   ; GFX90A-NEXT:   successors: %bb.13(0x40000000), %bb.14(0x40000000)
-  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr56_sgpr57, $vgpr0_vgpr1:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
+  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr64_sgpr65, $sgpr66_sgpr67, $sgpr68_sgpr69, $vgpr0_vgpr1:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT:   $exec = S_OR_B64 $exec, killed renamable $sgpr12_sgpr13, implicit-def $scc
-  ; GFX90A-NEXT:   $sgpr12_sgpr13 = S_AND_SAVEEXEC_B64 $sgpr54_sgpr55, implicit-def $exec, implicit-def $scc, implicit $exec
+  ; GFX90A-NEXT:   $sgpr12_sgpr13 = S_AND_SAVEEXEC_B64 $sgpr42_sgpr43, implicit-def $exec, implicit-def $scc, implicit $exec
   ; GFX90A-NEXT:   renamable $sgpr12_sgpr13 = S_XOR_B64 $exec, killed renamable $sgpr12_sgpr13, implicit-def dead $scc
   ; GFX90A-NEXT:   S_CBRANCH_EXECZ %bb.14, implicit $exec
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT: bb.13.bb79:
   ; GFX90A-NEXT:   successors: %bb.14(0x80000000)
-  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr56_sgpr57, $vgpr0_vgpr1:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
+  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr64_sgpr65, $sgpr66_sgpr67, $sgpr68_sgpr69, $vgpr0_vgpr1:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT:   BUFFER_STORE_DWORD_OFFSET renamable $vgpr5, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 4, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null` + 4, basealign 8, addrspace 5)
   ; GFX90A-NEXT:   BUFFER_STORE_DWORD_OFFSET killed renamable $vgpr4, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null`, align 8, addrspace 5)
-  ; GFX90A-NEXT:   renamable $sgpr56_sgpr57 = S_OR_B64 killed renamable $sgpr56_sgpr57, $exec, implicit-def dead $scc
+  ; GFX90A-NEXT:   renamable $sgpr68_sgpr69 = S_OR_B64 killed renamable $sgpr68_sgpr69, $exec, implicit-def dead $scc
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT: bb.14.Flow35:
   ; GFX90A-NEXT:   successors: %bb.15(0x40000000), %bb.16(0x40000000)
-  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr56_sgpr57, $vgpr0_vgpr1:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
+  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr64_sgpr65, $sgpr66_sgpr67, $sgpr68_sgpr69, $vgpr0_vgpr1:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT:   $exec = S_OR_B64 $exec, killed renamable $sgpr12_sgpr13, implicit-def $scc
-  ; GFX90A-NEXT:   $sgpr12_sgpr13 = S_AND_SAVEEXEC_B64 $sgpr52_sgpr53, implicit-def $exec, implicit-def $scc, implicit $exec
+  ; GFX90A-NEXT:   $sgpr12_sgpr13 = S_AND_SAVEEXEC_B64 $sgpr40_sgpr41, implicit-def $exec, implicit-def $scc, implicit $exec
   ; GFX90A-NEXT:   renamable $sgpr36_sgpr37 = S_XOR_B64 $exec, killed renamable $sgpr12_sgpr13, implicit-def dead $scc
   ; GFX90A-NEXT:   S_CBRANCH_EXECZ %bb.16, implicit $exec
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT: bb.15.bb72:
   ; GFX90A-NEXT:   successors: %bb.16(0x80000000)
-  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr56_sgpr57, $vgpr0_vgpr1:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
+  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr64_sgpr65, $sgpr66_sgpr67, $sgpr68_sgpr69, $vgpr0_vgpr1:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT:   renamable $sgpr8 = S_ADD_U32 renamable $sgpr8, 48, implicit-def $scc
   ; GFX90A-NEXT:   renamable $sgpr9 = S_ADDC_U32 killed renamable $sgpr9, 0, implicit-def dead $scc, implicit killed $scc
@@ -202,113 +202,113 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64
   ; GFX90A-NEXT:   $sgpr13 = COPY killed renamable $sgpr15
   ; GFX90A-NEXT:   $sgpr14 = COPY killed renamable $sgpr16
   ; GFX90A-NEXT:   dead $sgpr30_sgpr31 = SI_CALL killed renamable $sgpr18_sgpr19, @f2, csr_amdgpu_gfx90ainsts, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit undef $sgpr15, implicit $vgpr31, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $vgpr0, implicit $vgpr1
-  ; GFX90A-NEXT:   renamable $sgpr56_sgpr57 = S_OR_B64 killed renamable $sgpr56_sgpr57, $exec, implicit-def dead $scc
+  ; GFX90A-NEXT:   renamable $sgpr68_sgpr69 = S_OR_B64 killed renamable $sgpr68_sgpr69, $exec, implicit-def dead $scc
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT: bb.16.Flow36:
   ; GFX90A-NEXT:   successors: %bb.17(0x40000000), %bb.18(0x40000000)
-  ; GFX90A-NEXT:   liveins: $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr56_sgpr57, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
+  ; GFX90A-NEXT:   liveins: $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr64_sgpr65, $sgpr66_sgpr67, $sgpr68_sgpr69, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT:   $exec = S_OR_B64 $exec, killed renamable $sgpr36_sgpr37, implicit-def $scc
-  ; GFX90A-NEXT:   $sgpr4_sgpr5 = S_AND_SAVEEXEC_B64 $sgpr50_sgpr51, implicit-def $exec, implicit-def $scc, implicit $exec
+  ; GFX90A-NEXT:   $sgpr4_sgpr5 = S_AND_SAVEEXEC_B64 $sgpr66_sgpr67, implicit-def $exec, implicit-def $scc, implicit $exec
   ; GFX90A-NEXT:   renamable $sgpr4_sgpr5 = S_XOR_B64 $exec, killed renamable $sgpr4_sgpr5, implicit-def dead $scc
   ; GFX90A-NEXT:   S_CBRANCH_EXECZ %bb.18, implicit $exec
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT: bb.17.bb67:
   ; GFX90A-NEXT:   successors: %bb.18(0x80000000)
-  ; GFX90A-NEXT:   liveins: $sgpr4_sgpr5, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr56_sgpr57, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
+  ; GFX90A-NEXT:   liveins: $sgpr4_sgpr5, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr64_sgpr65, $sgpr68_sgpr69, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT:   BUFFER_STORE_DWORD_OFFSET renamable $vgpr47, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 4, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null` + 4, basealign 8, addrspace 5)
   ; GFX90A-NEXT:   BUFFER_STORE_DWORD_OFFSET killed renamable $vgpr46, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null`, align 8, addrspace 5)
-  ; GFX90A-NEXT:   renamable $sgpr56_sgpr57 = S_OR_B64 killed renamable $sgpr56_sgpr57, $exec, implicit-def dead $scc
+  ; GFX90A-NEXT:   renamable $sgpr68_sgpr69 = S_OR_B64 killed renamable $sgpr68_sgpr69, $exec, implicit-def dead $scc
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT: bb.18.Flow37:
   ; GFX90A-NEXT:   successors: %bb.19(0x40000000), %bb.20(0x40000000)
-  ; GFX90A-NEXT:   liveins: $sgpr4_sgpr5, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr56_sgpr57, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
+  ; GFX90A-NEXT:   liveins: $sgpr4_sgpr5, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr64_sgpr65, $sgpr68_sgpr69, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT:   $exec = S_OR_B64 $exec, killed renamable $sgpr4_sgpr5, implicit-def $scc
-  ; GFX90A-NEXT:   $sgpr4_sgpr5 = S_AND_SAVEEXEC_B64 $sgpr48_sgpr49, implicit-def $exec, implicit-def $scc, implicit $exec
+  ; GFX90A-NEXT:   $sgpr4_sgpr5 = S_AND_SAVEEXEC_B64 $sgpr64_sgpr65, implicit-def $exec, implicit-def $scc, implicit $exec
   ; GFX90A-NEXT:   renamable $sgpr4_sgpr5 = S_XOR_B64 $exec, killed renamable $sgpr4_sgpr5, implicit-def dead $scc
   ; GFX90A-NEXT:   S_CBRANCH_EXECZ %bb.20, implicit $exec
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT: bb.19.bb62:
   ; GFX90A-NEXT:   successors: %bb.20(0x80000000)
-  ; GFX90A-NEXT:   liveins: $sgpr4_sgpr5, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr56_sgpr57, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
+  ; GFX90A-NEXT:   liveins: $sgpr4_sgpr5, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr68_sgpr69, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT:   BUFFER_STORE_DWORD_OFFSET renamable $vgpr63, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 4, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null` + 4, basealign 8, addrspace 5)
   ; GFX90A-NEXT:   BUFFER_STORE_DWORD_OFFSET killed renamable $vgpr62, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null`, align 8, addrspace 5)
-  ; GFX90A-NEXT:   renamable $sgpr56_sgpr57 = S_OR_B64 killed renamable $sgpr56_sgpr57, $exec, implicit-def dead $scc
+  ; GFX90A-NEXT:   renamable $sgpr68_sgpr69 = S_OR_B64 killed renamable $sgpr68_sgpr69, $exec, implicit-def dead $scc
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT: bb.20.Flow38:
   ; GFX90A-NEXT:   successors: %bb.21(0x40000000), %bb.22(0x40000000)
-  ; GFX90A-NEXT:   liveins: $sgpr4_sgpr5, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr56_sgpr57, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
+  ; GFX90A-NEXT:   liveins: $sgpr4_sgpr5, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr68_sgpr69, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT:   $exec = S_OR_B64 $exec, killed renamable $sgpr4_sgpr5, implicit-def $scc
-  ; GFX90A-NEXT:   $sgpr4_sgpr5 = S_AND_SAVEEXEC_B64 $sgpr46_sgpr47, implicit-def $exec, implicit-def $scc, implicit $exec
+  ; GFX90A-NEXT:   $sgpr4_sgpr5 = S_AND_SAVEEXEC_B64 $sgpr54_sgpr55, implicit-def $exec, implicit-def $scc, implicit $exec
   ; GFX90A-NEXT:   renamable $sgpr4_sgpr5 = S_XOR_B64 $exec, killed renamable $sgpr4_sgpr5, implicit-def dead $scc
   ; GFX90A-NEXT:   S_CBRANCH_EXECZ %bb.22, implicit $exec
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT: bb.21.bb54:
   ; GFX90A-NEXT:   successors: %bb.22(0x80000000)
-  ; GFX90A-NEXT:   liveins: $sgpr4_sgpr5, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr56_sgpr57, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
+  ; GFX90A-NEXT:   liveins: $sgpr4_sgpr5, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr68_sgpr69, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT:   BUFFER_STORE_DWORD_OFFSET renamable $vgpr61, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 4, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null` + 4, basealign 8, addrspace 5)
   ; GFX90A-NEXT:   BUFFER_STORE_DWORD_OFFSET killed renamable $vgpr60, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null`, align 8, addrspace 5)
-  ; GFX90A-NEXT:   renamable $sgpr56_sgpr57 = S_OR_B64 killed renamable $sgpr56_sgpr57, $exec, implicit-def dead $scc
+  ; GFX90A-NEXT:   renamable $sgpr68_sgpr69 = S_OR_B64 killed renamable $sgpr68_sgpr69, $exec, implicit-def dead $scc
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT: bb.22.Flow39:
   ; GFX90A-NEXT:   successors: %bb.23(0x40000000), %bb.24(0x40000000)
-  ; GFX90A-NEXT:   liveins: $sgpr4_sgpr5, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr56_sgpr57, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
+  ; GFX90A-NEXT:   liveins: $sgpr4_sgpr5, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr68_sgpr69, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT:   $exec = S_OR_B64 $exec, killed renamable $sgpr4_sgpr5, implicit-def $scc
-  ; GFX90A-NEXT:   $sgpr4_sgpr5 = S_AND_SAVEEXEC_B64 $sgpr44_sgpr45, implicit-def $exec, implicit-def $scc, implicit $exec
+  ; GFX90A-NEXT:   $sgpr4_sgpr5 = S_AND_SAVEEXEC_B64 $sgpr52_sgpr53, implicit-def $exec, implicit-def $scc, implicit $exec
   ; GFX90A-NEXT:   renamable $sgpr4_sgpr5 = S_XOR_B64 $exec, killed renamable $sgpr4_sgpr5, implicit-def dead $scc
   ; GFX90A-NEXT:   S_CBRANCH_EXECZ %bb.24, implicit $exec
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT: bb.23.bb47:
   ; GFX90A-NEXT:   successors: %bb.24(0x80000000)
-  ; GFX90A-NEXT:   liveins: $sgpr4_sgpr5, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr56_sgpr57, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
+  ; GFX90A-NEXT:   liveins: $sgpr4_sgpr5, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr68_sgpr69, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT:   BUFFER_STORE_DWORD_OFFSET renamable $vgpr59, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 4, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null` + 4, basealign 8, addrspace 5)
   ; GFX90A-NEXT:   BUFFER_STORE_DWORD_OFFSET killed renamable $vgpr58, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null`, align 8, addrspace 5)
-  ; GFX90A-NEXT:   renamable $sgpr56_sgpr57 = S_OR_B64 killed renamable $sgpr56_sgpr57, $exec, implicit-def dead $scc
+  ; GFX90A-NEXT:   renamable $sgpr68_sgpr69 = S_OR_B64 killed renamable $sgpr68_sgpr69, $exec, implicit-def dead $scc
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT: bb.24.Flow40:
   ; GFX90A-NEXT:   successors: %bb.25(0x40000000), %bb.26(0x40000000)
-  ; GFX90A-NEXT:   liveins: $sgpr4_sgpr5, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr56_sgpr57, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
+  ; GFX90A-NEXT:   liveins: $sgpr4_sgpr5, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr68_sgpr69, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT:   $exec = S_OR_B64 $exec, killed renamable $sgpr4_sgpr5, implicit-def $scc
-  ; GFX90A-NEXT:   $sgpr4_sgpr5 = S_AND_SAVEEXEC_B64 $sgpr42_sgpr43, implicit-def $exec, implicit-def $scc, implicit $exec
+  ; GFX90A-NEXT:   $sgpr4_sgpr5 = S_AND_SAVEEXEC_B64 $sgpr50_sgpr51, implicit-def $exec, implicit-def $scc, implicit $exec
   ; GFX90A-NEXT:   renamable $sgpr4_sgpr5 = S_XOR_B64 $exec, killed renamable $sgpr4_sgpr5, implicit-def dead $scc
   ; GFX90A-NEXT:   S_CBRANCH_EXECZ %bb.26, implicit $exec
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT: bb.25.bb40:
   ; GFX90A-NEXT:   successors: %bb.26(0x80000000)
-  ; GFX90A-NEXT:   liveins: $sgpr4_sgpr5, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr56_sgpr57, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
+  ; GFX90A-NEXT:   liveins: $sgpr4_sgpr5, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr48_sgpr49, $sgpr68_sgpr69, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT:   BUFFER_STORE_DWORD_OFFSET renamable $vgpr57, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 4, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null` + 4, basealign 8, addrspace 5)
   ; GFX90A-NEXT:   BUFFER_STORE_DWORD_OFFSET killed renamable $vgpr56, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null`, align 8, addrspace 5)
-  ; GFX90A-NEXT:   renamable $sgpr56_sgpr57 = S_OR_B64 killed renamable $sgpr56_sgpr57, $exec, implicit-def dead $scc
+  ; GFX90A-NEXT:   renamable $sgpr68_sgpr69 = S_OR_B64 killed renamable $sgpr68_sgpr69, $exec, implicit-def dead $scc
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT: bb.26.Flow41:
   ; GFX90A-NEXT:   successors: %bb.27(0x40000000), %bb.28(0x40000000)
-  ; GFX90A-NEXT:   liveins: $sgpr4_sgpr5, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr56_sgpr57, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
+  ; GFX90A-NEXT:   liveins: $sgpr4_sgpr5, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr48_sgpr49, $sgpr68_sgpr69, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT:   $exec = S_OR_B64 $exec, killed renamable $sgpr4_sgpr5, implicit-def $scc
-  ; GFX90A-NEXT:   $sgpr4_sgpr5 = S_AND_SAVEEXEC_B64 $sgpr40_sgpr41, implicit-def $exec, implicit-def $scc, implicit $exec
+  ; GFX90A-NEXT:   $sgpr4_sgpr5 = S_AND_SAVEEXEC_B64 $sgpr48_sgpr49, implicit-def $exec, implicit-def $scc, implicit $exec
   ; GFX90A-NEXT:   renamable $sgpr4_sgpr5 = S_XOR_B64 $exec, killed renamable $sgpr4_sgpr5, implicit-def dead $scc
   ; GFX90A-NEXT:   S_CBRANCH_EXECZ %bb.28, implicit $exec
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT: bb.27.bb33:
   ; GFX90A-NEXT:   successors: %bb.28(0x80000000)
-  ; GFX90A-NEXT:   liveins: $sgpr4_sgpr5, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr56_sgpr57, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
+  ; GFX90A-NEXT:   liveins: $sgpr4_sgpr5, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr68_sgpr69, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT:   BUFFER_STORE_DWORD_OFFSET renamable $vgpr45, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 4, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null` + 4, basealign 8, addrspace 5)
   ; GFX90A-NEXT:   BUFFER_STORE_DWORD_OFFSET killed renamable $vgpr44, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null`, align 8, addrspace 5)
-  ; GFX90A-NEXT:   renamable $sgpr56_sgpr57 = S_OR_B64 killed renamable $sgpr56_sgpr57, $exec, implicit-def dead $scc
+  ; GFX90A-NEXT:   renamable $sgpr68_sgpr69 = S_OR_B64 killed renamable $sgpr68_sgpr69, $exec, implicit-def dead $scc
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT: bb.28.Flow42:
   ; GFX90A-NEXT:   successors: %bb.34(0x40000000), %bb.29(0x40000000)
-  ; GFX90A-NEXT:   liveins: $sgpr4_sgpr5, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr56_sgpr57, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
+  ; GFX90A-NEXT:   liveins: $sgpr4_sgpr5, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr68_sgpr69, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT:   $exec = S_OR_B64 $exec, killed renamable $sgpr4_sgpr5, implicit-def $scc
   ; GFX90A-NEXT:   $sgpr4_sgpr5 = S_AND_SAVEEXEC_B64 $sgpr38_sgpr39, implicit-def $exec, implicit-def $scc, implicit $exec
@@ -317,7 +317,7 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT: bb.29.Flow43:
   ; GFX90A-NEXT:   successors: %bb.30(0x40000000), %bb.31(0x40000000)
-  ; GFX90A-NEXT:   liveins: $sgpr4_sgpr5, $sgpr34_sgpr35, $sgpr56_sgpr57, $vgpr40_vgpr41:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
+  ; GFX90A-NEXT:   liveins: $sgpr4_sgpr5, $sgpr34_sgpr35, $sgpr68_sgpr69, $vgpr40_vgpr41:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT:   $exec = S_OR_B64 $exec, killed renamable $sgpr4_sgpr5, implicit-def $scc
   ; GFX90A-NEXT:   $vcc = S_ANDN2_B64 $exec, killed renamable $sgpr34_sgpr35, implicit-def dead $scc
@@ -325,17 +325,17 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT: bb.30.bb19:
   ; GFX90A-NEXT:   successors: %bb.31(0x80000000)
-  ; GFX90A-NEXT:   liveins: $sgpr56_sgpr57, $vgpr40_vgpr41:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
+  ; GFX90A-NEXT:   liveins: $sgpr68_sgpr69, $vgpr40_vgpr41:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT:   BUFFER_STORE_DWORD_OFFSET renamable $vgpr41, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 4, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null` + 4, basealign 8, addrspace 5)
   ; GFX90A-NEXT:   BUFFER_STORE_DWORD_OFFSET killed renamable $vgpr40, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null`, align 8, addrspace 5)
-  ; GFX90A-NEXT:   renamable $sgpr56_sgpr57 = S_OR_B64 killed renamable $sgpr56_sgpr57, $exec, implicit-def dead $scc
+  ; GFX90A-NEXT:   renamable $sgpr68_sgpr69 = S_OR_B64 killed renamable $sgpr68_sgpr69, $exec, implicit-def dead $scc
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT: bb.31.Flow44:
   ; GFX90A-NEXT:   successors: %bb.32(0x40000000), %bb.33(0x40000000)
-  ; GFX90A-NEXT:   liveins: $sgpr56_sgpr57, $sgpr0_sgpr1_sgpr2_sgpr3
+  ; GFX90A-NEXT:   liveins: $sgpr68_sgpr69, $sgpr0_sgpr1_sgpr2_sgpr3
   ; GFX90A-NEXT: {{  $}}
-  ; GFX90A-NEXT:   $sgpr4_sgpr5 = S_AND_SAVEEXEC_B64 $sgpr56_sgpr57, implicit-def $exec, implicit-def $scc, implicit $exec
+  ; GFX90A-NEXT:   $sgpr4_sgpr5 = S_AND_SAVEEXEC_B64 $sgpr68_sgpr69, implicit-def $exec, implicit-def $scc, implicit $exec
   ; GFX90A-NEXT:   S_CBRANCH_EXECZ %bb.33, implicit $exec
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT: bb.32.UnifiedUnreachableBlock:
@@ -351,32 +351,32 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT: bb.34.bb26:
   ; GFX90A-NEXT:   successors: %bb.29(0x80000000)
-  ; GFX90A-NEXT:   liveins: $sgpr4_sgpr5, $sgpr34_sgpr35, $sgpr56_sgpr57, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
+  ; GFX90A-NEXT:   liveins: $sgpr4_sgpr5, $sgpr34_sgpr35, $sgpr68_sgpr69, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT:   BUFFER_STORE_DWORD_OFFSET renamable $vgpr43, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 4, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null` + 4, basealign 8, addrspace 5)
   ; GFX90A-NEXT:   BUFFER_STORE_DWORD_OFFSET killed renamable $vgpr42, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null`, align 8, addrspace 5)
-  ; GFX90A-NEXT:   renamable $sgpr56_sgpr57 = S_OR_B64 killed renamable $sgpr56_sgpr57, $exec, implicit-def dead $scc
+  ; GFX90A-NEXT:   renamable $sgpr68_sgpr69 = S_OR_B64 killed renamable $sgpr68_sgpr69, $exec, implicit-def dead $scc
   ; GFX90A-NEXT:   S_BRANCH %bb.29
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT: bb.35.bb20:
   ; GFX90A-NEXT:   successors: %bb.37(0x40000000), %bb.36(0x40000000)
-  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr33, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr36_sgpr37, $sgpr56_sgpr57:0x000000000000000F, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr18_sgpr19
+  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr33, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr36_sgpr37, $sgpr44_sgpr45:0x000000000000000F, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr18_sgpr19
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT:   renamable $vgpr0 = GLOBAL_LOAD_SBYTE renamable $vgpr40_vgpr41, 1024, 0, implicit $exec :: (load (s8) from %ir.i21, addrspace 1)
   ; GFX90A-NEXT:   renamable $vgpr42 = V_ADD_CO_U32_e32 1024, $vgpr40, implicit-def $vcc, implicit $exec
   ; GFX90A-NEXT:   renamable $sgpr34_sgpr35 = S_MOV_B64 0
   ; GFX90A-NEXT:   renamable $sgpr38_sgpr39 = S_MOV_B64 -1
-  ; GFX90A-NEXT:   renamable $sgpr58_sgpr59 = S_MOV_B64 0
+  ; GFX90A-NEXT:   renamable $sgpr46_sgpr47 = S_MOV_B64 0
+  ; GFX90A-NEXT:   renamable $sgpr42_sgpr43 = S_MOV_B64 0
+  ; GFX90A-NEXT:   renamable $sgpr40_sgpr41 = S_MOV_B64 0
+  ; GFX90A-NEXT:   renamable $sgpr66_sgpr67 = S_MOV_B64 0
+  ; GFX90A-NEXT:   renamable $sgpr64_sgpr65 = S_MOV_B64 0
   ; GFX90A-NEXT:   renamable $sgpr54_sgpr55 = S_MOV_B64 0
   ; GFX90A-NEXT:   renamable $sgpr52_sgpr53 = S_MOV_B64 0
   ; GFX90A-NEXT:   renamable $sgpr50_sgpr51 = S_MOV_B64 0
-  ; GFX90A-NEXT:   renamable $sgpr48_sgpr49 = S_MOV_B64 0
-  ; GFX90A-NEXT:   renamable $sgpr46_sgpr47 = S_MOV_B64 0
-  ; GFX90A-NEXT:   renamable $sgpr44_sgpr45 = S_MOV_B64 0
-  ; GFX90A-NEXT:   renamable $sgpr42_sgpr43 = S_MOV_B64 0
   ; GFX90A-NEXT:   renamable $vgpr43, dead renamable $vcc = V_ADDC_U32_e64 0, $vgpr41, killed $vcc, 0, implicit $exec
   ; GFX90A-NEXT:   renamable $vcc = V_CMP_LT_I16_e64 0, killed $vgpr0, implicit $exec
-  ; GFX90A-NEXT:   renamable $sgpr40_sgpr41 = S_MOV_B64 0
+  ; GFX90A-NEXT:   renamable $sgpr48_sgpr49 = S_MOV_B64 0
   ; GFX90A-NEXT:   renamable $vgpr8_vgpr9 = IMPLICIT_DEF
   ; GFX90A-NEXT:   renamable $vgpr6_vgpr7 = IMPLICIT_DEF
   ; GFX90A-NEXT:   renamable $vgpr4_vgpr5 = IMPLICIT_DEF
@@ -400,19 +400,22 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT: bb.36.Flow21:
   ; GFX90A-NEXT:   successors: %bb.6(0x80000000)
-  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $vgpr15, $vgpr17, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr58_sgpr59, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
+  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $vgpr15, $vgpr17, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr64_sgpr65, $sgpr66_sgpr67, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT:   $exec = S_OR_B64 $exec, killed renamable $sgpr24_sgpr25, implicit-def $scc
   ; GFX90A-NEXT:   S_BRANCH %bb.6
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT: bb.37.bb27:
   ; GFX90A-NEXT:   successors: %bb.39(0x40000000), %bb.38(0x40000000)
-  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr33, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr56_sgpr57:0x000000000000000F, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr18_sgpr19, $sgpr58_sgpr59, $sgpr54_sgpr55, $sgpr52_sgpr53, $sgpr50_sgpr51, $sgpr48_sgpr49, $sgpr46_sgpr47, $sgpr44_sgpr45, $sgpr42_sgpr43
+  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr33, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr44_sgpr45:0x000000000000000F, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr18_sgpr19, $sgpr46_sgpr47, $sgpr42_sgpr43, $sgpr54_sgpr55, $sgpr52_sgpr53, $sgpr64_sgpr65, $sgpr50_sgpr51, $sgpr66_sgpr67
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT:   renamable $vgpr0 = GLOBAL_LOAD_UBYTE renamable $vgpr40_vgpr41, 2048, 0, implicit $exec :: (load (s8) from %ir.i28, addrspace 1)
   ; GFX90A-NEXT:   renamable $vgpr44 = V_ADD_CO_U32_e32 2048, $vgpr40, implicit-def $vcc, implicit $exec
   ; GFX90A-NEXT:   renamable $sgpr40_sgpr41 = S_MOV_B64 -1
-  ; GFX90A-NEXT:   renamable $sgpr60_sgpr61 = COPY renamable $sgpr36_sgpr37
+  ; GFX90A-NEXT:   renamable $sgpr56_sgpr57 = COPY renamable $sgpr36_sgpr37
+  ; GFX90A-NEXT:   renamable $sgpr58_sgpr59 = S_MOV_B64 0
+  ; GFX90A-NEXT:   renamable $sgpr60_sgpr61 = S_MOV_B64 0
+  ; GFX90A-NEXT:   renamable $sgpr62_sgpr63 = S_MOV_B64 0
   ; GFX90A-NEXT:   renamable $vgpr45, dead renamable $vcc = V_ADDC_U32_e64 0, $vgpr41, killed $vcc, 0, implicit $exec
   ; GFX90A-NEXT:   renamable $vcc = V_CMP_EQ_U16_e64 0, killed $vgpr0, implicit $exec
   ; GFX90A-NEXT:   renamable $vgpr8_vgpr9 = IMPLICIT_DEF
@@ -437,33 +440,34 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT: bb.38.Flow22:
   ; GFX90A-NEXT:   successors: %bb.36(0x80000000)
-  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $vgpr15, $vgpr17, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr58_sgpr59, $sgpr60_sgpr61, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
+  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $vgpr15, $vgpr17, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr46_sgpr47, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr56_sgpr57, $sgpr58_sgpr59, $sgpr60_sgpr61, $sgpr62_sgpr63, $sgpr64_sgpr65, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT:   $exec = S_OR_B64 $exec, killed renamable $sgpr38_sgpr39, implicit-def $scc
   ; GFX90A-NEXT:   renamable $sgpr38_sgpr39 = S_XOR_B64 $exec, -1, implicit-def dead $scc
-  ; GFX90A-NEXT:   renamable $sgpr40_sgpr41 = S_AND_B64 killed renamable $sgpr40_sgpr41, $exec, implicit-def dead $scc
-  ; GFX90A-NEXT:   renamable $sgpr42_sgpr43 = S_AND_B64 killed renamable $sgpr42_sgpr43, $exec, implicit-def dead $scc
-  ; GFX90A-NEXT:   renamable $sgpr44_sgpr45 = S_AND_B64 killed renamable $sgpr44_sgpr45, $exec, implicit-def dead $scc
-  ; GFX90A-NEXT:   renamable $sgpr46_sgpr47 = S_AND_B64 killed renamable $sgpr46_sgpr47, $exec, implicit-def dead $scc
-  ; GFX90A-NEXT:   renamable $sgpr48_sgpr49 = S_AND_B64 killed renamable $sgpr48_sgpr49, $exec, implicit-def dead $scc
-  ; GFX90A-NEXT:   renamable $sgpr50_sgpr51 = S_AND_B64 killed renamable $sgpr50_sgpr51, $exec, implicit-def dead $scc
+  ; GFX90A-NEXT:   renamable $sgpr48_sgpr49 = S_AND_B64 killed renamable $sgpr40_sgpr41, $exec, implicit-def dead $scc
+  ; GFX90A-NEXT:   renamable $sgpr50_sgpr51 = S_AND_B64 killed renamable $sgpr64_sgpr65, $exec, implicit-def dead $scc
   ; GFX90A-NEXT:   renamable $sgpr52_sgpr53 = S_AND_B64 killed renamable $sgpr52_sgpr53, $exec, implicit-def dead $scc
   ; GFX90A-NEXT:   renamable $sgpr54_sgpr55 = S_AND_B64 killed renamable $sgpr54_sgpr55, $exec, implicit-def dead $scc
-  ; GFX90A-NEXT:   renamable $sgpr58_sgpr59 = S_AND_B64 killed renamable $sgpr58_sgpr59, $exec, implicit-def dead $scc
+  ; GFX90A-NEXT:   renamable $sgpr64_sgpr65 = S_AND_B64 killed renamable $sgpr62_sgpr63, $exec, implicit-def dead $scc
+  ; GFX90A-NEXT:   renamable $sgpr66_sgpr67 = S_AND_B64 killed renamable $sgpr60_sgpr61, $exec, implicit-def dead $scc
+  ; GFX90A-NEXT:   renamable $sgpr40_sgpr41 = S_AND_B64 killed renamable $sgpr58_sgpr59, $exec, implicit-def dead $scc
+  ; GFX90A-NEXT:   renamable $sgpr42_sgpr43 = S_AND_B64 killed renamable $sgpr42_sgpr43, $exec, implicit-def dead $scc
+  ; GFX90A-NEXT:   renamable $sgpr46_sgpr47 = S_AND_B64 killed renamable $sgpr46_sgpr47, $exec, implicit-def dead $scc
   ; GFX90A-NEXT:   renamable $sgpr18_sgpr19 = S_AND_B64 killed renamable $sgpr18_sgpr19, $exec, implicit-def dead $scc
   ; GFX90A-NEXT:   renamable $sgpr36_sgpr37 = S_ANDN2_B64 killed renamable $sgpr36_sgpr37, $exec, implicit-def dead $scc
-  ; GFX90A-NEXT:   renamable $sgpr56_sgpr57 = S_AND_B64 killed renamable $sgpr60_sgpr61, $exec, implicit-def dead $scc
-  ; GFX90A-NEXT:   renamable $sgpr36_sgpr37 = S_OR_B64 killed renamable $sgpr36_sgpr37, killed renamable $sgpr56_sgpr57, implicit-def dead $scc
+  ; GFX90A-NEXT:   renamable $sgpr44_sgpr45 = S_AND_B64 killed renamable $sgpr56_sgpr57, $exec, implicit-def dead $scc
+  ; GFX90A-NEXT:   renamable $sgpr36_sgpr37 = S_OR_B64 killed renamable $sgpr36_sgpr37, killed renamable $sgpr44_sgpr45, implicit-def dead $scc
   ; GFX90A-NEXT:   S_BRANCH %bb.36
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT: bb.39.bb34:
   ; GFX90A-NEXT:   successors: %bb.41(0x40000000), %bb.40(0x40000000)
-  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr33, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr56_sgpr57:0x000000000000000F, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr18_sgpr19, $sgpr58_sgpr59, $sgpr54_sgpr55, $sgpr52_sgpr53, $sgpr50_sgpr51, $sgpr48_sgpr49, $sgpr46_sgpr47, $sgpr44_sgpr45
+  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr33, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr44_sgpr45:0x000000000000000F, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr18_sgpr19, $sgpr46_sgpr47, $sgpr50_sgpr51, $sgpr58_sgpr59, $sgpr60_sgpr61, $sgpr54_sgpr55, $sgpr62_sgpr63, $sgpr52_sgpr53, $sgpr64_sgpr65, $sgpr66_sgpr67
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT:   renamable $vgpr0 = GLOBAL_LOAD_UBYTE renamable $vgpr40_vgpr41, 3072, 0, implicit $exec :: (load (s8) from %ir.i35, addrspace 1)
   ; GFX90A-NEXT:   renamable $vgpr56 = V_ADD_CO_U32_e32 3072, $vgpr40, implicit-def $vcc, implicit $exec
   ; GFX90A-NEXT:   renamable $sgpr42_sgpr43 = S_MOV_B64 -1
-  ; GFX90A-NEXT:   renamable $sgpr60_sgpr61 = COPY renamable $sgpr36_sgpr37
+  ; GFX90A-NEXT:   renamable $sgpr48_sgpr49 = COPY renamable $sgpr36_sgpr37
+  ; GFX90A-NEXT:   renamable $sgpr56_sgpr57 = S_MOV_B64 0
   ; GFX90A-NEXT:   renamable $vgpr57, dead renamable $vcc = V_ADDC_U32_e64 0, $vgpr41, killed $vcc, 0, implicit $exec
   ; GFX90A-NEXT:   renamable $vcc = V_CMP_EQ_U16_e64 0, killed $vgpr0, implicit $exec
   ; GFX90A-NEXT:   renamable $vgpr8_vgpr9 = IMPLICIT_DEF
@@ -487,38 +491,37 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT: bb.40.Flow23:
   ; GFX90A-NEXT:   successors: %bb.38(0x80000000)
-  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $vgpr15, $vgpr17, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr58_sgpr59, $sgpr60_sgpr61, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
+  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $vgpr15, $vgpr17, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr54_sgpr55, $sgpr56_sgpr57, $sgpr58_sgpr59, $sgpr60_sgpr61, $sgpr62_sgpr63, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT:   $exec = S_OR_B64 $exec, killed renamable $sgpr40_sgpr41, implicit-def $scc
   ; GFX90A-NEXT:   renamable $sgpr40_sgpr41 = S_XOR_B64 $exec, -1, implicit-def dead $scc
-  ; GFX90A-NEXT:   renamable $sgpr42_sgpr43 = S_AND_B64 killed renamable $sgpr42_sgpr43, $exec, implicit-def dead $scc
-  ; GFX90A-NEXT:   renamable $sgpr44_sgpr45 = S_AND_B64 killed renamable $sgpr44_sgpr45, $exec, implicit-def dead $scc
-  ; GFX90A-NEXT:   renamable $sgpr46_sgpr47 = S_AND_B64 killed renamable $sgpr46_sgpr47, $exec, implicit-def dead $scc
-  ; GFX90A-NEXT:   renamable $sgpr48_sgpr49 = S_AND_B64 killed renamable $sgpr48_sgpr49, $exec, implicit-def dead $scc
-  ; GFX90A-NEXT:   renamable $sgpr50_sgpr51 = S_AND_B64 killed renamable $sgpr50_sgpr51, $exec, implicit-def dead $scc
-  ; GFX90A-NEXT:   renamable $sgpr52_sgpr53 = S_AND_B64 killed renamable $sgpr52_sgpr53, $exec, implicit-def dead $scc
+  ; GFX90A-NEXT:   renamable $sgpr64_sgpr65 = S_AND_B64 killed renamable $sgpr42_sgpr43, $exec, implicit-def dead $scc
+  ; GFX90A-NEXT:   renamable $sgpr52_sgpr53 = S_AND_B64 killed renamable $sgpr62_sgpr63, $exec, implicit-def dead $scc
   ; GFX90A-NEXT:   renamable $sgpr54_sgpr55 = S_AND_B64 killed renamable $sgpr54_sgpr55, $exec, implicit-def dead $scc
-  ; GFX90A-NEXT:   renamable $sgpr58_sgpr59 = S_AND_B64 killed renamable $sgpr58_sgpr59, $exec, implicit-def dead $scc
+  ; GFX90A-NEXT:   renamable $sgpr62_sgpr63 = S_AND_B64 killed renamable $sgpr60_sgpr61, $exec, implicit-def dead $scc
+  ; GFX90A-NEXT:   renamable $sgpr60_sgpr61 = S_AND_B64 killed renamable $sgpr58_sgpr59, $exec, implicit-def dead $scc
+  ; GFX90A-NEXT:   renamable $sgpr58_sgpr59 = S_AND_B64 killed renamable $sgpr56_sgpr57, $exec, implicit-def dead $scc
+  ; GFX90A-NEXT:   renamable $sgpr42_sgpr43 = S_AND_B64 killed renamable $sgpr50_sgpr51, $exec, implicit-def dead $scc
+  ; GFX90A-NEXT:   renamable $sgpr46_sgpr47 = S_AND_B64 killed renamable $sgpr46_sgpr47, $exec, implicit-def dead $scc
   ; GFX90A-NEXT:   renamable $sgpr18_sgpr19 = S_AND_B64 killed renamable $sgpr18_sgpr19, $exec, implicit-def dead $scc
-  ; GFX90A-NEXT:   renamable $sgpr56_sgpr57 = S_ANDN2_B64 renamable $sgpr36_sgpr37, $exec, implicit-def dead $scc
-  ; GFX90A-NEXT:   renamable $sgpr60_sgpr61 = S_AND_B64 killed renamable $sgpr60_sgpr61, $exec, implicit-def dead $scc
-  ; GFX90A-NEXT:   renamable $sgpr60_sgpr61 = S_OR_B64 killed renamable $sgpr56_sgpr57, killed renamable $sgpr60_sgpr61, implicit-def dead $scc
+  ; GFX90A-NEXT:   renamable $sgpr44_sgpr45 = S_ANDN2_B64 renamable $sgpr36_sgpr37, $exec, implicit-def dead $scc
+  ; GFX90A-NEXT:   renamable $sgpr48_sgpr49 = S_AND_B64 killed renamable $sgpr48_sgpr49, $exec, implicit-def dead $scc
+  ; GFX90A-NEXT:   renamable $sgpr56_sgpr57 = S_OR_B64 killed renamable $sgpr44_sgpr45, killed renamable $sgpr48_sgpr49, implicit-def dead $scc
   ; GFX90A-NEXT:   S_BRANCH %bb.38
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT: bb.41.bb41:
   ; GFX90A-NEXT:   successors: %bb.46(0x40000000), %bb.42(0x40000000)
-  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr33, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr56_sgpr57:0x000000000000000F, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr58_sgpr59, $sgpr54_sgpr55, $sgpr52_sgpr53, $sgpr50_sgpr51, $sgpr48_sgpr49, $sgpr46_sgpr47
+  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr33, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr44_sgpr45:0x000000000000000F, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr52_sgpr53, $sgpr50_sgpr51, $sgpr56_sgpr57, $sgpr58_sgpr59, $sgpr60_sgpr61, $sgpr54_sgpr55, $sgpr64_sgpr65, $sgpr66_sgpr67
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT:   renamable $vgpr58 = V_ADD_CO_U32_e32 4096, $vgpr40, implicit-def $vcc, implicit $exec
   ; GFX90A-NEXT:   renamable $sgpr18_sgpr19 = COPY $vcc
   ; GFX90A-NEXT:   renamable $vgpr59, dead renamable $sgpr18_sgpr19 = V_ADDC_U32_e64 0, $vgpr41, killed $sgpr18_sgpr19, 0, implicit $exec
   ; GFX90A-NEXT:   renamable $vgpr0 = GLOBAL_LOAD_UBYTE renamable $vgpr58_vgpr59, 0, 0, implicit $exec :: (load (s8) from %ir.i42, addrspace 1)
   ; GFX90A-NEXT:   renamable $sgpr18_sgpr19 = S_MOV_B64 0
-  ; GFX90A-NEXT:   renamable $sgpr44_sgpr45 = S_MOV_B64 -1
-  ; GFX90A-NEXT:   renamable $sgpr60_sgpr61 = COPY renamable $sgpr36_sgpr37
+  ; GFX90A-NEXT:   renamable $sgpr46_sgpr47 = S_MOV_B64 -1
+  ; GFX90A-NEXT:   renamable $sgpr48_sgpr49 = COPY renamable $sgpr36_sgpr37
   ; GFX90A-NEXT:   renamable $vgpr18, dead renamable $vcc = V_ADDC_U32_e64 0, $vgpr41, killed $vcc, 0, implicit $exec
   ; GFX90A-NEXT:   renamable $vcc = V_CMP_EQ_U16_e64 0, killed $vgpr0, implicit $exec
-  ; GFX90A-NEXT:   renamable $sgpr62_sgpr63 = S_MOV_B64 0
   ; GFX90A-NEXT:   renamable $vgpr8_vgpr9 = IMPLICIT_DEF
   ; GFX90A-NEXT:   renamable $vgpr6_vgpr7 = IMPLICIT_DEF
   ; GFX90A-NEXT:   renamable $vgpr4_vgpr5 = IMPLICIT_DEF
@@ -539,41 +542,41 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT: bb.42.Flow24:
   ; GFX90A-NEXT:   successors: %bb.40(0x80000000)
-  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $vgpr15, $vgpr17, $vgpr18, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr58_sgpr59, $sgpr60_sgpr61, $sgpr62_sgpr63, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
+  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $vgpr15, $vgpr17, $vgpr18, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr56_sgpr57, $sgpr58_sgpr59, $sgpr60_sgpr61, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT:   $exec = S_OR_B64 $exec, killed renamable $sgpr42_sgpr43, implicit-def $scc
   ; GFX90A-NEXT:   renamable $vgpr59 = COPY killed renamable $vgpr18, implicit $exec
   ; GFX90A-NEXT:   renamable $sgpr42_sgpr43 = S_XOR_B64 $exec, -1, implicit-def dead $scc
-  ; GFX90A-NEXT:   renamable $sgpr44_sgpr45 = S_AND_B64 killed renamable $sgpr44_sgpr45, $exec, implicit-def dead $scc
-  ; GFX90A-NEXT:   renamable $sgpr46_sgpr47 = S_AND_B64 killed renamable $sgpr62_sgpr63, $exec, implicit-def dead $scc
-  ; GFX90A-NEXT:   renamable $sgpr48_sgpr49 = S_AND_B64 killed renamable $sgpr48_sgpr49, $exec, implicit-def dead $scc
-  ; GFX90A-NEXT:   renamable $sgpr50_sgpr51 = S_AND_B64 killed renamable $sgpr50_sgpr51, $exec, implicit-def dead $scc
-  ; GFX90A-NEXT:   renamable $sgpr52_sgpr53 = S_AND_B64 killed renamable $sgpr52_sgpr53, $exec, implicit-def dead $scc
+  ; GFX90A-NEXT:   renamable $sgpr62_sgpr63 = S_AND_B64 killed renamable $sgpr46_sgpr47, $exec, implicit-def dead $scc
   ; GFX90A-NEXT:   renamable $sgpr54_sgpr55 = S_AND_B64 killed renamable $sgpr54_sgpr55, $exec, implicit-def dead $scc
+  ; GFX90A-NEXT:   renamable $sgpr60_sgpr61 = S_AND_B64 killed renamable $sgpr60_sgpr61, $exec, implicit-def dead $scc
   ; GFX90A-NEXT:   renamable $sgpr58_sgpr59 = S_AND_B64 killed renamable $sgpr58_sgpr59, $exec, implicit-def dead $scc
+  ; GFX90A-NEXT:   renamable $sgpr56_sgpr57 = S_AND_B64 killed renamable $sgpr56_sgpr57, $exec, implicit-def dead $scc
+  ; GFX90A-NEXT:   renamable $sgpr50_sgpr51 = S_AND_B64 killed renamable $sgpr50_sgpr51, $exec, implicit-def dead $scc
+  ; GFX90A-NEXT:   renamable $sgpr46_sgpr47 = S_AND_B64 killed renamable $sgpr52_sgpr53, $exec, implicit-def dead $scc
   ; GFX90A-NEXT:   renamable $sgpr18_sgpr19 = S_AND_B64 killed renamable $sgpr18_sgpr19, $exec, implicit-def dead $scc
-  ; GFX90A-NEXT:   renamable $sgpr56_sgpr57 = S_ANDN2_B64 renamable $sgpr36_sgpr37, $exec, implicit-def dead $scc
-  ; GFX90A-NEXT:   renamable $sgpr60_sgpr61 = S_AND_B64 killed renamable $sgpr60_sgpr61, $exec, implicit-def dead $scc
-  ; GFX90A-NEXT:   renamable $sgpr60_sgpr61 = S_OR_B64 killed renamable $sgpr56_sgpr57, killed renamable $sgpr60_sgpr61, implicit-def dead $scc
+  ; GFX90A-NEXT:   renamable $sgpr44_sgpr45 = S_ANDN2_B64 renamable $sgpr36_sgpr37, $exec, implicit-def dead $scc
+  ; GFX90A-NEXT:   renamable $sgpr48_sgpr49 = S_AND_B64 killed renamable $sgpr48_sgpr49, $exec, implicit-def dead $scc
+  ; GFX90A-NEXT:   renamable $sgpr48_sgpr49 = S_OR_B64 killed renamable $sgpr44_sgpr45, killed renamable $sgpr48_sgpr49, implicit-def dead $scc
   ; GFX90A-NEXT:   S_BRANCH %bb.40
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT: bb.43.bb55:
   ; GFX90A-NEXT:   successors: %bb.48(0x40000000), %bb.44(0x40000000)
-  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr33, $vgpr18, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr56_sgpr57:0x000000000000000F, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr44_sgpr45, $sgpr52_sgpr53, $sgpr58_sgpr59, $sgpr54_sgpr55, $sgpr46_sgpr47
+  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr33, $vgpr18, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45:0x000000000000000F, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr46_sgpr47, $sgpr54_sgpr55, $sgpr58_sgpr59, $sgpr56_sgpr57, $sgpr48_sgpr49
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT:   S_BITCMP1_B32 killed renamable $sgpr33, 16, implicit-def $scc
   ; GFX90A-NEXT:   renamable $sgpr64_sgpr65 = S_CSELECT_B64 -1, 0, implicit killed $scc
-  ; GFX90A-NEXT:   renamable $sgpr48_sgpr49 = S_XOR_B64 renamable $sgpr64_sgpr65, -1, implicit-def dead $scc
+  ; GFX90A-NEXT:   renamable $sgpr50_sgpr51 = S_XOR_B64 renamable $sgpr64_sgpr65, -1, implicit-def dead $scc
   ; GFX90A-NEXT:   renamable $vgpr62 = V_ADD_CO_U32_e32 6144, $vgpr40, implicit-def $vcc, implicit $exec
   ; GFX90A-NEXT:   renamable $vgpr63, dead renamable $vcc = V_ADDC_U32_e64 0, $vgpr41, killed $vcc, 0, implicit $exec
-  ; GFX90A-NEXT:   renamable $vcc = S_AND_B64 $exec, renamable $sgpr48_sgpr49, implicit-def dead $scc
+  ; GFX90A-NEXT:   renamable $vcc = S_AND_B64 $exec, renamable $sgpr50_sgpr51, implicit-def dead $scc
   ; GFX90A-NEXT:   S_CBRANCH_VCCNZ %bb.48, implicit $vcc
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT: bb.44:
   ; GFX90A-NEXT:   successors: %bb.45(0x80000000)
-  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr56, $vgpr47, $vgpr18, $vgpr30, $vgpr31, $vgpr58, $vgpr61, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8, $sgpr9, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $vgpr57, $vgpr63, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr58_sgpr59, $sgpr20_sgpr21_sgpr22, $sgpr22_sgpr23, $sgpr24_sgpr25_sgpr26, $sgpr26_sgpr27, $vgpr46, $vgpr45, $vgpr2, $vgpr3, $vgpr44, $vgpr43, $vgpr42, $vgpr41, $vgpr40, $vgpr60, $vgpr62
+  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr56, $vgpr47, $vgpr18, $vgpr30, $vgpr31, $vgpr58, $vgpr61, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8, $sgpr9, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr46_sgpr47, $sgpr48_sgpr49, $vgpr57, $vgpr63, $sgpr54_sgpr55, $sgpr56_sgpr57, $sgpr58_sgpr59, $sgpr20_sgpr21_sgpr22, $sgpr22_sgpr23, $sgpr24_sgpr25_sgpr26, $sgpr26_sgpr27, $vgpr46, $vgpr45, $vgpr2, $vgpr3, $vgpr44, $vgpr43, $vgpr42, $vgpr41, $vgpr40, $vgpr60, $vgpr62
   ; GFX90A-NEXT: {{  $}}
-  ; GFX90A-NEXT:   renamable $sgpr50_sgpr51 = COPY renamable $sgpr36_sgpr37
+  ; GFX90A-NEXT:   renamable $sgpr52_sgpr53 = COPY renamable $sgpr36_sgpr37
   ; GFX90A-NEXT:   renamable $vgpr8_vgpr9 = IMPLICIT_DEF
   ; GFX90A-NEXT:   renamable $vgpr6_vgpr7 = IMPLICIT_DEF
   ; GFX90A-NEXT:   renamable $vgpr4_vgpr5 = IMPLICIT_DEF
@@ -587,36 +590,36 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64
   ; GFX90A-NEXT:   renamable $vgpr13 = IMPLICIT_DEF implicit-def $vgpr12
   ; GFX90A-NEXT:   renamable $vgpr11 = IMPLICIT_DEF implicit-def $vgpr10
   ; GFX90A-NEXT:   renamable $sgpr17 = IMPLICIT_DEF
-  ; GFX90A-NEXT:   renamable $sgpr48_sgpr49 = S_MOV_B64 0
+  ; GFX90A-NEXT:   renamable $sgpr50_sgpr51 = S_MOV_B64 0
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT: bb.45.Flow26:
   ; GFX90A-NEXT:   successors: %bb.47(0x80000000)
-  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $vgpr15, $vgpr17, $vgpr18, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr58_sgpr59, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
+  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $vgpr15, $vgpr17, $vgpr18, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr56_sgpr57, $sgpr58_sgpr59, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT:   renamable $sgpr60_sgpr61 = S_XOR_B64 $exec, -1, implicit-def dead $scc
-  ; GFX90A-NEXT:   renamable $sgpr70_sgpr71 = S_AND_B64 killed renamable $sgpr44_sgpr45, $exec, implicit-def dead $scc
-  ; GFX90A-NEXT:   renamable $sgpr68_sgpr69 = S_AND_B64 killed renamable $sgpr46_sgpr47, $exec, implicit-def dead $scc
-  ; GFX90A-NEXT:   renamable $sgpr66_sgpr67 = S_AND_B64 killed renamable $sgpr48_sgpr49, $exec, implicit-def dead $scc
-  ; GFX90A-NEXT:   renamable $sgpr54_sgpr55 = S_AND_B64 killed renamable $sgpr54_sgpr55, $exec, implicit-def dead $scc
-  ; GFX90A-NEXT:   renamable $sgpr58_sgpr59 = S_AND_B64 killed renamable $sgpr58_sgpr59, $exec, implicit-def dead $scc
-  ; GFX90A-NEXT:   renamable $sgpr46_sgpr47 = S_AND_B64 killed renamable $sgpr52_sgpr53, $exec, implicit-def dead $scc
+  ; GFX90A-NEXT:   renamable $sgpr70_sgpr71 = S_AND_B64 killed renamable $sgpr46_sgpr47, $exec, implicit-def dead $scc
+  ; GFX90A-NEXT:   renamable $sgpr68_sgpr69 = S_AND_B64 killed renamable $sgpr48_sgpr49, $exec, implicit-def dead $scc
+  ; GFX90A-NEXT:   renamable $sgpr66_sgpr67 = S_AND_B64 killed renamable $sgpr50_sgpr51, $exec, implicit-def dead $scc
+  ; GFX90A-NEXT:   renamable $sgpr50_sgpr51 = S_AND_B64 killed renamable $sgpr56_sgpr57, $exec, implicit-def dead $scc
+  ; GFX90A-NEXT:   renamable $sgpr64_sgpr65 = S_AND_B64 killed renamable $sgpr58_sgpr59, $exec, implicit-def dead $scc
+  ; GFX90A-NEXT:   renamable $sgpr48_sgpr49 = S_AND_B64 killed renamable $sgpr54_sgpr55, $exec, implicit-def dead $scc
   ; GFX90A-NEXT:   renamable $sgpr44_sgpr45 = S_ANDN2_B64 renamable $sgpr36_sgpr37, $exec, implicit-def dead $scc
-  ; GFX90A-NEXT:   renamable $sgpr48_sgpr49 = S_AND_B64 killed renamable $sgpr50_sgpr51, $exec, implicit-def dead $scc
-  ; GFX90A-NEXT:   renamable $sgpr64_sgpr65 = S_OR_B64 killed renamable $sgpr44_sgpr45, killed renamable $sgpr48_sgpr49, implicit-def dead $scc
+  ; GFX90A-NEXT:   renamable $sgpr46_sgpr47 = S_AND_B64 killed renamable $sgpr52_sgpr53, $exec, implicit-def dead $scc
+  ; GFX90A-NEXT:   renamable $sgpr62_sgpr63 = S_OR_B64 killed renamable $sgpr44_sgpr45, killed renamable $sgpr46_sgpr47, implicit-def dead $scc
   ; GFX90A-NEXT:   S_BRANCH %bb.47
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT: bb.46.bb48:
   ; GFX90A-NEXT:   successors: %bb.43(0x40000000), %bb.47(0x40000000)
-  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr33, $vgpr18, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr56_sgpr57:0x000000000000000F, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr46_sgpr47, $sgpr58_sgpr59, $sgpr54_sgpr55, $sgpr44_sgpr45, $sgpr52_sgpr53
+  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr33, $vgpr18, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45:0x000000000000000F, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr64_sgpr65, $sgpr50_sgpr51, $sgpr66_sgpr67, $sgpr46_sgpr47, $sgpr54_sgpr55, $sgpr58_sgpr59, $sgpr56_sgpr57
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT:   renamable $vgpr60 = V_ADD_CO_U32_e32 5120, $vgpr40, implicit-def $vcc, implicit $exec
   ; GFX90A-NEXT:   renamable $sgpr18_sgpr19 = COPY $vcc
   ; GFX90A-NEXT:   renamable $vgpr0 = V_ADD_CO_U32_e32 4096, $vgpr40, implicit-def $vcc, implicit $exec
   ; GFX90A-NEXT:   renamable $vgpr1, dead renamable $vcc = V_ADDC_U32_e64 0, $vgpr41, killed $vcc, 0, implicit $exec
   ; GFX90A-NEXT:   renamable $vgpr0 = GLOBAL_LOAD_UBYTE killed renamable $vgpr0_vgpr1, 1024, 0, implicit $exec :: (load (s8) from %ir.i49, addrspace 1)
+  ; GFX90A-NEXT:   renamable $sgpr48_sgpr49 = S_MOV_B64 0
   ; GFX90A-NEXT:   renamable $sgpr60_sgpr61 = S_MOV_B64 -1
-  ; GFX90A-NEXT:   renamable $sgpr64_sgpr65 = COPY renamable $sgpr36_sgpr37
-  ; GFX90A-NEXT:   renamable $sgpr66_sgpr67 = S_MOV_B64 0
+  ; GFX90A-NEXT:   renamable $sgpr62_sgpr63 = COPY renamable $sgpr36_sgpr37
   ; GFX90A-NEXT:   renamable $sgpr68_sgpr69 = S_MOV_B64 0
   ; GFX90A-NEXT:   renamable $vgpr61, dead renamable $vcc = V_ADDC_U32_e64 0, $vgpr41, killed $sgpr18_sgpr19, 0, implicit $exec
   ; GFX90A-NEXT:   renamable $vcc = V_CMP_EQ_U16_e64 0, killed $vgpr0, implicit $exec
@@ -640,51 +643,51 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT: bb.47.Flow25:
   ; GFX90A-NEXT:   successors: %bb.42(0x80000000)
-  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $vgpr15, $vgpr17, $vgpr18, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr46_sgpr47, $sgpr54_sgpr55, $sgpr58_sgpr59, $sgpr60_sgpr61, $sgpr64_sgpr65, $sgpr66_sgpr67, $sgpr68_sgpr69, $sgpr70_sgpr71, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
+  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $vgpr15, $vgpr17, $vgpr18, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr60_sgpr61, $sgpr62_sgpr63, $sgpr64_sgpr65, $sgpr66_sgpr67, $sgpr68_sgpr69, $sgpr70_sgpr71, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT:   $exec = S_OR_B64 $exec, killed renamable $sgpr18_sgpr19, implicit-def $scc
-  ; GFX90A-NEXT:   renamable $sgpr44_sgpr45 = S_XOR_B64 $exec, -1, implicit-def dead $scc
-  ; GFX90A-NEXT:   renamable $sgpr62_sgpr63 = S_AND_B64 killed renamable $sgpr60_sgpr61, $exec, implicit-def dead $scc
-  ; GFX90A-NEXT:   renamable $sgpr48_sgpr49 = S_AND_B64 killed renamable $sgpr70_sgpr71, $exec, implicit-def dead $scc
-  ; GFX90A-NEXT:   renamable $sgpr50_sgpr51 = S_AND_B64 killed renamable $sgpr68_sgpr69, $exec, implicit-def dead $scc
-  ; GFX90A-NEXT:   renamable $sgpr52_sgpr53 = S_AND_B64 killed renamable $sgpr66_sgpr67, $exec, implicit-def dead $scc
-  ; GFX90A-NEXT:   renamable $sgpr54_sgpr55 = S_AND_B64 killed renamable $sgpr54_sgpr55, $exec, implicit-def dead $scc
-  ; GFX90A-NEXT:   renamable $sgpr58_sgpr59 = S_AND_B64 killed renamable $sgpr58_sgpr59, $exec, implicit-def dead $scc
-  ; GFX90A-NEXT:   renamable $sgpr18_sgpr19 = S_AND_B64 killed renamable $sgpr46_sgpr47, $exec, implicit-def dead $scc
-  ; GFX90A-NEXT:   renamable $sgpr46_sgpr47 = S_ANDN2_B64 renamable $sgpr36_sgpr37, $exec, implicit-def dead $scc
-  ; GFX90A-NEXT:   renamable $sgpr56_sgpr57 = S_AND_B64 killed renamable $sgpr64_sgpr65, $exec, implicit-def dead $scc
-  ; GFX90A-NEXT:   renamable $sgpr60_sgpr61 = S_OR_B64 killed renamable $sgpr46_sgpr47, killed renamable $sgpr56_sgpr57, implicit-def dead $scc
+  ; GFX90A-NEXT:   renamable $sgpr46_sgpr47 = S_XOR_B64 $exec, -1, implicit-def dead $scc
+  ; GFX90A-NEXT:   renamable $sgpr54_sgpr55 = S_AND_B64 killed renamable $sgpr60_sgpr61, $exec, implicit-def dead $scc
+  ; GFX90A-NEXT:   renamable $sgpr60_sgpr61 = S_AND_B64 killed renamable $sgpr70_sgpr71, $exec, implicit-def dead $scc
+  ; GFX90A-NEXT:   renamable $sgpr58_sgpr59 = S_AND_B64 killed renamable $sgpr68_sgpr69, $exec, implicit-def dead $scc
+  ; GFX90A-NEXT:   renamable $sgpr56_sgpr57 = S_AND_B64 killed renamable $sgpr66_sgpr67, $exec, implicit-def dead $scc
+  ; GFX90A-NEXT:   renamable $sgpr50_sgpr51 = S_AND_B64 killed renamable $sgpr50_sgpr51, $exec, implicit-def dead $scc
+  ; GFX90A-NEXT:   renamable $sgpr52_sgpr53 = S_AND_B64 killed renamable $sgpr64_sgpr65, $exec, implicit-def dead $scc
+  ; GFX90A-NEXT:   renamable $sgpr18_sgpr19 = S_AND_B64 killed renamable $sgpr48_sgpr49, $exec, implicit-def dead $scc
+  ; GFX90A-NEXT:   renamable $sgpr44_sgpr45 = S_ANDN2_B64 renamable $sgpr36_sgpr37, $exec, implicit-def dead $scc
+  ; GFX90A-NEXT:   renamable $sgpr48_sgpr49 = S_AND_B64 killed renamable $sgpr62_sgpr63, $exec, implicit-def dead $scc
+  ; GFX90A-NEXT:   renamable $sgpr48_sgpr49 = S_OR_B64 killed renamable $sgpr44_sgpr45, killed renamable $sgpr48_sgpr49, implicit-def dead $scc
   ; GFX90A-NEXT:   S_BRANCH %bb.42
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT: bb.48.bb63:
   ; GFX90A-NEXT:   successors: %bb.50(0x40000000), %bb.49(0x40000000)
-  ; GFX90A-NEXT:   liveins: $vcc, $sgpr14, $sgpr15, $sgpr16, $vgpr18, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr48_sgpr49, $sgpr56_sgpr57:0x000000000000000F, $sgpr64_sgpr65, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr52_sgpr53, $sgpr58_sgpr59, $sgpr54_sgpr55, $sgpr46_sgpr47
+  ; GFX90A-NEXT:   liveins: $vcc, $sgpr14, $sgpr15, $sgpr16, $vgpr18, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45:0x000000000000000F, $sgpr50_sgpr51, $sgpr64_sgpr65, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr54_sgpr55, $sgpr58_sgpr59, $sgpr56_sgpr57, $sgpr48_sgpr49
   ; GFX90A-NEXT: {{  $}}
-  ; GFX90A-NEXT:   renamable $sgpr44_sgpr45 = S_MOV_B64 0
+  ; GFX90A-NEXT:   renamable $sgpr46_sgpr47 = S_MOV_B64 0
   ; GFX90A-NEXT:   S_CBRANCH_VCCNZ %bb.50, implicit $vcc
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT: bb.49:
   ; GFX90A-NEXT:   successors: %bb.44(0x80000000)
-  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr18, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr52_sgpr53, $sgpr58_sgpr59, $sgpr54_sgpr55
+  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr18, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr46_sgpr47, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr54_sgpr55, $sgpr58_sgpr59, $sgpr56_sgpr57
   ; GFX90A-NEXT: {{  $}}
-  ; GFX90A-NEXT:   renamable $sgpr46_sgpr47 = S_MOV_B64 -1
+  ; GFX90A-NEXT:   renamable $sgpr48_sgpr49 = S_MOV_B64 -1
   ; GFX90A-NEXT:   S_BRANCH %bb.44
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT: bb.50.bb68:
   ; GFX90A-NEXT:   successors: %bb.54(0x40000000), %bb.51(0x40000000)
-  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr18, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr48_sgpr49, $sgpr56_sgpr57:0x000000000000000F, $sgpr64_sgpr65, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr46_sgpr47, $sgpr52_sgpr53, $sgpr58_sgpr59, $sgpr54_sgpr55
+  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr18, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45:0x000000000000000F, $sgpr46_sgpr47, $sgpr50_sgpr51, $sgpr64_sgpr65, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr48_sgpr49, $sgpr54_sgpr55, $sgpr58_sgpr59, $sgpr56_sgpr57
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT:   renamable $vgpr0 = nuw nsw V_LSHLREV_B32_e32 3, $vgpr30, implicit $exec
   ; GFX90A-NEXT:   renamable $vgpr1 = V_MOV_B32_e32 0, implicit $exec
-  ; GFX90A-NEXT:   renamable $vcc = S_AND_B64 $exec, killed renamable $sgpr48_sgpr49, implicit-def dead $scc
+  ; GFX90A-NEXT:   renamable $vcc = S_AND_B64 $exec, killed renamable $sgpr50_sgpr51, implicit-def dead $scc
   ; GFX90A-NEXT:   S_CBRANCH_VCCNZ %bb.54, implicit $vcc
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT: bb.51:
   ; GFX90A-NEXT:   successors: %bb.45(0x80000000)
-  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr18, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr52_sgpr53, $sgpr58_sgpr59, $sgpr54_sgpr55
+  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr18, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr54_sgpr55, $sgpr58_sgpr59, $sgpr56_sgpr57
   ; GFX90A-NEXT: {{  $}}
-  ; GFX90A-NEXT:   renamable $sgpr48_sgpr49 = S_MOV_B64 -1
-  ; GFX90A-NEXT:   renamable $sgpr50_sgpr51 = COPY renamable $sgpr36_sgpr37
+  ; GFX90A-NEXT:   renamable $sgpr50_sgpr51 = S_MOV_B64 -1
+  ; GFX90A-NEXT:   renamable $sgpr52_sgpr53 = COPY renamable $sgpr36_sgpr37
   ; GFX90A-NEXT:   renamable $vgpr8_vgpr9 = IMPLICIT_DEF
   ; GFX90A-NEXT:   renamable $vgpr6_vgpr7 = IMPLICIT_DEF
   ; GFX90A-NEXT:   renamable $vgpr4_vgpr5 = IMPLICIT_DEF
@@ -701,20 +704,20 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT: bb.52.bb80:
   ; GFX90A-NEXT:   successors: %bb.59(0x40000000), %bb.53(0x40000000)
-  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr18, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr56_sgpr57:0x000000000000000F, $sgpr60_sgpr61, $sgpr64_sgpr65, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
+  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr18, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45:0x000000000000000F, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr60_sgpr61, $sgpr64_sgpr65, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT:   renamable $sgpr17 = S_BFE_U32 renamable $sgpr20, 65560, implicit-def dead $scc
   ; GFX90A-NEXT:   S_CMP_EQ_U32 killed renamable $sgpr17, 0, implicit-def $scc
   ; GFX90A-NEXT:   renamable $vgpr6 = V_ADD_CO_U32_e32 4096, $vgpr0, implicit-def $vcc, implicit $exec
-  ; GFX90A-NEXT:   renamable $vgpr7, dead renamable $sgpr50_sgpr51 = V_ADDC_U32_e64 0, 0, killed $vcc, 0, implicit $exec
+  ; GFX90A-NEXT:   renamable $vgpr7, dead renamable $sgpr52_sgpr53 = V_ADDC_U32_e64 0, 0, killed $vcc, 0, implicit $exec
   ; GFX90A-NEXT:   S_CBRANCH_SCC1 %bb.59, implicit killed $scc
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT: bb.53:
   ; GFX90A-NEXT:   successors: %bb.61(0x80000000)
-  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr18, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr60_sgpr61, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
+  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr18, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr60_sgpr61, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
   ; GFX90A-NEXT: {{  $}}
-  ; GFX90A-NEXT:   renamable $sgpr50_sgpr51 = S_MOV_B64 0
-  ; GFX90A-NEXT:   renamable $sgpr52_sgpr53 = S_MOV_B64 -1
+  ; GFX90A-NEXT:   renamable $sgpr52_sgpr53 = S_MOV_B64 0
+  ; GFX90A-NEXT:   renamable $sgpr44_sgpr45 = S_MOV_B64 -1
   ; GFX90A-NEXT:   renamable $sgpr62_sgpr63 = COPY renamable $sgpr36_sgpr37
   ; GFX90A-NEXT:   renamable $vgpr8_vgpr9 = IMPLICIT_DEF
   ; GFX90A-NEXT:   renamable $vgpr17 = IMPLICIT_DEF
@@ -730,13 +733,13 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT: bb.54.bb73:
   ; GFX90A-NEXT:   successors: %bb.52(0x40000000), %bb.55(0x40000000)
-  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr18, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr56_sgpr57:0x000000000000000F, $sgpr64_sgpr65, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr52_sgpr53
+  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr18, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45:0x000000000000000F, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr64_sgpr65, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr54_sgpr55
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT:   renamable $vgpr6 = GLOBAL_LOAD_UBYTE renamable $vgpr0_vgpr1, 2048, 0, implicit $exec :: (load (s8) from %ir.i74, addrspace 1)
   ; GFX90A-NEXT:   renamable $vgpr4 = V_ADD_CO_U32_e32 2048, $vgpr0, implicit-def $vcc, implicit $exec
-  ; GFX90A-NEXT:   renamable $sgpr48_sgpr49 = S_MOV_B64 0
-  ; GFX90A-NEXT:   renamable $sgpr54_sgpr55 = S_MOV_B64 -1
-  ; GFX90A-NEXT:   renamable $sgpr50_sgpr51 = COPY renamable $sgpr36_sgpr37
+  ; GFX90A-NEXT:   renamable $sgpr50_sgpr51 = S_MOV_B64 0
+  ; GFX90A-NEXT:   renamable $sgpr56_sgpr57 = S_MOV_B64 -1
+  ; GFX90A-NEXT:   renamable $sgpr52_sgpr53 = COPY renamable $sgpr36_sgpr37
   ; GFX90A-NEXT:   renamable $vgpr5, dead renamable $sgpr58_sgpr59 = V_ADDC_U32_e64 0, 0, killed $vcc, 0, implicit $exec
   ; GFX90A-NEXT:   renamable $vcc = V_CMP_EQ_U16_e64 0, killed $vgpr6, implicit $exec
   ; GFX90A-NEXT:   renamable $sgpr58_sgpr59 = S_MOV_B64 0
@@ -756,14 +759,14 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT: bb.55.Flow29:
   ; GFX90A-NEXT:   successors: %bb.45(0x80000000)
-  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $vgpr15, $vgpr17, $vgpr18, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr58_sgpr59, $sgpr60_sgpr61, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
+  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $vgpr15, $vgpr17, $vgpr18, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr56_sgpr57, $sgpr58_sgpr59, $sgpr60_sgpr61, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT:   $exec = S_OR_B64 $exec, killed renamable $sgpr60_sgpr61, implicit-def $scc
   ; GFX90A-NEXT:   S_BRANCH %bb.45
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT: bb.56.bb90:
   ; GFX90A-NEXT:   successors: %bb.60(0x80000000)
-  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $vgpr18, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr52_sgpr53, $sgpr56_sgpr57:0x000000000000000F, $sgpr60_sgpr61, $sgpr64_sgpr65, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
+  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $vgpr18, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45:0x000000000000000F, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr54_sgpr55, $sgpr60_sgpr61, $sgpr64_sgpr65, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT:   renamable $vgpr53 = V_CNDMASK_B32_e64 0, 0, 0, 1, killed $sgpr64_sgpr65, implicit $exec
   ; GFX90A-NEXT:   renamable $vgpr10 = V_MOV_B32_e32 0, implicit $exec
@@ -772,12 +775,12 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64
   ; GFX90A-NEXT:   renamable $vgpr16_vgpr17 = DS_READ_B64_gfx9 killed renamable $vgpr10, 0, 0, implicit $exec :: (load (s64) from %ir.7, addrspace 3)
   ; GFX90A-NEXT:   renamable $vgpr10 = COPY renamable $sgpr22, implicit $exec
   ; GFX90A-NEXT:   renamable $vgpr12_vgpr13 = DS_READ_B64_gfx9 killed renamable $vgpr10, 0, 0, implicit $exec :: (load (s64) from %ir.8, addrspace 3)
-  ; GFX90A-NEXT:   renamable $vgpr10 = COPY renamable $sgpr56, implicit $exec
-  ; GFX90A-NEXT:   renamable $vgpr11 = V_ALIGNBIT_B32_e64 killed $sgpr57, killed $vgpr10, 1, implicit $exec
+  ; GFX90A-NEXT:   renamable $vgpr10 = COPY renamable $sgpr44, implicit $exec
+  ; GFX90A-NEXT:   renamable $vgpr11 = V_ALIGNBIT_B32_e64 killed $sgpr45, killed $vgpr10, 1, implicit $exec
   ; GFX90A-NEXT:   renamable $vgpr52 = V_ALIGNBIT_B32_e64 $vgpr17, $vgpr16, 1, implicit $exec
   ; GFX90A-NEXT:   renamable $vgpr17 = V_CNDMASK_B32_e64 0, 0, 0, 1, $sgpr12_sgpr13, implicit $exec
   ; GFX90A-NEXT:   renamable $vgpr15 = V_ALIGNBIT_B32_e64 $vgpr15, $vgpr14, 1, implicit $exec
-  ; GFX90A-NEXT:   renamable $sgpr50_sgpr51 = S_XOR_B64 $exec, -1, implicit-def dead $scc
+  ; GFX90A-NEXT:   renamable $sgpr52_sgpr53 = S_XOR_B64 $exec, -1, implicit-def dead $scc
   ; GFX90A-NEXT:   renamable $sgpr62_sgpr63 = S_OR_B64 renamable $sgpr36_sgpr37, $exec, implicit-def dead $scc
   ; GFX90A-NEXT:   renamable $vgpr10 = COPY renamable $vgpr14, implicit $exec
   ; GFX90A-NEXT:   S_BRANCH %bb.60
@@ -788,15 +791,15 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT:   renamable $vgpr15 = COPY killed renamable $sgpr23, implicit $exec
   ; GFX90A-NEXT:   renamable $vgpr17 = COPY killed renamable $sgpr17, implicit $exec
-  ; GFX90A-NEXT:   renamable $sgpr58_sgpr59 = S_MOV_B64 0
+  ; GFX90A-NEXT:   renamable $sgpr46_sgpr47 = S_MOV_B64 0
+  ; GFX90A-NEXT:   renamable $sgpr42_sgpr43 = S_MOV_B64 0
+  ; GFX90A-NEXT:   renamable $sgpr40_sgpr41 = S_MOV_B64 0
+  ; GFX90A-NEXT:   renamable $sgpr66_sgpr67 = S_MOV_B64 0
+  ; GFX90A-NEXT:   renamable $sgpr64_sgpr65 = S_MOV_B64 0
   ; GFX90A-NEXT:   renamable $sgpr54_sgpr55 = S_MOV_B64 0
   ; GFX90A-NEXT:   renamable $sgpr52_sgpr53 = S_MOV_B64 0
   ; GFX90A-NEXT:   renamable $sgpr50_sgpr51 = S_MOV_B64 0
   ; GFX90A-NEXT:   renamable $sgpr48_sgpr49 = S_MOV_B64 0
-  ; GFX90A-NEXT:   renamable $sgpr46_sgpr47 = S_MOV_B64 0
-  ; GFX90A-NEXT:   renamable $sgpr44_sgpr45 = S_MOV_B64 0
-  ; GFX90A-NEXT:   renamable $sgpr42_sgpr43 = S_MOV_B64 0
-  ; GFX90A-NEXT:   renamable $sgpr40_sgpr41 = S_MOV_B64 0
   ; GFX90A-NEXT:   renamable $sgpr38_sgpr39 = S_MOV_B64 0
   ; GFX90A-NEXT:   renamable $vgpr8_vgpr9 = IMPLICIT_DEF
   ; GFX90A-NEXT:   renamable $vgpr6_vgpr7 = IMPLICIT_DEF
@@ -821,7 +824,7 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT: bb.58.bb105:
   ; GFX90A-NEXT:   successors: %bb.3(0x80000000)
-  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $sgpr33, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr56_sgpr57:0x000000000000000F, $sgpr20_sgpr21_sgpr22_sgpr23:0x00000000000000FF, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000FF, $vgpr2_vgpr3:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
+  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $sgpr33, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr44_sgpr45:0x000000000000000F, $sgpr20_sgpr21_sgpr22_sgpr23:0x00000000000000FF, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000FF, $vgpr2_vgpr3:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT:   renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec
   ; GFX90A-NEXT:   renamable $vgpr22_vgpr23 = DS_READ_B64_gfx9 killed renamable $vgpr0, 0, 0, implicit $exec :: (load (s64) from `ptr addrspace(3) null`, addrspace 3)
@@ -840,13 +843,13 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT: bb.59.bb85:
   ; GFX90A-NEXT:   successors: %bb.56(0x40000000), %bb.60(0x40000000)
-  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr18, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr56_sgpr57:0x000000000000000F, $sgpr60_sgpr61, $sgpr64_sgpr65, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
+  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr18, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45:0x000000000000000F, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr60_sgpr61, $sgpr64_sgpr65, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT:   renamable $vgpr8 = V_OR_B32_e32 1, $vgpr6, implicit $exec
   ; GFX90A-NEXT:   renamable $vgpr9 = COPY renamable $vgpr7, implicit $exec
   ; GFX90A-NEXT:   renamable $vgpr10 = FLAT_LOAD_UBYTE renamable $vgpr8_vgpr9, 0, 0, implicit $exec, implicit $flat_scr :: (load (s8) from %ir.i86)
   ; GFX90A-NEXT:   renamable $sgpr17 = S_MOV_B32 0
-  ; GFX90A-NEXT:   renamable $sgpr50_sgpr51 = S_MOV_B64 -1
+  ; GFX90A-NEXT:   renamable $sgpr52_sgpr53 = S_MOV_B64 -1
   ; GFX90A-NEXT:   renamable $vcc = V_CMP_EQ_U16_e64 0, killed $vgpr10, implicit $exec
   ; GFX90A-NEXT:   renamable $sgpr62_sgpr63 = COPY renamable $sgpr36_sgpr37
   ; GFX90A-NEXT:   renamable $vgpr17 = IMPLICIT_DEF
@@ -857,31 +860,31 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64
   ; GFX90A-NEXT:   renamable $vgpr53 = IMPLICIT_DEF
   ; GFX90A-NEXT:   renamable $vgpr13 = IMPLICIT_DEF implicit-def $vgpr12
   ; GFX90A-NEXT:   renamable $vgpr11 = IMPLICIT_DEF implicit-def $vgpr10
-  ; GFX90A-NEXT:   $sgpr52_sgpr53 = S_AND_SAVEEXEC_B64 $vcc, implicit-def $exec, implicit-def $scc, implicit $exec
+  ; GFX90A-NEXT:   $sgpr54_sgpr55 = S_AND_SAVEEXEC_B64 $vcc, implicit-def $exec, implicit-def $scc, implicit $exec
   ; GFX90A-NEXT:   S_CBRANCH_EXECNZ %bb.56, implicit $exec
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT: bb.60.Flow31:
   ; GFX90A-NEXT:   successors: %bb.61(0x80000000)
-  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $vgpr15, $vgpr17, $vgpr18, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr60_sgpr61, $sgpr62_sgpr63, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
+  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $vgpr15, $vgpr17, $vgpr18, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr60_sgpr61, $sgpr62_sgpr63, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
   ; GFX90A-NEXT: {{  $}}
-  ; GFX90A-NEXT:   $exec = S_OR_B64 $exec, killed renamable $sgpr52_sgpr53, implicit-def $scc
-  ; GFX90A-NEXT:   renamable $sgpr52_sgpr53 = S_MOV_B64 0
+  ; GFX90A-NEXT:   $exec = S_OR_B64 $exec, killed renamable $sgpr54_sgpr55, implicit-def $scc
+  ; GFX90A-NEXT:   renamable $sgpr44_sgpr45 = S_MOV_B64 0
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT: bb.61.Flow30:
   ; GFX90A-NEXT:   successors: %bb.55(0x80000000)
   ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $vgpr15, $vgpr17, $vgpr18, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr60_sgpr61, $sgpr62_sgpr63, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
   ; GFX90A-NEXT: {{  $}}
-  ; GFX90A-NEXT:   renamable $sgpr54_sgpr55 = S_XOR_B64 $exec, -1, implicit-def dead $scc
-  ; GFX90A-NEXT:   renamable $sgpr58_sgpr59 = S_AND_B64 killed renamable $sgpr52_sgpr53, $exec, implicit-def dead $scc
-  ; GFX90A-NEXT:   renamable $sgpr52_sgpr53 = S_AND_B64 killed renamable $sgpr50_sgpr51, $exec, implicit-def dead $scc
-  ; GFX90A-NEXT:   renamable $sgpr50_sgpr51 = S_ANDN2_B64 renamable $sgpr36_sgpr37, $exec, implicit-def dead $scc
-  ; GFX90A-NEXT:   renamable $sgpr56_sgpr57 = S_AND_B64 killed renamable $sgpr62_sgpr63, $exec, implicit-def dead $scc
-  ; GFX90A-NEXT:   renamable $sgpr50_sgpr51 = S_OR_B64 killed renamable $sgpr50_sgpr51, killed renamable $sgpr56_sgpr57, implicit-def dead $scc
+  ; GFX90A-NEXT:   renamable $sgpr56_sgpr57 = S_XOR_B64 $exec, -1, implicit-def dead $scc
+  ; GFX90A-NEXT:   renamable $sgpr58_sgpr59 = S_AND_B64 killed renamable $sgpr44_sgpr45, $exec, implicit-def dead $scc
+  ; GFX90A-NEXT:   renamable $sgpr54_sgpr55 = S_AND_B64 killed renamable $sgpr52_sgpr53, $exec, implicit-def dead $scc
+  ; GFX90A-NEXT:   renamable $sgpr44_sgpr45 = S_ANDN2_B64 renamable $sgpr36_sgpr37, $exec, implicit-def dead $scc
+  ; GFX90A-NEXT:   renamable $sgpr52_sgpr53 = S_AND_B64 killed renamable $sgpr62_sgpr63, $exec, implicit-def dead $scc
+  ; GFX90A-NEXT:   renamable $sgpr52_sgpr53 = S_OR_B64 killed renamable $sgpr44_sgpr45, killed renamable $sgpr52_sgpr53, implicit-def dead $scc
   ; GFX90A-NEXT:   S_BRANCH %bb.55
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT: bb.62.bb140:
   ; GFX90A-NEXT:   successors: %bb.68(0x40000000), %bb.63(0x40000000)
-  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr15, $vgpr17, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr58_sgpr59, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr18_vgpr19:0x000000000000000F, $vgpr20_vgpr21:0x000000000000000F, $vgpr22_vgpr23:0x000000000000000F, $vgpr24_vgpr25:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
+  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr15, $vgpr17, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr64_sgpr65, $sgpr66_sgpr67, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr18_vgpr19:0x000000000000000F, $vgpr20_vgpr21:0x000000000000000F, $vgpr22_vgpr23:0x000000000000000F, $vgpr24_vgpr25:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT:   renamable $sgpr36_sgpr37 = S_MOV_B64 -1
   ; GFX90A-NEXT:   renamable $vcc = S_AND_B64 $exec, killed renamable $sgpr30_sgpr31, implicit-def dead $scc
@@ -889,14 +892,14 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT: bb.63.Flow13:
   ; GFX90A-NEXT:   successors: %bb.64(0x40000000), %bb.66(0x40000000)
-  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr15, $vgpr17, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr58_sgpr59, $vgpr0_vgpr1:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000C, $vgpr12_vgpr13:0x000000000000000C, $vgpr18_vgpr19:0x000000000000000C, $vgpr20_vgpr21:0x000000000000000C, $vgpr22_vgpr23:0x000000000000000C, $vgpr24_vgpr25:0x000000000000000C, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
+  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr15, $vgpr17, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr64_sgpr65, $sgpr66_sgpr67, $vgpr0_vgpr1:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000C, $vgpr12_vgpr13:0x000000000000000C, $vgpr18_vgpr19:0x000000000000000C, $vgpr20_vgpr21:0x000000000000000C, $vgpr22_vgpr23:0x000000000000000C, $vgpr24_vgpr25:0x000000000000000C, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT:   $vcc = S_ANDN2_B64 $exec, killed renamable $sgpr36_sgpr37, implicit-def dead $scc
   ; GFX90A-NEXT:   S_CBRANCH_VCCNZ %bb.66, implicit $vcc
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT: bb.64.bb159:
   ; GFX90A-NEXT:   successors: %bb.67(0x40000000), %bb.65(0x40000000)
-  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr15, $vgpr17, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr58_sgpr59, $vgpr0_vgpr1:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000C, $vgpr12_vgpr13:0x000000000000000C, $vgpr18_vgpr19:0x000000000000000C, $vgpr20_vgpr21:0x000000000000000C, $vgpr22_vgpr23:0x000000000000000C, $vgpr24_vgpr25:0x000000000000000C, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
+  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr15, $vgpr17, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr64_sgpr65, $sgpr66_sgpr67, $vgpr0_vgpr1:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000C, $vgpr12_vgpr13:0x000000000000000C, $vgpr18_vgpr19:0x000000000000000C, $vgpr20_vgpr21:0x000000000000000C, $vgpr22_vgpr23:0x000000000000000C, $vgpr24_vgpr25:0x000000000000000C, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT:   renamable $vcc = V_CMP_NE_U32_e64 0, killed $vgpr30, implicit $exec
   ; GFX90A-NEXT:   $sgpr12_sgpr13 = S_AND_SAVEEXEC_B64 $vcc, implicit-def $exec, implicit-def $scc, implicit $exec
@@ -905,21 +908,21 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT: bb.65.Flow10:
   ; GFX90A-NEXT:   successors: %bb.66(0x80000000)
-  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr58_sgpr59, $vgpr0_vgpr1:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
+  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr64_sgpr65, $sgpr66_sgpr67, $vgpr0_vgpr1:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT:   $sgpr12_sgpr13 = S_ANDN2_SAVEEXEC_B64 $sgpr12_sgpr13, implicit-def $exec, implicit-def $scc, implicit $exec
   ; GFX90A-NEXT:   $exec = S_OR_B64 $exec, killed renamable $sgpr12_sgpr13, implicit-def $scc
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT: bb.66.Flow14:
   ; GFX90A-NEXT:   successors: %bb.8(0x80000000)
-  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr58_sgpr59, $vgpr0_vgpr1:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
+  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr64_sgpr65, $sgpr66_sgpr67, $vgpr0_vgpr1:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
   ; GFX90A-NEXT: {{  $}}
-  ; GFX90A-NEXT:   renamable $sgpr56_sgpr57 = COPY $exec
+  ; GFX90A-NEXT:   renamable $sgpr68_sgpr69 = COPY $exec
   ; GFX90A-NEXT:   S_BRANCH %bb.8
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT: bb.67.bb161:
   ; GFX90A-NEXT:   successors: %bb.65(0x80000000)
-  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr15, $vgpr17, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr58_sgpr59, $vgpr0_vgpr1:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000C, $vgpr12_vgpr13:0x000000000000000C, $vgpr18_vgpr19:0x000000000000000C, $vgpr20_vgpr21:0x000000000000000C, $vgpr22_vgpr23:0x000000000000000C, $vgpr24_vgpr25:0x000000000000000C, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
+  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr15, $vgpr17, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr64_sgpr65, $sgpr66_sgpr67, $vgpr0_vgpr1:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000C, $vgpr12_vgpr13:0x000000000000000C, $vgpr18_vgpr19:0x000000000000000C, $vgpr20_vgpr21:0x000000000000000C, $vgpr22_vgpr23:0x000000000000000C, $vgpr24_vgpr25:0x000000000000000C, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT:   renamable $vgpr2 = V_OR_B32_e32 killed $vgpr21, killed $vgpr23, implicit $exec
   ; GFX90A-NEXT:   renamable $vgpr2 = V_OR_B32_e32 killed $vgpr2, killed $vgpr25, implicit $exec
@@ -938,7 +941,7 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT: bb.68.bb174:
   ; GFX90A-NEXT:   successors: %bb.72(0x40000000), %bb.69(0x40000000)
-  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr15, $vgpr17, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr58_sgpr59, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr18_vgpr19:0x000000000000000F, $vgpr20_vgpr21:0x000000000000000F, $vgpr22_vgpr23:0x000000000000000F, $vgpr24_vgpr25:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
+  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr15, $vgpr17, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr64_sgpr65, $sgpr66_sgpr67, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr18_vgpr19:0x000000000000000F, $vgpr20_vgpr21:0x000000000000000F, $vgpr22_vgpr23:0x000000000000000F, $vgpr24_vgpr25:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT:   renamable $vgpr26 = V_OR_B32_e32 1, $vgpr24, implicit $exec
   ; GFX90A-NEXT:   renamable $vgpr48 = V_OR_B32_e32 $vgpr26, $vgpr22, implicit $exec
@@ -954,14 +957,14 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT: bb.69.Flow:
   ; GFX90A-NEXT:   successors: %bb.70(0x40000000), %bb.71(0x40000000)
-  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr15, $vgpr17, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr58_sgpr59, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000C, $vgpr12_vgpr13:0x000000000000000C, $vgpr18_vgpr19:0x000000000000000C, $vgpr20_vgpr21:0x000000000000000C, $vgpr22_vgpr23:0x000000000000000C, $vgpr24_vgpr25:0x000000000000000C, $vgpr26_vgpr27:0x0000000000000003, $vgpr28_vgpr29:0x0000000000000003, $vgpr32_vgpr33:0x0000000000000003, $vgpr34_vgpr35:0x0000000000000003, $vgpr36_vgpr37:0x0000000000000003, $vgpr38_vgpr39:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr48_vgpr49:0x0000000000000003, $vgpr50_vgpr51:0x0000000000000003, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
+  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr15, $vgpr17, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr64_sgpr65, $sgpr66_sgpr67, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000C, $vgpr12_vgpr13:0x000000000000000C, $vgpr18_vgpr19:0x000000000000000C, $vgpr20_vgpr21:0x000000000000000C, $vgpr22_vgpr23:0x000000000000000C, $vgpr24_vgpr25:0x000000000000000C, $vgpr26_vgpr27:0x0000000000000003, $vgpr28_vgpr29:0x0000000000000003, $vgpr32_vgpr33:0x0000000000000003, $vgpr34_vgpr35:0x0000000000000003, $vgpr36_vgpr37:0x0000000000000003, $vgpr38_vgpr39:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr48_vgpr49:0x0000000000000003, $vgpr50_vgpr51:0x0000000000000003, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT:   $vcc = S_ANDN2_B64 $exec, killed renamable $sgpr12_sgpr13, implicit-def dead $scc
   ; GFX90A-NEXT:   S_CBRANCH_VCCNZ %bb.71, implicit $vcc
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT: bb.70.bb186:
   ; GFX90A-NEXT:   successors: %bb.71(0x80000000)
-  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr15, $vgpr17, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr58_sgpr59, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000C, $vgpr12_vgpr13:0x000000000000000C, $vgpr18_vgpr19:0x000000000000000C, $vgpr20_vgpr21:0x000000000000000C, $vgpr22_vgpr23:0x000000000000000C, $vgpr24_vgpr25:0x000000000000000C, $vgpr26_vgpr27:0x0000000000000003, $vgpr28_vgpr29:0x0000000000000003, $vgpr32_vgpr33:0x0000000000000003, $vgpr34_vgpr35:0x0000000000000003, $vgpr36_vgpr37:0x0000000000000003, $vgpr38_vgpr39:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr48_vgpr49:0x0000000000000003, $vgpr50_vgpr51:0x0000000000000003, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
+  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr15, $vgpr17, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr64_sgpr65, $sgpr66_sgpr67, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000C, $vgpr12_vgpr13:0x000000000000000C, $vgpr18_vgpr19:0x000000000000000C, $vgpr20_vgpr21:0x000000000000000C, $vgpr22_vgpr23:0x000000000000000C, $vgpr24_vgpr25:0x000000000000000C, $vgpr26_vgpr27:0x0000000000000003, $vgpr28_vgpr29:0x0000000000000003, $vgpr32_vgpr33:0x0000000000000003, $vgpr34_vgpr35:0x0000000000000003, $vgpr36_vgpr37:0x0000000000000003, $vgpr38_vgpr39:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr48_vgpr49:0x0000000000000003, $vgpr50_vgpr51:0x0000000000000003, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT:   renamable $vgpr2_vgpr3 = V_LSHLREV_B64_e64 3, killed $vgpr2_vgpr3, implicit $exec
   ; GFX90A-NEXT:   renamable $vgpr10 = COPY renamable $sgpr27, implicit $exec
@@ -990,14 +993,14 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT: bb.71.Flow9:
   ; GFX90A-NEXT:   successors: %bb.63(0x80000000)
-  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr15, $vgpr17, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr58_sgpr59, $vgpr0_vgpr1:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000C, $vgpr12_vgpr13:0x000000000000000C, $vgpr18_vgpr19:0x000000000000000C, $vgpr20_vgpr21:0x000000000000000C, $vgpr22_vgpr23:0x000000000000000C, $vgpr24_vgpr25:0x000000000000000C, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
+  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr15, $vgpr17, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr64_sgpr65, $sgpr66_sgpr67, $vgpr0_vgpr1:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000C, $vgpr12_vgpr13:0x000000000000000C, $vgpr18_vgpr19:0x000000000000000C, $vgpr20_vgpr21:0x000000000000000C, $vgpr22_vgpr23:0x000000000000000C, $vgpr24_vgpr25:0x000000000000000C, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT:   renamable $sgpr36_sgpr37 = S_MOV_B64 0
   ; GFX90A-NEXT:   S_BRANCH %bb.63
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT: bb.72.bb196:
   ; GFX90A-NEXT:   successors: %bb.69(0x80000000)
-  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr15, $vgpr17, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr58_sgpr59, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000C, $vgpr12_vgpr13:0x000000000000000C, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr18_vgpr19:0x000000000000000C, $vgpr20_vgpr21:0x000000000000000C, $vgpr22_vgpr23:0x000000000000000C, $vgpr24_vgpr25:0x000000000000000C, $vgpr26_vgpr27:0x0000000000000003, $vgpr28_vgpr29:0x0000000000000003, $vgpr32_vgpr33:0x0000000000000003, $vgpr34_vgpr35:0x0000000000000003, $vgpr36_vgpr37:0x0000000000000003, $vgpr38_vgpr39:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr48_vgpr49:0x0000000000000003, $vgpr50_vgpr51:0x0000000000000003, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
+  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr15, $vgpr17, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr64_sgpr65, $sgpr66_sgpr67, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000C, $vgpr12_vgpr13:0x000000000000000C, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr18_vgpr19:0x000000000000000C, $vgpr20_vgpr21:0x000000000000000C, $vgpr22_vgpr23:0x000000000000000C, $vgpr24_vgpr25:0x000000000000000C, $vgpr26_vgpr27:0x0000000000000003, $vgpr28_vgpr29:0x0000000000000003, $vgpr32_vgpr33:0x0000000000000003, $vgpr34_vgpr35:0x0000000000000003, $vgpr36_vgpr37:0x0000000000000003, $vgpr38_vgpr39:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr48_vgpr49:0x0000000000000003, $vgpr50_vgpr51:0x0000000000000003, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT:   renamable $vgpr10 = V_OR_B32_e32 $vgpr50, killed $vgpr16, implicit $exec
   ; GFX90A-NEXT:   renamable $vgpr54 = V_OR_B32_e32 killed $vgpr10, killed $vgpr14, implicit $exec

diff  --git a/llvm/test/CodeGen/AMDGPU/branch-relax-spill.ll b/llvm/test/CodeGen/AMDGPU/branch-relax-spill.ll
index 05c2e0077f4ae..83ab6c32aee96 100644
--- a/llvm/test/CodeGen/AMDGPU/branch-relax-spill.ll
+++ b/llvm/test/CodeGen/AMDGPU/branch-relax-spill.ll
@@ -900,9 +900,8 @@ define void @spill_func(ptr addrspace(1) %arg) #0 {
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; CHECK-NEXT:    buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
-; CHECK-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
 ; CHECK-NEXT:    s_mov_b64 exec, s[4:5]
-; CHECK-NEXT:    s_waitcnt expcnt(1)
+; CHECK-NEXT:    s_waitcnt expcnt(0)
 ; CHECK-NEXT:    v_writelane_b32 v0, s30, 0
 ; CHECK-NEXT:    v_writelane_b32 v0, s31, 1
 ; CHECK-NEXT:    v_writelane_b32 v0, s33, 2
@@ -912,71 +911,38 @@ define void @spill_func(ptr addrspace(1) %arg) #0 {
 ; CHECK-NEXT:    v_writelane_b32 v0, s37, 6
 ; CHECK-NEXT:    v_writelane_b32 v0, s38, 7
 ; CHECK-NEXT:    v_writelane_b32 v0, s39, 8
-; CHECK-NEXT:    v_writelane_b32 v0, s40, 9
-; CHECK-NEXT:    v_writelane_b32 v0, s41, 10
-; CHECK-NEXT:    v_writelane_b32 v0, s42, 11
-; CHECK-NEXT:    v_writelane_b32 v0, s43, 12
-; CHECK-NEXT:    v_writelane_b32 v0, s44, 13
-; CHECK-NEXT:    v_writelane_b32 v0, s45, 14
-; CHECK-NEXT:    v_writelane_b32 v0, s46, 15
-; CHECK-NEXT:    v_writelane_b32 v0, s47, 16
-; CHECK-NEXT:    v_writelane_b32 v0, s48, 17
-; CHECK-NEXT:    v_writelane_b32 v0, s49, 18
-; CHECK-NEXT:    v_writelane_b32 v0, s50, 19
-; CHECK-NEXT:    v_writelane_b32 v0, s51, 20
-; CHECK-NEXT:    v_writelane_b32 v0, s52, 21
-; CHECK-NEXT:    v_writelane_b32 v0, s53, 22
-; CHECK-NEXT:    v_writelane_b32 v0, s54, 23
-; CHECK-NEXT:    v_writelane_b32 v0, s55, 24
-; CHECK-NEXT:    v_writelane_b32 v0, s56, 25
-; CHECK-NEXT:    v_writelane_b32 v0, s57, 26
-; CHECK-NEXT:    v_writelane_b32 v0, s58, 27
-; CHECK-NEXT:    v_writelane_b32 v0, s59, 28
-; CHECK-NEXT:    v_writelane_b32 v0, s60, 29
-; CHECK-NEXT:    v_writelane_b32 v0, s61, 30
-; CHECK-NEXT:    v_writelane_b32 v0, s62, 31
-; CHECK-NEXT:    v_writelane_b32 v0, s63, 32
-; CHECK-NEXT:    v_writelane_b32 v0, s64, 33
-; CHECK-NEXT:    v_writelane_b32 v0, s65, 34
-; CHECK-NEXT:    v_writelane_b32 v0, s66, 35
-; CHECK-NEXT:    v_writelane_b32 v0, s67, 36
-; CHECK-NEXT:    v_writelane_b32 v0, s68, 37
-; CHECK-NEXT:    v_writelane_b32 v0, s69, 38
-; CHECK-NEXT:    v_writelane_b32 v0, s70, 39
-; CHECK-NEXT:    v_writelane_b32 v0, s71, 40
-; CHECK-NEXT:    v_writelane_b32 v0, s72, 41
-; CHECK-NEXT:    v_writelane_b32 v0, s73, 42
-; CHECK-NEXT:    v_writelane_b32 v0, s74, 43
-; CHECK-NEXT:    v_writelane_b32 v0, s75, 44
-; CHECK-NEXT:    v_writelane_b32 v0, s76, 45
-; CHECK-NEXT:    v_writelane_b32 v0, s77, 46
-; CHECK-NEXT:    v_writelane_b32 v0, s78, 47
-; CHECK-NEXT:    v_writelane_b32 v0, s79, 48
-; CHECK-NEXT:    v_writelane_b32 v0, s80, 49
-; CHECK-NEXT:    v_writelane_b32 v0, s81, 50
-; CHECK-NEXT:    v_writelane_b32 v0, s82, 51
-; CHECK-NEXT:    v_writelane_b32 v0, s83, 52
-; CHECK-NEXT:    v_writelane_b32 v0, s84, 53
-; CHECK-NEXT:    v_writelane_b32 v0, s85, 54
-; CHECK-NEXT:    v_writelane_b32 v0, s86, 55
-; CHECK-NEXT:    v_writelane_b32 v0, s87, 56
-; CHECK-NEXT:    v_writelane_b32 v0, s88, 57
-; CHECK-NEXT:    s_waitcnt expcnt(0)
-; CHECK-NEXT:    v_writelane_b32 v1, s95, 0
-; CHECK-NEXT:    v_writelane_b32 v0, s89, 58
-; CHECK-NEXT:    v_writelane_b32 v1, s96, 1
-; CHECK-NEXT:    v_writelane_b32 v0, s90, 59
-; CHECK-NEXT:    v_writelane_b32 v1, s97, 2
-; CHECK-NEXT:    v_writelane_b32 v0, s91, 60
-; CHECK-NEXT:    v_writelane_b32 v1, s98, 3
-; CHECK-NEXT:    v_writelane_b32 v0, s92, 61
-; CHECK-NEXT:    v_writelane_b32 v1, s99, 4
-; CHECK-NEXT:    s_mov_b32 s31, s12
-; CHECK-NEXT:    v_writelane_b32 v0, s93, 62
-; CHECK-NEXT:    v_writelane_b32 v1, s100, 5
-; CHECK-NEXT:    s_cmp_eq_u32 s31, 0
-; CHECK-NEXT:    v_writelane_b32 v0, s94, 63
-; CHECK-NEXT:    v_writelane_b32 v1, s101, 6
+; CHECK-NEXT:    v_writelane_b32 v0, s48, 9
+; CHECK-NEXT:    v_writelane_b32 v0, s49, 10
+; CHECK-NEXT:    v_writelane_b32 v0, s50, 11
+; CHECK-NEXT:    v_writelane_b32 v0, s51, 12
+; CHECK-NEXT:    v_writelane_b32 v0, s52, 13
+; CHECK-NEXT:    v_writelane_b32 v0, s53, 14
+; CHECK-NEXT:    v_writelane_b32 v0, s54, 15
+; CHECK-NEXT:    v_writelane_b32 v0, s55, 16
+; CHECK-NEXT:    v_writelane_b32 v0, s64, 17
+; CHECK-NEXT:    v_writelane_b32 v0, s65, 18
+; CHECK-NEXT:    v_writelane_b32 v0, s66, 19
+; CHECK-NEXT:    v_writelane_b32 v0, s67, 20
+; CHECK-NEXT:    v_writelane_b32 v0, s68, 21
+; CHECK-NEXT:    v_writelane_b32 v0, s69, 22
+; CHECK-NEXT:    v_writelane_b32 v0, s70, 23
+; CHECK-NEXT:    v_writelane_b32 v0, s71, 24
+; CHECK-NEXT:    v_writelane_b32 v0, s80, 25
+; CHECK-NEXT:    v_writelane_b32 v0, s81, 26
+; CHECK-NEXT:    v_writelane_b32 v0, s82, 27
+; CHECK-NEXT:    v_writelane_b32 v0, s83, 28
+; CHECK-NEXT:    v_writelane_b32 v0, s84, 29
+; CHECK-NEXT:    v_writelane_b32 v0, s85, 30
+; CHECK-NEXT:    v_writelane_b32 v0, s86, 31
+; CHECK-NEXT:    v_writelane_b32 v0, s87, 32
+; CHECK-NEXT:    v_writelane_b32 v0, s96, 33
+; CHECK-NEXT:    v_writelane_b32 v0, s97, 34
+; CHECK-NEXT:    v_writelane_b32 v0, s98, 35
+; CHECK-NEXT:    v_writelane_b32 v0, s99, 36
+; CHECK-NEXT:    s_mov_b32 s40, s12
+; CHECK-NEXT:    v_writelane_b32 v0, s100, 37
+; CHECK-NEXT:    s_cmp_eq_u32 s40, 0
+; CHECK-NEXT:    v_writelane_b32 v0, s101, 38
 ; CHECK-NEXT:    ;;#ASMSTART
 ; CHECK-NEXT:    s_mov_b32 s0, 0
 ; CHECK-NEXT:    ;;#ASMEND
@@ -1292,9 +1258,9 @@ define void @spill_func(ptr addrspace(1) %arg) #0 {
 ; CHECK-NEXT:    s_cbranch_scc0 .LBB1_1
 ; CHECK-NEXT:  ; %bb.3: ; %entry
 ; CHECK-NEXT:    s_not_b64 exec, exec
-; CHECK-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:8
-; CHECK-NEXT:    v_writelane_b32 v2, s0, 0
-; CHECK-NEXT:    v_writelane_b32 v2, s1, 1
+; CHECK-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:4
+; CHECK-NEXT:    v_writelane_b32 v1, s0, 0
+; CHECK-NEXT:    v_writelane_b32 v1, s1, 1
 ; CHECK-NEXT:    s_getpc_b64 s[0:1]
 ; CHECK-NEXT:  .Lpost_getpc1:
 ; CHECK-NEXT:    s_add_u32 s0, s0, (.LBB1_4-.Lpost_getpc1)&4294967295
@@ -1313,9 +1279,9 @@ define void @spill_func(ptr addrspace(1) %arg) #0 {
 ; CHECK-NEXT:    ;;#ASMEND
 ; CHECK-NEXT:    s_branch .LBB1_2
 ; CHECK-NEXT:  .LBB1_4: ; %bb3
-; CHECK-NEXT:    v_readlane_b32 s0, v2, 0
-; CHECK-NEXT:    v_readlane_b32 s1, v2, 1
-; CHECK-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:8
+; CHECK-NEXT:    v_readlane_b32 s0, v1, 0
+; CHECK-NEXT:    v_readlane_b32 s1, v1, 1
+; CHECK-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:4
 ; CHECK-NEXT:    s_not_b64 exec, exec
 ; CHECK-NEXT:  .LBB1_2: ; %bb3
 ; CHECK-NEXT:    ;;#ASMSTART
@@ -1630,68 +1596,36 @@ define void @spill_func(ptr addrspace(1) %arg) #0 {
 ; CHECK-NEXT:    ;;#ASMSTART
 ; CHECK-NEXT:    ; reg use vcc_hi
 ; CHECK-NEXT:    ;;#ASMEND
-; CHECK-NEXT:    v_readlane_b32 s101, v1, 6
-; CHECK-NEXT:    v_readlane_b32 s100, v1, 5
-; CHECK-NEXT:    v_readlane_b32 s99, v1, 4
-; CHECK-NEXT:    v_readlane_b32 s98, v1, 3
-; CHECK-NEXT:    v_readlane_b32 s97, v1, 2
-; CHECK-NEXT:    v_readlane_b32 s96, v1, 1
-; CHECK-NEXT:    v_readlane_b32 s95, v1, 0
-; CHECK-NEXT:    v_readlane_b32 s94, v0, 63
-; CHECK-NEXT:    v_readlane_b32 s93, v0, 62
-; CHECK-NEXT:    v_readlane_b32 s92, v0, 61
-; CHECK-NEXT:    v_readlane_b32 s91, v0, 60
-; CHECK-NEXT:    v_readlane_b32 s90, v0, 59
-; CHECK-NEXT:    v_readlane_b32 s89, v0, 58
-; CHECK-NEXT:    v_readlane_b32 s88, v0, 57
-; CHECK-NEXT:    v_readlane_b32 s87, v0, 56
-; CHECK-NEXT:    v_readlane_b32 s86, v0, 55
-; CHECK-NEXT:    v_readlane_b32 s85, v0, 54
-; CHECK-NEXT:    v_readlane_b32 s84, v0, 53
-; CHECK-NEXT:    v_readlane_b32 s83, v0, 52
-; CHECK-NEXT:    v_readlane_b32 s82, v0, 51
-; CHECK-NEXT:    v_readlane_b32 s81, v0, 50
-; CHECK-NEXT:    v_readlane_b32 s80, v0, 49
-; CHECK-NEXT:    v_readlane_b32 s79, v0, 48
-; CHECK-NEXT:    v_readlane_b32 s78, v0, 47
-; CHECK-NEXT:    v_readlane_b32 s77, v0, 46
-; CHECK-NEXT:    v_readlane_b32 s76, v0, 45
-; CHECK-NEXT:    v_readlane_b32 s75, v0, 44
-; CHECK-NEXT:    v_readlane_b32 s74, v0, 43
-; CHECK-NEXT:    v_readlane_b32 s73, v0, 42
-; CHECK-NEXT:    v_readlane_b32 s72, v0, 41
-; CHECK-NEXT:    v_readlane_b32 s71, v0, 40
-; CHECK-NEXT:    v_readlane_b32 s70, v0, 39
-; CHECK-NEXT:    v_readlane_b32 s69, v0, 38
-; CHECK-NEXT:    v_readlane_b32 s68, v0, 37
-; CHECK-NEXT:    v_readlane_b32 s67, v0, 36
-; CHECK-NEXT:    v_readlane_b32 s66, v0, 35
-; CHECK-NEXT:    v_readlane_b32 s65, v0, 34
-; CHECK-NEXT:    v_readlane_b32 s64, v0, 33
-; CHECK-NEXT:    v_readlane_b32 s63, v0, 32
-; CHECK-NEXT:    v_readlane_b32 s62, v0, 31
-; CHECK-NEXT:    v_readlane_b32 s61, v0, 30
-; CHECK-NEXT:    v_readlane_b32 s60, v0, 29
-; CHECK-NEXT:    v_readlane_b32 s59, v0, 28
-; CHECK-NEXT:    v_readlane_b32 s58, v0, 27
-; CHECK-NEXT:    v_readlane_b32 s57, v0, 26
-; CHECK-NEXT:    v_readlane_b32 s56, v0, 25
-; CHECK-NEXT:    v_readlane_b32 s55, v0, 24
-; CHECK-NEXT:    v_readlane_b32 s54, v0, 23
-; CHECK-NEXT:    v_readlane_b32 s53, v0, 22
-; CHECK-NEXT:    v_readlane_b32 s52, v0, 21
-; CHECK-NEXT:    v_readlane_b32 s51, v0, 20
-; CHECK-NEXT:    v_readlane_b32 s50, v0, 19
-; CHECK-NEXT:    v_readlane_b32 s49, v0, 18
-; CHECK-NEXT:    v_readlane_b32 s48, v0, 17
-; CHECK-NEXT:    v_readlane_b32 s47, v0, 16
-; CHECK-NEXT:    v_readlane_b32 s46, v0, 15
-; CHECK-NEXT:    v_readlane_b32 s45, v0, 14
-; CHECK-NEXT:    v_readlane_b32 s44, v0, 13
-; CHECK-NEXT:    v_readlane_b32 s43, v0, 12
-; CHECK-NEXT:    v_readlane_b32 s42, v0, 11
-; CHECK-NEXT:    v_readlane_b32 s41, v0, 10
-; CHECK-NEXT:    v_readlane_b32 s40, v0, 9
+; CHECK-NEXT:    v_readlane_b32 s101, v0, 38
+; CHECK-NEXT:    v_readlane_b32 s100, v0, 37
+; CHECK-NEXT:    v_readlane_b32 s99, v0, 36
+; CHECK-NEXT:    v_readlane_b32 s98, v0, 35
+; CHECK-NEXT:    v_readlane_b32 s97, v0, 34
+; CHECK-NEXT:    v_readlane_b32 s96, v0, 33
+; CHECK-NEXT:    v_readlane_b32 s87, v0, 32
+; CHECK-NEXT:    v_readlane_b32 s86, v0, 31
+; CHECK-NEXT:    v_readlane_b32 s85, v0, 30
+; CHECK-NEXT:    v_readlane_b32 s84, v0, 29
+; CHECK-NEXT:    v_readlane_b32 s83, v0, 28
+; CHECK-NEXT:    v_readlane_b32 s82, v0, 27
+; CHECK-NEXT:    v_readlane_b32 s81, v0, 26
+; CHECK-NEXT:    v_readlane_b32 s80, v0, 25
+; CHECK-NEXT:    v_readlane_b32 s71, v0, 24
+; CHECK-NEXT:    v_readlane_b32 s70, v0, 23
+; CHECK-NEXT:    v_readlane_b32 s69, v0, 22
+; CHECK-NEXT:    v_readlane_b32 s68, v0, 21
+; CHECK-NEXT:    v_readlane_b32 s67, v0, 20
+; CHECK-NEXT:    v_readlane_b32 s66, v0, 19
+; CHECK-NEXT:    v_readlane_b32 s65, v0, 18
+; CHECK-NEXT:    v_readlane_b32 s64, v0, 17
+; CHECK-NEXT:    v_readlane_b32 s55, v0, 16
+; CHECK-NEXT:    v_readlane_b32 s54, v0, 15
+; CHECK-NEXT:    v_readlane_b32 s53, v0, 14
+; CHECK-NEXT:    v_readlane_b32 s52, v0, 13
+; CHECK-NEXT:    v_readlane_b32 s51, v0, 12
+; CHECK-NEXT:    v_readlane_b32 s50, v0, 11
+; CHECK-NEXT:    v_readlane_b32 s49, v0, 10
+; CHECK-NEXT:    v_readlane_b32 s48, v0, 9
 ; CHECK-NEXT:    v_readlane_b32 s39, v0, 8
 ; CHECK-NEXT:    v_readlane_b32 s38, v0, 7
 ; CHECK-NEXT:    v_readlane_b32 s37, v0, 6
@@ -1703,7 +1637,6 @@ define void @spill_func(ptr addrspace(1) %arg) #0 {
 ; CHECK-NEXT:    v_readlane_b32 s30, v0, 0
 ; CHECK-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; CHECK-NEXT:    buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
-; CHECK-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
 ; CHECK-NEXT:    s_mov_b64 exec, s[4:5]
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]

diff  --git a/llvm/test/CodeGen/AMDGPU/call-args-inreg-no-sgpr-for-csrspill-xfail.ll b/llvm/test/CodeGen/AMDGPU/call-args-inreg-no-sgpr-for-csrspill-xfail.ll
index d4c50cf2c7e4a..34f4476f7fd6a 100644
--- a/llvm/test/CodeGen/AMDGPU/call-args-inreg-no-sgpr-for-csrspill-xfail.ll
+++ b/llvm/test/CodeGen/AMDGPU/call-args-inreg-no-sgpr-for-csrspill-xfail.ll
@@ -1,6 +1,6 @@
-; RUN: not --crash llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs=0 -filetype=null %s 2>&1 | FileCheck -enable-var-scope %s
+; RUN: not llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs=0 -filetype=null %s 2>&1 | FileCheck -enable-var-scope %s
 
-; CHECK: LLVM ERROR: failed to find free scratch register
+; CHECK: illegal VGPR to SGPR copy
 
 declare hidden void @external_void_func_a15i32_inreg([15 x i32] inreg) #0
 declare hidden void @external_void_func_a16i32_inreg([16 x i32] inreg) #0

diff  --git a/llvm/test/CodeGen/AMDGPU/call-args-inreg.ll b/llvm/test/CodeGen/AMDGPU/call-args-inreg.ll
index 0b8ad359ccb94..c10cb0ae6d336 100644
--- a/llvm/test/CodeGen/AMDGPU/call-args-inreg.ll
+++ b/llvm/test/CodeGen/AMDGPU/call-args-inreg.ll
@@ -1385,15 +1385,15 @@ define void @test_call_external_void_func_a15i32_inreg([13 x i32] inreg %arg0) #
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_mov_b32 s29, s33
 ; GFX9-NEXT:    s_mov_b32 s33, s32
-; GFX9-NEXT:    s_or_saveexec_b64 vcc, -1
+; GFX9-NEXT:    s_or_saveexec_b64 s[40:41], -1
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
-; GFX9-NEXT:    s_mov_b64 exec, vcc
+; GFX9-NEXT:    s_mov_b64 exec, s[40:41]
 ; GFX9-NEXT:    v_writelane_b32 v40, s29, 2
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
-; GFX9-NEXT:    s_getpc_b64 vcc
-; GFX9-NEXT:    s_add_u32 vcc_lo, vcc_lo, external_void_func_a15i32_inreg at rel32@lo+4
-; GFX9-NEXT:    s_addc_u32 vcc_hi, vcc_hi, external_void_func_a15i32_inreg at rel32@hi+12
+; GFX9-NEXT:    s_getpc_b64 s[40:41]
+; GFX9-NEXT:    s_add_u32 s40, s40, external_void_func_a15i32_inreg at rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s41, s41, external_void_func_a15i32_inreg at rel32@hi+12
 ; GFX9-NEXT:    s_mov_b32 s3, s19
 ; GFX9-NEXT:    s_mov_b32 s2, s18
 ; GFX9-NEXT:    s_mov_b32 s1, s17
@@ -1408,7 +1408,7 @@ define void @test_call_external_void_func_a15i32_inreg([13 x i32] inreg %arg0) #
 ; GFX9-NEXT:    s_mov_b32 s23, s27
 ; GFX9-NEXT:    s_mov_b32 s24, s28
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
-; GFX9-NEXT:    s_swappc_b64 s[30:31], vcc
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[40:41]
 ; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
 ; GFX9-NEXT:    s_mov_b32 s32, s33

diff  --git a/llvm/test/CodeGen/AMDGPU/call-argument-types.ll b/llvm/test/CodeGen/AMDGPU/call-argument-types.ll
index 9bbecacd6c774..3451e389fef8b 100644
--- a/llvm/test/CodeGen/AMDGPU/call-argument-types.ll
+++ b/llvm/test/CodeGen/AMDGPU/call-argument-types.ll
@@ -4542,19 +4542,19 @@ define amdgpu_kernel void @test_call_external_void_func_v32i32_i32(i32) #0 {
 define amdgpu_kernel void @test_call_external_i32_func_i32_imm(ptr addrspace(1) %out) #0 {
 ; VI-LABEL: test_call_external_i32_func_i32_imm:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_mov_b32 s40, SCRATCH_RSRC_DWORD0
-; VI-NEXT:    s_mov_b32 s41, SCRATCH_RSRC_DWORD1
-; VI-NEXT:    s_mov_b32 s42, -1
-; VI-NEXT:    s_mov_b32 s43, 0xe80000
-; VI-NEXT:    s_add_u32 s40, s40, s5
+; VI-NEXT:    s_mov_b32 s48, SCRATCH_RSRC_DWORD0
+; VI-NEXT:    s_mov_b32 s49, SCRATCH_RSRC_DWORD1
+; VI-NEXT:    s_mov_b32 s50, -1
+; VI-NEXT:    s_mov_b32 s51, 0xe80000
+; VI-NEXT:    s_add_u32 s48, s48, s5
 ; VI-NEXT:    s_load_dwordx2 s[36:37], s[2:3], 0x24
-; VI-NEXT:    s_addc_u32 s41, s41, 0
+; VI-NEXT:    s_addc_u32 s49, s49, 0
 ; VI-NEXT:    s_mov_b64 s[6:7], s[0:1]
-; VI-NEXT:    s_mov_b64 s[0:1], s[40:41]
+; VI-NEXT:    s_mov_b64 s[0:1], s[48:49]
 ; VI-NEXT:    s_getpc_b64 s[4:5]
 ; VI-NEXT:    s_add_u32 s4, s4, external_i32_func_i32 at rel32@lo+4
 ; VI-NEXT:    s_addc_u32 s5, s5, external_i32_func_i32 at rel32@hi+12
-; VI-NEXT:    s_mov_b64 s[2:3], s[42:43]
+; VI-NEXT:    s_mov_b64 s[2:3], s[50:51]
 ; VI-NEXT:    v_mov_b32_e32 v0, 42
 ; VI-NEXT:    s_mov_b32 s32, 0
 ; VI-NEXT:    s_mov_b32 s39, 0xf000
@@ -4566,19 +4566,19 @@ define amdgpu_kernel void @test_call_external_i32_func_i32_imm(ptr addrspace(1)
 ;
 ; CI-LABEL: test_call_external_i32_func_i32_imm:
 ; CI:       ; %bb.0:
-; CI-NEXT:    s_mov_b32 s40, SCRATCH_RSRC_DWORD0
-; CI-NEXT:    s_mov_b32 s41, SCRATCH_RSRC_DWORD1
-; CI-NEXT:    s_mov_b32 s42, -1
-; CI-NEXT:    s_mov_b32 s43, 0xe8f000
-; CI-NEXT:    s_add_u32 s40, s40, s5
+; CI-NEXT:    s_mov_b32 s48, SCRATCH_RSRC_DWORD0
+; CI-NEXT:    s_mov_b32 s49, SCRATCH_RSRC_DWORD1
+; CI-NEXT:    s_mov_b32 s50, -1
+; CI-NEXT:    s_mov_b32 s51, 0xe8f000
+; CI-NEXT:    s_add_u32 s48, s48, s5
 ; CI-NEXT:    s_load_dwordx2 s[36:37], s[2:3], 0x9
-; CI-NEXT:    s_addc_u32 s41, s41, 0
+; CI-NEXT:    s_addc_u32 s49, s49, 0
 ; CI-NEXT:    s_mov_b64 s[6:7], s[0:1]
-; CI-NEXT:    s_mov_b64 s[0:1], s[40:41]
+; CI-NEXT:    s_mov_b64 s[0:1], s[48:49]
 ; CI-NEXT:    s_getpc_b64 s[4:5]
 ; CI-NEXT:    s_add_u32 s4, s4, external_i32_func_i32 at rel32@lo+4
 ; CI-NEXT:    s_addc_u32 s5, s5, external_i32_func_i32 at rel32@hi+12
-; CI-NEXT:    s_mov_b64 s[2:3], s[42:43]
+; CI-NEXT:    s_mov_b64 s[2:3], s[50:51]
 ; CI-NEXT:    v_mov_b32_e32 v0, 42
 ; CI-NEXT:    s_mov_b32 s32, 0
 ; CI-NEXT:    s_mov_b32 s39, 0xf000
@@ -4590,19 +4590,19 @@ define amdgpu_kernel void @test_call_external_i32_func_i32_imm(ptr addrspace(1)
 ;
 ; GFX9-LABEL: test_call_external_i32_func_i32_imm:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_mov_b32 s40, SCRATCH_RSRC_DWORD0
-; GFX9-NEXT:    s_mov_b32 s41, SCRATCH_RSRC_DWORD1
-; GFX9-NEXT:    s_mov_b32 s42, -1
-; GFX9-NEXT:    s_mov_b32 s43, 0xe00000
-; GFX9-NEXT:    s_add_u32 s40, s40, s5
+; GFX9-NEXT:    s_mov_b32 s48, SCRATCH_RSRC_DWORD0
+; GFX9-NEXT:    s_mov_b32 s49, SCRATCH_RSRC_DWORD1
+; GFX9-NEXT:    s_mov_b32 s50, -1
+; GFX9-NEXT:    s_mov_b32 s51, 0xe00000
+; GFX9-NEXT:    s_add_u32 s48, s48, s5
 ; GFX9-NEXT:    s_load_dwordx2 s[36:37], s[2:3], 0x24
-; GFX9-NEXT:    s_addc_u32 s41, s41, 0
+; GFX9-NEXT:    s_addc_u32 s49, s49, 0
 ; GFX9-NEXT:    s_mov_b64 s[6:7], s[0:1]
-; GFX9-NEXT:    s_mov_b64 s[0:1], s[40:41]
+; GFX9-NEXT:    s_mov_b64 s[0:1], s[48:49]
 ; GFX9-NEXT:    s_getpc_b64 s[4:5]
 ; GFX9-NEXT:    s_add_u32 s4, s4, external_i32_func_i32 at rel32@lo+4
 ; GFX9-NEXT:    s_addc_u32 s5, s5, external_i32_func_i32 at rel32@hi+12
-; GFX9-NEXT:    s_mov_b64 s[2:3], s[42:43]
+; GFX9-NEXT:    s_mov_b64 s[2:3], s[50:51]
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 42
 ; GFX9-NEXT:    s_mov_b32 s32, 0
 ; GFX9-NEXT:    s_mov_b32 s39, 0xf000

diff  --git a/llvm/test/CodeGen/AMDGPU/call-preserved-registers.ll b/llvm/test/CodeGen/AMDGPU/call-preserved-registers.ll
index db9ce56ecc3cc..67a70cdeb1ecc 100644
--- a/llvm/test/CodeGen/AMDGPU/call-preserved-registers.ll
+++ b/llvm/test/CodeGen/AMDGPU/call-preserved-registers.ll
@@ -528,15 +528,16 @@ define void @callee_saved_sgpr_func() #2 {
 ; MUBUF-NEXT:    s_getpc_b64 s[4:5]
 ; MUBUF-NEXT:    s_add_u32 s4, s4, external_void_func_void at rel32@lo+4
 ; MUBUF-NEXT:    s_addc_u32 s5, s5, external_void_func_void at rel32@hi+12
-; MUBUF-NEXT:    v_writelane_b32 v40, s40, 2
+; MUBUF-NEXT:    v_writelane_b32 v40, s34, 2
 ; MUBUF-NEXT:    ;;#ASMSTART
 ; MUBUF-NEXT:    ; def s40
 ; MUBUF-NEXT:    ;;#ASMEND
+; MUBUF-NEXT:    s_mov_b32 s34, s40
 ; MUBUF-NEXT:    s_swappc_b64 s[30:31], s[4:5]
 ; MUBUF-NEXT:    ;;#ASMSTART
-; MUBUF-NEXT:    ; use s40
+; MUBUF-NEXT:    ; use s34
 ; MUBUF-NEXT:    ;;#ASMEND
-; MUBUF-NEXT:    v_readlane_b32 s40, v40, 2
+; MUBUF-NEXT:    v_readlane_b32 s34, v40, 2
 ; MUBUF-NEXT:    v_readlane_b32 s31, v40, 1
 ; MUBUF-NEXT:    v_readlane_b32 s30, v40, 0
 ; MUBUF-NEXT:    s_mov_b32 s32, s33
@@ -563,15 +564,16 @@ define void @callee_saved_sgpr_func() #2 {
 ; FLATSCR-NEXT:    s_getpc_b64 s[0:1]
 ; FLATSCR-NEXT:    s_add_u32 s0, s0, external_void_func_void at rel32@lo+4
 ; FLATSCR-NEXT:    s_addc_u32 s1, s1, external_void_func_void at rel32@hi+12
-; FLATSCR-NEXT:    v_writelane_b32 v40, s40, 2
+; FLATSCR-NEXT:    v_writelane_b32 v40, s34, 2
 ; FLATSCR-NEXT:    ;;#ASMSTART
 ; FLATSCR-NEXT:    ; def s40
 ; FLATSCR-NEXT:    ;;#ASMEND
+; FLATSCR-NEXT:    s_mov_b32 s34, s40
 ; FLATSCR-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; FLATSCR-NEXT:    ;;#ASMSTART
-; FLATSCR-NEXT:    ; use s40
+; FLATSCR-NEXT:    ; use s34
 ; FLATSCR-NEXT:    ;;#ASMEND
-; FLATSCR-NEXT:    v_readlane_b32 s40, v40, 2
+; FLATSCR-NEXT:    v_readlane_b32 s34, v40, 2
 ; FLATSCR-NEXT:    v_readlane_b32 s31, v40, 1
 ; FLATSCR-NEXT:    v_readlane_b32 s30, v40, 0
 ; FLATSCR-NEXT:    s_mov_b32 s32, s33
@@ -600,9 +602,10 @@ define amdgpu_kernel void @callee_saved_sgpr_kernel() #2 {
 ; FLATSCR-NEXT:    ;;#ASMSTART
 ; FLATSCR-NEXT:    ; def s40
 ; FLATSCR-NEXT:    ;;#ASMEND
+; FLATSCR-NEXT:    s_mov_b32 s33, s40
 ; FLATSCR-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; FLATSCR-NEXT:    ;;#ASMSTART
-; FLATSCR-NEXT:    ; use s40
+; FLATSCR-NEXT:    ; use s33
 ; FLATSCR-NEXT:    ;;#ASMEND
 ; FLATSCR-NEXT:    s_endpgm
   %s40 = call i32 asm sideeffect "; def s40", "={s40}"() #0
@@ -629,22 +632,23 @@ define void @callee_saved_sgpr_vgpr_func() #2 {
 ; MUBUF-NEXT:    s_add_u32 s4, s4, external_void_func_void at rel32@lo+4
 ; MUBUF-NEXT:    s_addc_u32 s5, s5, external_void_func_void at rel32@hi+12
 ; MUBUF-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
-; MUBUF-NEXT:    v_writelane_b32 v41, s40, 2
+; MUBUF-NEXT:    v_writelane_b32 v41, s34, 2
 ; MUBUF-NEXT:    ;;#ASMSTART
 ; MUBUF-NEXT:    ; def s40
 ; MUBUF-NEXT:    ;;#ASMEND
+; MUBUF-NEXT:    s_mov_b32 s34, s40
 ; MUBUF-NEXT:    ;;#ASMSTART
 ; MUBUF-NEXT:    ; def v40
 ; MUBUF-NEXT:    ;;#ASMEND
 ; MUBUF-NEXT:    s_swappc_b64 s[30:31], s[4:5]
 ; MUBUF-NEXT:    ;;#ASMSTART
-; MUBUF-NEXT:    ; use s40
+; MUBUF-NEXT:    ; use s34
 ; MUBUF-NEXT:    ;;#ASMEND
 ; MUBUF-NEXT:    ;;#ASMSTART
 ; MUBUF-NEXT:    ; use v40
 ; MUBUF-NEXT:    ;;#ASMEND
 ; MUBUF-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
-; MUBUF-NEXT:    v_readlane_b32 s40, v41, 2
+; MUBUF-NEXT:    v_readlane_b32 s34, v41, 2
 ; MUBUF-NEXT:    v_readlane_b32 s31, v41, 1
 ; MUBUF-NEXT:    v_readlane_b32 s30, v41, 0
 ; MUBUF-NEXT:    s_mov_b32 s32, s33
@@ -672,22 +676,23 @@ define void @callee_saved_sgpr_vgpr_func() #2 {
 ; FLATSCR-NEXT:    s_add_u32 s0, s0, external_void_func_void at rel32@lo+4
 ; FLATSCR-NEXT:    s_addc_u32 s1, s1, external_void_func_void at rel32@hi+12
 ; FLATSCR-NEXT:    scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
-; FLATSCR-NEXT:    v_writelane_b32 v41, s40, 2
+; FLATSCR-NEXT:    v_writelane_b32 v41, s34, 2
 ; FLATSCR-NEXT:    ;;#ASMSTART
 ; FLATSCR-NEXT:    ; def s40
 ; FLATSCR-NEXT:    ;;#ASMEND
+; FLATSCR-NEXT:    s_mov_b32 s34, s40
 ; FLATSCR-NEXT:    ;;#ASMSTART
 ; FLATSCR-NEXT:    ; def v40
 ; FLATSCR-NEXT:    ;;#ASMEND
 ; FLATSCR-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; FLATSCR-NEXT:    ;;#ASMSTART
-; FLATSCR-NEXT:    ; use s40
+; FLATSCR-NEXT:    ; use s34
 ; FLATSCR-NEXT:    ;;#ASMEND
 ; FLATSCR-NEXT:    ;;#ASMSTART
 ; FLATSCR-NEXT:    ; use v40
 ; FLATSCR-NEXT:    ;;#ASMEND
 ; FLATSCR-NEXT:    scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
-; FLATSCR-NEXT:    v_readlane_b32 s40, v41, 2
+; FLATSCR-NEXT:    v_readlane_b32 s34, v41, 2
 ; FLATSCR-NEXT:    v_readlane_b32 s31, v41, 1
 ; FLATSCR-NEXT:    v_readlane_b32 s30, v41, 0
 ; FLATSCR-NEXT:    s_mov_b32 s32, s33
@@ -718,13 +723,14 @@ define amdgpu_kernel void @callee_saved_sgpr_vgpr_kernel() #2 {
 ; FLATSCR-NEXT:    ;;#ASMSTART
 ; FLATSCR-NEXT:    ; def s40
 ; FLATSCR-NEXT:    ;;#ASMEND
+; FLATSCR-NEXT:    s_mov_b32 s33, s40
 ; FLATSCR-NEXT:    ;;#ASMSTART
 ; FLATSCR-NEXT:    ; def v32
 ; FLATSCR-NEXT:    ;;#ASMEND
 ; FLATSCR-NEXT:    v_mov_b32_e32 v40, v32
 ; FLATSCR-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; FLATSCR-NEXT:    ;;#ASMSTART
-; FLATSCR-NEXT:    ; use s40
+; FLATSCR-NEXT:    ; use s33
 ; FLATSCR-NEXT:    ;;#ASMEND
 ; FLATSCR-NEXT:    ;;#ASMSTART
 ; FLATSCR-NEXT:    ; use v40

diff  --git a/llvm/test/CodeGen/AMDGPU/callee-frame-setup.ll b/llvm/test/CodeGen/AMDGPU/callee-frame-setup.ll
index 4c6f2d22080e0..9561aa555c80e 100644
--- a/llvm/test/CodeGen/AMDGPU/callee-frame-setup.ll
+++ b/llvm/test/CodeGen/AMDGPU/callee-frame-setup.ll
@@ -257,50 +257,26 @@ define void @callee_func_sgpr_spill_no_calls(i32 %in) #0 {
 ; MUBUF-NEXT:    v_writelane_b32 v40, s37, 1
 ; MUBUF-NEXT:    v_writelane_b32 v40, s38, 2
 ; MUBUF-NEXT:    v_writelane_b32 v40, s39, 3
-; MUBUF-NEXT:    v_writelane_b32 v40, s40, 4
-; MUBUF-NEXT:    v_writelane_b32 v40, s41, 5
-; MUBUF-NEXT:    v_writelane_b32 v40, s42, 6
-; MUBUF-NEXT:    v_writelane_b32 v40, s43, 7
-; MUBUF-NEXT:    v_writelane_b32 v40, s44, 8
-; MUBUF-NEXT:    v_writelane_b32 v40, s45, 9
-; MUBUF-NEXT:    v_writelane_b32 v40, s46, 10
-; MUBUF-NEXT:    v_writelane_b32 v40, s47, 11
-; MUBUF-NEXT:    v_writelane_b32 v40, s48, 12
-; MUBUF-NEXT:    v_writelane_b32 v40, s49, 13
-; MUBUF-NEXT:    v_writelane_b32 v40, s50, 14
-; MUBUF-NEXT:    v_writelane_b32 v40, s51, 15
-; MUBUF-NEXT:    v_writelane_b32 v40, s52, 16
-; MUBUF-NEXT:    v_writelane_b32 v40, s53, 17
-; MUBUF-NEXT:    v_writelane_b32 v40, s54, 18
-; MUBUF-NEXT:    v_writelane_b32 v40, s55, 19
-; MUBUF-NEXT:    v_writelane_b32 v40, s56, 20
-; MUBUF-NEXT:    v_writelane_b32 v40, s57, 21
-; MUBUF-NEXT:    v_writelane_b32 v40, s58, 22
-; MUBUF-NEXT:    v_writelane_b32 v40, s59, 23
-; MUBUF-NEXT:    v_writelane_b32 v40, s60, 24
-; MUBUF-NEXT:    v_writelane_b32 v40, s61, 25
-; MUBUF-NEXT:    v_writelane_b32 v40, s62, 26
-; MUBUF-NEXT:    v_writelane_b32 v40, s63, 27
-; MUBUF-NEXT:    v_writelane_b32 v40, s64, 28
-; MUBUF-NEXT:    v_writelane_b32 v40, s65, 29
-; MUBUF-NEXT:    v_writelane_b32 v40, s66, 30
-; MUBUF-NEXT:    v_writelane_b32 v40, s67, 31
-; MUBUF-NEXT:    v_writelane_b32 v40, s68, 32
-; MUBUF-NEXT:    v_writelane_b32 v40, s69, 33
-; MUBUF-NEXT:    v_writelane_b32 v40, s70, 34
-; MUBUF-NEXT:    v_writelane_b32 v40, s71, 35
-; MUBUF-NEXT:    v_writelane_b32 v40, s72, 36
-; MUBUF-NEXT:    v_writelane_b32 v40, s73, 37
-; MUBUF-NEXT:    v_writelane_b32 v40, s74, 38
-; MUBUF-NEXT:    v_writelane_b32 v40, s75, 39
-; MUBUF-NEXT:    v_writelane_b32 v40, s76, 40
-; MUBUF-NEXT:    v_writelane_b32 v40, s77, 41
-; MUBUF-NEXT:    v_writelane_b32 v40, s78, 42
-; MUBUF-NEXT:    v_writelane_b32 v40, s79, 43
-; MUBUF-NEXT:    v_writelane_b32 v40, s80, 44
-; MUBUF-NEXT:    v_writelane_b32 v40, s81, 45
-; MUBUF-NEXT:    v_writelane_b32 v40, s82, 46
-; MUBUF-NEXT:    v_writelane_b32 v40, s83, 47
+; MUBUF-NEXT:    v_writelane_b32 v40, s48, 4
+; MUBUF-NEXT:    v_writelane_b32 v40, s49, 5
+; MUBUF-NEXT:    v_writelane_b32 v40, s50, 6
+; MUBUF-NEXT:    v_writelane_b32 v40, s51, 7
+; MUBUF-NEXT:    v_writelane_b32 v40, s52, 8
+; MUBUF-NEXT:    v_writelane_b32 v40, s53, 9
+; MUBUF-NEXT:    v_writelane_b32 v40, s54, 10
+; MUBUF-NEXT:    v_writelane_b32 v40, s55, 11
+; MUBUF-NEXT:    v_writelane_b32 v40, s64, 12
+; MUBUF-NEXT:    v_writelane_b32 v40, s65, 13
+; MUBUF-NEXT:    v_writelane_b32 v40, s66, 14
+; MUBUF-NEXT:    v_writelane_b32 v40, s67, 15
+; MUBUF-NEXT:    v_writelane_b32 v40, s68, 16
+; MUBUF-NEXT:    v_writelane_b32 v40, s69, 17
+; MUBUF-NEXT:    v_writelane_b32 v40, s70, 18
+; MUBUF-NEXT:    v_writelane_b32 v40, s71, 19
+; MUBUF-NEXT:    v_writelane_b32 v40, s80, 20
+; MUBUF-NEXT:    v_writelane_b32 v40, s81, 21
+; MUBUF-NEXT:    v_writelane_b32 v40, s82, 22
+; MUBUF-NEXT:    v_writelane_b32 v40, s83, 23
 ; MUBUF-NEXT:    ;;#ASMSTART
 ; MUBUF-NEXT:    ;;#ASMEND
 ; MUBUF-NEXT:    ;;#ASMSTART
@@ -347,50 +323,26 @@ define void @callee_func_sgpr_spill_no_calls(i32 %in) #0 {
 ; MUBUF-NEXT:    ;;#ASMSTART
 ; MUBUF-NEXT:    ; use s[4:19]
 ; MUBUF-NEXT:    ;;#ASMEND
-; MUBUF-NEXT:    v_readlane_b32 s83, v40, 47
-; MUBUF-NEXT:    v_readlane_b32 s82, v40, 46
-; MUBUF-NEXT:    v_readlane_b32 s81, v40, 45
-; MUBUF-NEXT:    v_readlane_b32 s80, v40, 44
-; MUBUF-NEXT:    v_readlane_b32 s79, v40, 43
-; MUBUF-NEXT:    v_readlane_b32 s78, v40, 42
-; MUBUF-NEXT:    v_readlane_b32 s77, v40, 41
-; MUBUF-NEXT:    v_readlane_b32 s76, v40, 40
-; MUBUF-NEXT:    v_readlane_b32 s75, v40, 39
-; MUBUF-NEXT:    v_readlane_b32 s74, v40, 38
-; MUBUF-NEXT:    v_readlane_b32 s73, v40, 37
-; MUBUF-NEXT:    v_readlane_b32 s72, v40, 36
-; MUBUF-NEXT:    v_readlane_b32 s71, v40, 35
-; MUBUF-NEXT:    v_readlane_b32 s70, v40, 34
-; MUBUF-NEXT:    v_readlane_b32 s69, v40, 33
-; MUBUF-NEXT:    v_readlane_b32 s68, v40, 32
-; MUBUF-NEXT:    v_readlane_b32 s67, v40, 31
-; MUBUF-NEXT:    v_readlane_b32 s66, v40, 30
-; MUBUF-NEXT:    v_readlane_b32 s65, v40, 29
-; MUBUF-NEXT:    v_readlane_b32 s64, v40, 28
-; MUBUF-NEXT:    v_readlane_b32 s63, v40, 27
-; MUBUF-NEXT:    v_readlane_b32 s62, v40, 26
-; MUBUF-NEXT:    v_readlane_b32 s61, v40, 25
-; MUBUF-NEXT:    v_readlane_b32 s60, v40, 24
-; MUBUF-NEXT:    v_readlane_b32 s59, v40, 23
-; MUBUF-NEXT:    v_readlane_b32 s58, v40, 22
-; MUBUF-NEXT:    v_readlane_b32 s57, v40, 21
-; MUBUF-NEXT:    v_readlane_b32 s56, v40, 20
-; MUBUF-NEXT:    v_readlane_b32 s55, v40, 19
-; MUBUF-NEXT:    v_readlane_b32 s54, v40, 18
-; MUBUF-NEXT:    v_readlane_b32 s53, v40, 17
-; MUBUF-NEXT:    v_readlane_b32 s52, v40, 16
-; MUBUF-NEXT:    v_readlane_b32 s51, v40, 15
-; MUBUF-NEXT:    v_readlane_b32 s50, v40, 14
-; MUBUF-NEXT:    v_readlane_b32 s49, v40, 13
-; MUBUF-NEXT:    v_readlane_b32 s48, v40, 12
-; MUBUF-NEXT:    v_readlane_b32 s47, v40, 11
-; MUBUF-NEXT:    v_readlane_b32 s46, v40, 10
-; MUBUF-NEXT:    v_readlane_b32 s45, v40, 9
-; MUBUF-NEXT:    v_readlane_b32 s44, v40, 8
-; MUBUF-NEXT:    v_readlane_b32 s43, v40, 7
-; MUBUF-NEXT:    v_readlane_b32 s42, v40, 6
-; MUBUF-NEXT:    v_readlane_b32 s41, v40, 5
-; MUBUF-NEXT:    v_readlane_b32 s40, v40, 4
+; MUBUF-NEXT:    v_readlane_b32 s83, v40, 23
+; MUBUF-NEXT:    v_readlane_b32 s82, v40, 22
+; MUBUF-NEXT:    v_readlane_b32 s81, v40, 21
+; MUBUF-NEXT:    v_readlane_b32 s80, v40, 20
+; MUBUF-NEXT:    v_readlane_b32 s71, v40, 19
+; MUBUF-NEXT:    v_readlane_b32 s70, v40, 18
+; MUBUF-NEXT:    v_readlane_b32 s69, v40, 17
+; MUBUF-NEXT:    v_readlane_b32 s68, v40, 16
+; MUBUF-NEXT:    v_readlane_b32 s67, v40, 15
+; MUBUF-NEXT:    v_readlane_b32 s66, v40, 14
+; MUBUF-NEXT:    v_readlane_b32 s65, v40, 13
+; MUBUF-NEXT:    v_readlane_b32 s64, v40, 12
+; MUBUF-NEXT:    v_readlane_b32 s55, v40, 11
+; MUBUF-NEXT:    v_readlane_b32 s54, v40, 10
+; MUBUF-NEXT:    v_readlane_b32 s53, v40, 9
+; MUBUF-NEXT:    v_readlane_b32 s52, v40, 8
+; MUBUF-NEXT:    v_readlane_b32 s51, v40, 7
+; MUBUF-NEXT:    v_readlane_b32 s50, v40, 6
+; MUBUF-NEXT:    v_readlane_b32 s49, v40, 5
+; MUBUF-NEXT:    v_readlane_b32 s48, v40, 4
 ; MUBUF-NEXT:    v_readlane_b32 s39, v40, 3
 ; MUBUF-NEXT:    v_readlane_b32 s38, v40, 2
 ; MUBUF-NEXT:    v_readlane_b32 s37, v40, 1
@@ -409,48 +361,22 @@ define void @callee_func_sgpr_spill_no_calls(i32 %in) #0 {
 ; FLATSCR-NEXT:    s_mov_b64 exec, s[0:1]
 ; FLATSCR-NEXT:    v_writelane_b32 v40, s30, 0
 ; FLATSCR-NEXT:    v_writelane_b32 v40, s31, 1
-; FLATSCR-NEXT:    v_writelane_b32 v40, s34, 2
-; FLATSCR-NEXT:    v_writelane_b32 v40, s35, 3
-; FLATSCR-NEXT:    v_writelane_b32 v40, s36, 4
-; FLATSCR-NEXT:    v_writelane_b32 v40, s37, 5
-; FLATSCR-NEXT:    v_writelane_b32 v40, s38, 6
-; FLATSCR-NEXT:    v_writelane_b32 v40, s39, 7
-; FLATSCR-NEXT:    v_writelane_b32 v40, s40, 8
-; FLATSCR-NEXT:    v_writelane_b32 v40, s41, 9
-; FLATSCR-NEXT:    v_writelane_b32 v40, s42, 10
-; FLATSCR-NEXT:    v_writelane_b32 v40, s43, 11
-; FLATSCR-NEXT:    v_writelane_b32 v40, s44, 12
-; FLATSCR-NEXT:    v_writelane_b32 v40, s45, 13
-; FLATSCR-NEXT:    v_writelane_b32 v40, s46, 14
-; FLATSCR-NEXT:    v_writelane_b32 v40, s47, 15
-; FLATSCR-NEXT:    v_writelane_b32 v40, s48, 16
-; FLATSCR-NEXT:    v_writelane_b32 v40, s49, 17
-; FLATSCR-NEXT:    v_writelane_b32 v40, s50, 18
-; FLATSCR-NEXT:    v_writelane_b32 v40, s51, 19
-; FLATSCR-NEXT:    v_writelane_b32 v40, s52, 20
-; FLATSCR-NEXT:    v_writelane_b32 v40, s53, 21
-; FLATSCR-NEXT:    v_writelane_b32 v40, s54, 22
-; FLATSCR-NEXT:    v_writelane_b32 v40, s55, 23
-; FLATSCR-NEXT:    v_writelane_b32 v40, s56, 24
-; FLATSCR-NEXT:    v_writelane_b32 v40, s57, 25
-; FLATSCR-NEXT:    v_writelane_b32 v40, s58, 26
-; FLATSCR-NEXT:    v_writelane_b32 v40, s59, 27
-; FLATSCR-NEXT:    v_writelane_b32 v40, s60, 28
-; FLATSCR-NEXT:    v_writelane_b32 v40, s61, 29
-; FLATSCR-NEXT:    v_writelane_b32 v40, s62, 30
-; FLATSCR-NEXT:    v_writelane_b32 v40, s63, 31
-; FLATSCR-NEXT:    v_writelane_b32 v40, s64, 32
-; FLATSCR-NEXT:    v_writelane_b32 v40, s65, 33
-; FLATSCR-NEXT:    v_writelane_b32 v40, s66, 34
-; FLATSCR-NEXT:    v_writelane_b32 v40, s67, 35
-; FLATSCR-NEXT:    v_writelane_b32 v40, s68, 36
-; FLATSCR-NEXT:    v_writelane_b32 v40, s69, 37
-; FLATSCR-NEXT:    v_writelane_b32 v40, s70, 38
-; FLATSCR-NEXT:    v_writelane_b32 v40, s71, 39
-; FLATSCR-NEXT:    v_writelane_b32 v40, s72, 40
-; FLATSCR-NEXT:    v_writelane_b32 v40, s73, 41
-; FLATSCR-NEXT:    v_writelane_b32 v40, s74, 42
-; FLATSCR-NEXT:    v_writelane_b32 v40, s75, 43
+; FLATSCR-NEXT:    v_writelane_b32 v40, s36, 2
+; FLATSCR-NEXT:    v_writelane_b32 v40, s37, 3
+; FLATSCR-NEXT:    v_writelane_b32 v40, s38, 4
+; FLATSCR-NEXT:    v_writelane_b32 v40, s39, 5
+; FLATSCR-NEXT:    v_writelane_b32 v40, s48, 6
+; FLATSCR-NEXT:    v_writelane_b32 v40, s49, 7
+; FLATSCR-NEXT:    v_writelane_b32 v40, s50, 8
+; FLATSCR-NEXT:    v_writelane_b32 v40, s51, 9
+; FLATSCR-NEXT:    v_writelane_b32 v40, s52, 10
+; FLATSCR-NEXT:    v_writelane_b32 v40, s53, 11
+; FLATSCR-NEXT:    v_writelane_b32 v40, s54, 12
+; FLATSCR-NEXT:    v_writelane_b32 v40, s55, 13
+; FLATSCR-NEXT:    v_writelane_b32 v40, s64, 14
+; FLATSCR-NEXT:    v_writelane_b32 v40, s65, 15
+; FLATSCR-NEXT:    v_writelane_b32 v40, s66, 16
+; FLATSCR-NEXT:    v_writelane_b32 v40, s67, 17
 ; FLATSCR-NEXT:    ;;#ASMSTART
 ; FLATSCR-NEXT:    ;;#ASMEND
 ; FLATSCR-NEXT:    ;;#ASMSTART
@@ -474,10 +400,10 @@ define void @callee_func_sgpr_spill_no_calls(i32 %in) #0 {
 ; FLATSCR-NEXT:    ; def s[0:15]
 ; FLATSCR-NEXT:    ;;#ASMEND
 ; FLATSCR-NEXT:    ;;#ASMSTART
-; FLATSCR-NEXT:    ; def s[68:75]
+; FLATSCR-NEXT:    ; def s[72:79]
 ; FLATSCR-NEXT:    ;;#ASMEND
 ; FLATSCR-NEXT:    ;;#ASMSTART
-; FLATSCR-NEXT:    ; def s[34:35]
+; FLATSCR-NEXT:    ; def s[88:89]
 ; FLATSCR-NEXT:    ;;#ASMEND
 ; FLATSCR-NEXT:    ;;#ASMSTART
 ; FLATSCR-NEXT:    ; use s[52:67]
@@ -489,56 +415,30 @@ define void @callee_func_sgpr_spill_no_calls(i32 %in) #0 {
 ; FLATSCR-NEXT:    ; use s[16:31]
 ; FLATSCR-NEXT:    ;;#ASMEND
 ; FLATSCR-NEXT:    ;;#ASMSTART
-; FLATSCR-NEXT:    ; use s[68:75]
+; FLATSCR-NEXT:    ; use s[72:79]
 ; FLATSCR-NEXT:    ;;#ASMEND
 ; FLATSCR-NEXT:    ;;#ASMSTART
-; FLATSCR-NEXT:    ; use s[34:35]
+; FLATSCR-NEXT:    ; use s[88:89]
 ; FLATSCR-NEXT:    ;;#ASMEND
 ; FLATSCR-NEXT:    ;;#ASMSTART
 ; FLATSCR-NEXT:    ; use s[0:15]
 ; FLATSCR-NEXT:    ;;#ASMEND
-; FLATSCR-NEXT:    v_readlane_b32 s75, v40, 43
-; FLATSCR-NEXT:    v_readlane_b32 s74, v40, 42
-; FLATSCR-NEXT:    v_readlane_b32 s73, v40, 41
-; FLATSCR-NEXT:    v_readlane_b32 s72, v40, 40
-; FLATSCR-NEXT:    v_readlane_b32 s71, v40, 39
-; FLATSCR-NEXT:    v_readlane_b32 s70, v40, 38
-; FLATSCR-NEXT:    v_readlane_b32 s69, v40, 37
-; FLATSCR-NEXT:    v_readlane_b32 s68, v40, 36
-; FLATSCR-NEXT:    v_readlane_b32 s67, v40, 35
-; FLATSCR-NEXT:    v_readlane_b32 s66, v40, 34
-; FLATSCR-NEXT:    v_readlane_b32 s65, v40, 33
-; FLATSCR-NEXT:    v_readlane_b32 s64, v40, 32
-; FLATSCR-NEXT:    v_readlane_b32 s63, v40, 31
-; FLATSCR-NEXT:    v_readlane_b32 s62, v40, 30
-; FLATSCR-NEXT:    v_readlane_b32 s61, v40, 29
-; FLATSCR-NEXT:    v_readlane_b32 s60, v40, 28
-; FLATSCR-NEXT:    v_readlane_b32 s59, v40, 27
-; FLATSCR-NEXT:    v_readlane_b32 s58, v40, 26
-; FLATSCR-NEXT:    v_readlane_b32 s57, v40, 25
-; FLATSCR-NEXT:    v_readlane_b32 s56, v40, 24
-; FLATSCR-NEXT:    v_readlane_b32 s55, v40, 23
-; FLATSCR-NEXT:    v_readlane_b32 s54, v40, 22
-; FLATSCR-NEXT:    v_readlane_b32 s53, v40, 21
-; FLATSCR-NEXT:    v_readlane_b32 s52, v40, 20
-; FLATSCR-NEXT:    v_readlane_b32 s51, v40, 19
-; FLATSCR-NEXT:    v_readlane_b32 s50, v40, 18
-; FLATSCR-NEXT:    v_readlane_b32 s49, v40, 17
-; FLATSCR-NEXT:    v_readlane_b32 s48, v40, 16
-; FLATSCR-NEXT:    v_readlane_b32 s47, v40, 15
-; FLATSCR-NEXT:    v_readlane_b32 s46, v40, 14
-; FLATSCR-NEXT:    v_readlane_b32 s45, v40, 13
-; FLATSCR-NEXT:    v_readlane_b32 s44, v40, 12
-; FLATSCR-NEXT:    v_readlane_b32 s43, v40, 11
-; FLATSCR-NEXT:    v_readlane_b32 s42, v40, 10
-; FLATSCR-NEXT:    v_readlane_b32 s41, v40, 9
-; FLATSCR-NEXT:    v_readlane_b32 s40, v40, 8
-; FLATSCR-NEXT:    v_readlane_b32 s39, v40, 7
-; FLATSCR-NEXT:    v_readlane_b32 s38, v40, 6
-; FLATSCR-NEXT:    v_readlane_b32 s37, v40, 5
-; FLATSCR-NEXT:    v_readlane_b32 s36, v40, 4
-; FLATSCR-NEXT:    v_readlane_b32 s35, v40, 3
-; FLATSCR-NEXT:    v_readlane_b32 s34, v40, 2
+; FLATSCR-NEXT:    v_readlane_b32 s67, v40, 17
+; FLATSCR-NEXT:    v_readlane_b32 s66, v40, 16
+; FLATSCR-NEXT:    v_readlane_b32 s65, v40, 15
+; FLATSCR-NEXT:    v_readlane_b32 s64, v40, 14
+; FLATSCR-NEXT:    v_readlane_b32 s55, v40, 13
+; FLATSCR-NEXT:    v_readlane_b32 s54, v40, 12
+; FLATSCR-NEXT:    v_readlane_b32 s53, v40, 11
+; FLATSCR-NEXT:    v_readlane_b32 s52, v40, 10
+; FLATSCR-NEXT:    v_readlane_b32 s51, v40, 9
+; FLATSCR-NEXT:    v_readlane_b32 s50, v40, 8
+; FLATSCR-NEXT:    v_readlane_b32 s49, v40, 7
+; FLATSCR-NEXT:    v_readlane_b32 s48, v40, 6
+; FLATSCR-NEXT:    v_readlane_b32 s39, v40, 5
+; FLATSCR-NEXT:    v_readlane_b32 s38, v40, 4
+; FLATSCR-NEXT:    v_readlane_b32 s37, v40, 3
+; FLATSCR-NEXT:    v_readlane_b32 s36, v40, 2
 ; FLATSCR-NEXT:    v_readlane_b32 s31, v40, 1
 ; FLATSCR-NEXT:    v_readlane_b32 s30, v40, 0
 ; FLATSCR-NEXT:    s_or_saveexec_b64 s[0:1], -1
@@ -571,39 +471,13 @@ define void @callee_func_sgpr_spill_no_calls(i32 %in) #0 {
 ; Has no spilled CSR VGPRs used for SGPR spilling, so no need to
 ; enable all lanes and restore.
 define void @spill_only_csr_sgpr() {
-; MUBUF-LABEL: spill_only_csr_sgpr:
-; MUBUF:       ; %bb.0:
-; MUBUF-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; MUBUF-NEXT:    s_xor_saveexec_b64 s[4:5], -1
-; MUBUF-NEXT:    buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
-; MUBUF-NEXT:    s_mov_b64 exec, s[4:5]
-; MUBUF-NEXT:    v_writelane_b32 v0, s42, 0
-; MUBUF-NEXT:    ;;#ASMSTART
-; MUBUF-NEXT:    ; clobber s42
-; MUBUF-NEXT:    ;;#ASMEND
-; MUBUF-NEXT:    v_readlane_b32 s42, v0, 0
-; MUBUF-NEXT:    s_xor_saveexec_b64 s[4:5], -1
-; MUBUF-NEXT:    buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
-; MUBUF-NEXT:    s_mov_b64 exec, s[4:5]
-; MUBUF-NEXT:    s_waitcnt vmcnt(0)
-; MUBUF-NEXT:    s_setpc_b64 s[30:31]
-;
-; FLATSCR-LABEL: spill_only_csr_sgpr:
-; FLATSCR:       ; %bb.0:
-; FLATSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; FLATSCR-NEXT:    s_xor_saveexec_b64 s[0:1], -1
-; FLATSCR-NEXT:    scratch_store_dword off, v0, s32 ; 4-byte Folded Spill
-; FLATSCR-NEXT:    s_mov_b64 exec, s[0:1]
-; FLATSCR-NEXT:    v_writelane_b32 v0, s42, 0
-; FLATSCR-NEXT:    ;;#ASMSTART
-; FLATSCR-NEXT:    ; clobber s42
-; FLATSCR-NEXT:    ;;#ASMEND
-; FLATSCR-NEXT:    v_readlane_b32 s42, v0, 0
-; FLATSCR-NEXT:    s_xor_saveexec_b64 s[0:1], -1
-; FLATSCR-NEXT:    scratch_load_dword v0, off, s32 ; 4-byte Folded Reload
-; FLATSCR-NEXT:    s_mov_b64 exec, s[0:1]
-; FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; FLATSCR-NEXT:    s_setpc_b64 s[30:31]
+; GCN-LABEL: spill_only_csr_sgpr:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    ;;#ASMSTART
+; GCN-NEXT:    ; clobber s42
+; GCN-NEXT:    ;;#ASMEND
+; GCN-NEXT:    s_setpc_b64 s[30:31]
   call void asm sideeffect "; clobber s42", "~{s42}"()
   ret void
 }
@@ -663,143 +537,79 @@ define void @last_lane_vgpr_for_fp_csr() #1 {
 ; MUBUF-NEXT:    s_xor_saveexec_b64 s[6:7], -1
 ; MUBUF-NEXT:    buffer_store_dword v1, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
 ; MUBUF-NEXT:    s_mov_b64 exec, s[6:7]
-; MUBUF-NEXT:    v_writelane_b32 v1, s40, 0
-; MUBUF-NEXT:    v_writelane_b32 v1, s41, 1
-; MUBUF-NEXT:    v_writelane_b32 v1, s42, 2
-; MUBUF-NEXT:    v_writelane_b32 v1, s43, 3
-; MUBUF-NEXT:    v_writelane_b32 v1, s44, 4
-; MUBUF-NEXT:    v_writelane_b32 v1, s45, 5
-; MUBUF-NEXT:    v_writelane_b32 v1, s46, 6
-; MUBUF-NEXT:    v_writelane_b32 v1, s47, 7
-; MUBUF-NEXT:    v_writelane_b32 v1, s48, 8
-; MUBUF-NEXT:    v_writelane_b32 v1, s49, 9
-; MUBUF-NEXT:    v_writelane_b32 v1, s50, 10
-; MUBUF-NEXT:    v_writelane_b32 v1, s51, 11
-; MUBUF-NEXT:    v_writelane_b32 v1, s52, 12
-; MUBUF-NEXT:    v_writelane_b32 v1, s53, 13
-; MUBUF-NEXT:    v_writelane_b32 v1, s54, 14
-; MUBUF-NEXT:    v_writelane_b32 v1, s55, 15
-; MUBUF-NEXT:    v_writelane_b32 v1, s56, 16
-; MUBUF-NEXT:    v_writelane_b32 v1, s57, 17
-; MUBUF-NEXT:    v_writelane_b32 v1, s58, 18
-; MUBUF-NEXT:    v_writelane_b32 v1, s59, 19
-; MUBUF-NEXT:    v_writelane_b32 v1, s60, 20
-; MUBUF-NEXT:    v_writelane_b32 v1, s61, 21
-; MUBUF-NEXT:    v_writelane_b32 v1, s62, 22
-; MUBUF-NEXT:    v_writelane_b32 v1, s63, 23
-; MUBUF-NEXT:    v_writelane_b32 v1, s64, 24
-; MUBUF-NEXT:    v_writelane_b32 v1, s65, 25
-; MUBUF-NEXT:    v_writelane_b32 v1, s66, 26
-; MUBUF-NEXT:    v_writelane_b32 v1, s67, 27
-; MUBUF-NEXT:    v_writelane_b32 v1, s68, 28
-; MUBUF-NEXT:    v_writelane_b32 v1, s69, 29
-; MUBUF-NEXT:    v_writelane_b32 v1, s70, 30
-; MUBUF-NEXT:    v_writelane_b32 v1, s71, 31
-; MUBUF-NEXT:    v_writelane_b32 v1, s72, 32
-; MUBUF-NEXT:    v_writelane_b32 v1, s73, 33
-; MUBUF-NEXT:    v_writelane_b32 v1, s74, 34
-; MUBUF-NEXT:    v_writelane_b32 v1, s75, 35
-; MUBUF-NEXT:    v_writelane_b32 v1, s76, 36
-; MUBUF-NEXT:    v_writelane_b32 v1, s77, 37
-; MUBUF-NEXT:    v_writelane_b32 v1, s78, 38
-; MUBUF-NEXT:    v_writelane_b32 v1, s79, 39
-; MUBUF-NEXT:    v_writelane_b32 v1, s80, 40
-; MUBUF-NEXT:    v_writelane_b32 v1, s81, 41
-; MUBUF-NEXT:    v_writelane_b32 v1, s82, 42
-; MUBUF-NEXT:    v_writelane_b32 v1, s83, 43
-; MUBUF-NEXT:    v_writelane_b32 v1, s84, 44
-; MUBUF-NEXT:    v_writelane_b32 v1, s85, 45
-; MUBUF-NEXT:    v_writelane_b32 v1, s86, 46
-; MUBUF-NEXT:    v_writelane_b32 v1, s87, 47
-; MUBUF-NEXT:    v_writelane_b32 v1, s88, 48
-; MUBUF-NEXT:    v_writelane_b32 v1, s89, 49
-; MUBUF-NEXT:    v_writelane_b32 v1, s90, 50
-; MUBUF-NEXT:    v_writelane_b32 v1, s91, 51
-; MUBUF-NEXT:    v_writelane_b32 v1, s92, 52
-; MUBUF-NEXT:    v_writelane_b32 v1, s93, 53
-; MUBUF-NEXT:    v_writelane_b32 v1, s94, 54
-; MUBUF-NEXT:    v_writelane_b32 v1, s95, 55
-; MUBUF-NEXT:    v_writelane_b32 v1, s96, 56
-; MUBUF-NEXT:    v_writelane_b32 v1, s97, 57
-; MUBUF-NEXT:    v_writelane_b32 v1, s98, 58
-; MUBUF-NEXT:    v_writelane_b32 v1, s99, 59
-; MUBUF-NEXT:    v_writelane_b32 v1, s100, 60
+; MUBUF-NEXT:    v_writelane_b32 v1, s48, 0
+; MUBUF-NEXT:    v_writelane_b32 v1, s49, 1
+; MUBUF-NEXT:    v_writelane_b32 v1, s50, 2
+; MUBUF-NEXT:    v_writelane_b32 v1, s51, 3
+; MUBUF-NEXT:    v_writelane_b32 v1, s52, 4
+; MUBUF-NEXT:    v_writelane_b32 v1, s53, 5
+; MUBUF-NEXT:    v_writelane_b32 v1, s54, 6
+; MUBUF-NEXT:    v_writelane_b32 v1, s55, 7
+; MUBUF-NEXT:    v_writelane_b32 v1, s64, 8
+; MUBUF-NEXT:    v_writelane_b32 v1, s65, 9
+; MUBUF-NEXT:    v_writelane_b32 v1, s66, 10
+; MUBUF-NEXT:    v_writelane_b32 v1, s67, 11
+; MUBUF-NEXT:    v_writelane_b32 v1, s68, 12
+; MUBUF-NEXT:    v_writelane_b32 v1, s69, 13
+; MUBUF-NEXT:    v_writelane_b32 v1, s70, 14
+; MUBUF-NEXT:    v_writelane_b32 v1, s71, 15
+; MUBUF-NEXT:    v_writelane_b32 v1, s80, 16
+; MUBUF-NEXT:    v_writelane_b32 v1, s81, 17
+; MUBUF-NEXT:    v_writelane_b32 v1, s82, 18
+; MUBUF-NEXT:    v_writelane_b32 v1, s83, 19
+; MUBUF-NEXT:    v_writelane_b32 v1, s84, 20
+; MUBUF-NEXT:    v_writelane_b32 v1, s85, 21
+; MUBUF-NEXT:    v_writelane_b32 v1, s86, 22
+; MUBUF-NEXT:    v_writelane_b32 v1, s87, 23
+; MUBUF-NEXT:    v_writelane_b32 v1, s96, 24
+; MUBUF-NEXT:    v_writelane_b32 v1, s97, 25
+; MUBUF-NEXT:    v_writelane_b32 v1, s98, 26
+; MUBUF-NEXT:    v_writelane_b32 v1, s99, 27
+; MUBUF-NEXT:    v_writelane_b32 v1, s100, 28
 ; MUBUF-NEXT:    v_mov_b32_e32 v0, 0
 ; MUBUF-NEXT:    buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill
-; MUBUF-NEXT:    v_writelane_b32 v1, s101, 61
+; MUBUF-NEXT:    v_writelane_b32 v1, s101, 29
 ; MUBUF-NEXT:    buffer_store_dword v0, off, s[0:3], s33 offset:4
 ; MUBUF-NEXT:    s_waitcnt vmcnt(0)
 ; MUBUF-NEXT:    ;;#ASMSTART
 ; MUBUF-NEXT:    ; clobber v41
 ; MUBUF-NEXT:    ;;#ASMEND
-; MUBUF-NEXT:    v_writelane_b32 v1, s102, 62
+; MUBUF-NEXT:    v_writelane_b32 v1, s102, 30
 ; MUBUF-NEXT:    ;;#ASMSTART
 ; MUBUF-NEXT:    ;;#ASMEND
 ; MUBUF-NEXT:    buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload
 ; MUBUF-NEXT:    s_addk_i32 s32, 0x400
-; MUBUF-NEXT:    v_readlane_b32 s102, v1, 62
-; MUBUF-NEXT:    v_readlane_b32 s101, v1, 61
-; MUBUF-NEXT:    v_readlane_b32 s100, v1, 60
-; MUBUF-NEXT:    v_readlane_b32 s99, v1, 59
-; MUBUF-NEXT:    v_readlane_b32 s98, v1, 58
-; MUBUF-NEXT:    v_readlane_b32 s97, v1, 57
-; MUBUF-NEXT:    v_readlane_b32 s96, v1, 56
-; MUBUF-NEXT:    v_readlane_b32 s95, v1, 55
-; MUBUF-NEXT:    v_readlane_b32 s94, v1, 54
-; MUBUF-NEXT:    v_readlane_b32 s93, v1, 53
-; MUBUF-NEXT:    v_readlane_b32 s92, v1, 52
-; MUBUF-NEXT:    v_readlane_b32 s91, v1, 51
-; MUBUF-NEXT:    v_readlane_b32 s90, v1, 50
-; MUBUF-NEXT:    v_readlane_b32 s89, v1, 49
-; MUBUF-NEXT:    v_readlane_b32 s88, v1, 48
-; MUBUF-NEXT:    v_readlane_b32 s87, v1, 47
-; MUBUF-NEXT:    v_readlane_b32 s86, v1, 46
-; MUBUF-NEXT:    v_readlane_b32 s85, v1, 45
-; MUBUF-NEXT:    v_readlane_b32 s84, v1, 44
-; MUBUF-NEXT:    v_readlane_b32 s83, v1, 43
-; MUBUF-NEXT:    v_readlane_b32 s82, v1, 42
-; MUBUF-NEXT:    v_readlane_b32 s81, v1, 41
-; MUBUF-NEXT:    v_readlane_b32 s80, v1, 40
-; MUBUF-NEXT:    v_readlane_b32 s79, v1, 39
-; MUBUF-NEXT:    v_readlane_b32 s78, v1, 38
-; MUBUF-NEXT:    v_readlane_b32 s77, v1, 37
-; MUBUF-NEXT:    v_readlane_b32 s76, v1, 36
-; MUBUF-NEXT:    v_readlane_b32 s75, v1, 35
-; MUBUF-NEXT:    v_readlane_b32 s74, v1, 34
-; MUBUF-NEXT:    v_readlane_b32 s73, v1, 33
-; MUBUF-NEXT:    v_readlane_b32 s72, v1, 32
-; MUBUF-NEXT:    v_readlane_b32 s71, v1, 31
-; MUBUF-NEXT:    v_readlane_b32 s70, v1, 30
-; MUBUF-NEXT:    v_readlane_b32 s69, v1, 29
-; MUBUF-NEXT:    v_readlane_b32 s68, v1, 28
-; MUBUF-NEXT:    v_readlane_b32 s67, v1, 27
-; MUBUF-NEXT:    v_readlane_b32 s66, v1, 26
-; MUBUF-NEXT:    v_readlane_b32 s65, v1, 25
-; MUBUF-NEXT:    v_readlane_b32 s64, v1, 24
-; MUBUF-NEXT:    v_readlane_b32 s63, v1, 23
-; MUBUF-NEXT:    v_readlane_b32 s62, v1, 22
-; MUBUF-NEXT:    v_readlane_b32 s61, v1, 21
-; MUBUF-NEXT:    v_readlane_b32 s60, v1, 20
-; MUBUF-NEXT:    v_readlane_b32 s59, v1, 19
-; MUBUF-NEXT:    v_readlane_b32 s58, v1, 18
-; MUBUF-NEXT:    v_readlane_b32 s57, v1, 17
-; MUBUF-NEXT:    v_readlane_b32 s56, v1, 16
-; MUBUF-NEXT:    v_readlane_b32 s55, v1, 15
-; MUBUF-NEXT:    v_readlane_b32 s54, v1, 14
-; MUBUF-NEXT:    v_readlane_b32 s53, v1, 13
-; MUBUF-NEXT:    v_readlane_b32 s52, v1, 12
-; MUBUF-NEXT:    v_readlane_b32 s51, v1, 11
-; MUBUF-NEXT:    v_readlane_b32 s50, v1, 10
-; MUBUF-NEXT:    v_readlane_b32 s49, v1, 9
-; MUBUF-NEXT:    v_readlane_b32 s48, v1, 8
-; MUBUF-NEXT:    v_readlane_b32 s47, v1, 7
-; MUBUF-NEXT:    v_readlane_b32 s46, v1, 6
-; MUBUF-NEXT:    v_readlane_b32 s45, v1, 5
-; MUBUF-NEXT:    v_readlane_b32 s44, v1, 4
-; MUBUF-NEXT:    v_readlane_b32 s43, v1, 3
-; MUBUF-NEXT:    v_readlane_b32 s42, v1, 2
-; MUBUF-NEXT:    v_readlane_b32 s41, v1, 1
-; MUBUF-NEXT:    v_readlane_b32 s40, v1, 0
+; MUBUF-NEXT:    v_readlane_b32 s102, v1, 30
+; MUBUF-NEXT:    v_readlane_b32 s101, v1, 29
+; MUBUF-NEXT:    v_readlane_b32 s100, v1, 28
+; MUBUF-NEXT:    v_readlane_b32 s99, v1, 27
+; MUBUF-NEXT:    v_readlane_b32 s98, v1, 26
+; MUBUF-NEXT:    v_readlane_b32 s97, v1, 25
+; MUBUF-NEXT:    v_readlane_b32 s96, v1, 24
+; MUBUF-NEXT:    v_readlane_b32 s87, v1, 23
+; MUBUF-NEXT:    v_readlane_b32 s86, v1, 22
+; MUBUF-NEXT:    v_readlane_b32 s85, v1, 21
+; MUBUF-NEXT:    v_readlane_b32 s84, v1, 20
+; MUBUF-NEXT:    v_readlane_b32 s83, v1, 19
+; MUBUF-NEXT:    v_readlane_b32 s82, v1, 18
+; MUBUF-NEXT:    v_readlane_b32 s81, v1, 17
+; MUBUF-NEXT:    v_readlane_b32 s80, v1, 16
+; MUBUF-NEXT:    v_readlane_b32 s71, v1, 15
+; MUBUF-NEXT:    v_readlane_b32 s70, v1, 14
+; MUBUF-NEXT:    v_readlane_b32 s69, v1, 13
+; MUBUF-NEXT:    v_readlane_b32 s68, v1, 12
+; MUBUF-NEXT:    v_readlane_b32 s67, v1, 11
+; MUBUF-NEXT:    v_readlane_b32 s66, v1, 10
+; MUBUF-NEXT:    v_readlane_b32 s65, v1, 9
+; MUBUF-NEXT:    v_readlane_b32 s64, v1, 8
+; MUBUF-NEXT:    v_readlane_b32 s55, v1, 7
+; MUBUF-NEXT:    v_readlane_b32 s54, v1, 6
+; MUBUF-NEXT:    v_readlane_b32 s53, v1, 5
+; MUBUF-NEXT:    v_readlane_b32 s52, v1, 4
+; MUBUF-NEXT:    v_readlane_b32 s51, v1, 3
+; MUBUF-NEXT:    v_readlane_b32 s50, v1, 2
+; MUBUF-NEXT:    v_readlane_b32 s49, v1, 1
+; MUBUF-NEXT:    v_readlane_b32 s48, v1, 0
 ; MUBUF-NEXT:    s_mov_b32 s32, s33
 ; MUBUF-NEXT:    s_xor_saveexec_b64 s[6:7], -1
 ; MUBUF-NEXT:    buffer_load_dword v1, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
@@ -816,143 +626,79 @@ define void @last_lane_vgpr_for_fp_csr() #1 {
 ; FLATSCR-NEXT:    s_xor_saveexec_b64 s[2:3], -1
 ; FLATSCR-NEXT:    scratch_store_dword off, v1, s33 offset:8 ; 4-byte Folded Spill
 ; FLATSCR-NEXT:    s_mov_b64 exec, s[2:3]
-; FLATSCR-NEXT:    v_writelane_b32 v1, s40, 0
-; FLATSCR-NEXT:    v_writelane_b32 v1, s41, 1
-; FLATSCR-NEXT:    v_writelane_b32 v1, s42, 2
-; FLATSCR-NEXT:    v_writelane_b32 v1, s43, 3
-; FLATSCR-NEXT:    v_writelane_b32 v1, s44, 4
-; FLATSCR-NEXT:    v_writelane_b32 v1, s45, 5
-; FLATSCR-NEXT:    v_writelane_b32 v1, s46, 6
-; FLATSCR-NEXT:    v_writelane_b32 v1, s47, 7
-; FLATSCR-NEXT:    v_writelane_b32 v1, s48, 8
-; FLATSCR-NEXT:    v_writelane_b32 v1, s49, 9
-; FLATSCR-NEXT:    v_writelane_b32 v1, s50, 10
-; FLATSCR-NEXT:    v_writelane_b32 v1, s51, 11
-; FLATSCR-NEXT:    v_writelane_b32 v1, s52, 12
-; FLATSCR-NEXT:    v_writelane_b32 v1, s53, 13
-; FLATSCR-NEXT:    v_writelane_b32 v1, s54, 14
-; FLATSCR-NEXT:    v_writelane_b32 v1, s55, 15
-; FLATSCR-NEXT:    v_writelane_b32 v1, s56, 16
-; FLATSCR-NEXT:    v_writelane_b32 v1, s57, 17
-; FLATSCR-NEXT:    v_writelane_b32 v1, s58, 18
-; FLATSCR-NEXT:    v_writelane_b32 v1, s59, 19
-; FLATSCR-NEXT:    v_writelane_b32 v1, s60, 20
-; FLATSCR-NEXT:    v_writelane_b32 v1, s61, 21
-; FLATSCR-NEXT:    v_writelane_b32 v1, s62, 22
-; FLATSCR-NEXT:    v_writelane_b32 v1, s63, 23
-; FLATSCR-NEXT:    v_writelane_b32 v1, s64, 24
-; FLATSCR-NEXT:    v_writelane_b32 v1, s65, 25
-; FLATSCR-NEXT:    v_writelane_b32 v1, s66, 26
-; FLATSCR-NEXT:    v_writelane_b32 v1, s67, 27
-; FLATSCR-NEXT:    v_writelane_b32 v1, s68, 28
-; FLATSCR-NEXT:    v_writelane_b32 v1, s69, 29
-; FLATSCR-NEXT:    v_writelane_b32 v1, s70, 30
-; FLATSCR-NEXT:    v_writelane_b32 v1, s71, 31
-; FLATSCR-NEXT:    v_writelane_b32 v1, s72, 32
-; FLATSCR-NEXT:    v_writelane_b32 v1, s73, 33
-; FLATSCR-NEXT:    v_writelane_b32 v1, s74, 34
-; FLATSCR-NEXT:    v_writelane_b32 v1, s75, 35
-; FLATSCR-NEXT:    v_writelane_b32 v1, s76, 36
-; FLATSCR-NEXT:    v_writelane_b32 v1, s77, 37
-; FLATSCR-NEXT:    v_writelane_b32 v1, s78, 38
-; FLATSCR-NEXT:    v_writelane_b32 v1, s79, 39
-; FLATSCR-NEXT:    v_writelane_b32 v1, s80, 40
-; FLATSCR-NEXT:    v_writelane_b32 v1, s81, 41
-; FLATSCR-NEXT:    v_writelane_b32 v1, s82, 42
-; FLATSCR-NEXT:    v_writelane_b32 v1, s83, 43
-; FLATSCR-NEXT:    v_writelane_b32 v1, s84, 44
-; FLATSCR-NEXT:    v_writelane_b32 v1, s85, 45
-; FLATSCR-NEXT:    v_writelane_b32 v1, s86, 46
-; FLATSCR-NEXT:    v_writelane_b32 v1, s87, 47
-; FLATSCR-NEXT:    v_writelane_b32 v1, s88, 48
-; FLATSCR-NEXT:    v_writelane_b32 v1, s89, 49
-; FLATSCR-NEXT:    v_writelane_b32 v1, s90, 50
-; FLATSCR-NEXT:    v_writelane_b32 v1, s91, 51
-; FLATSCR-NEXT:    v_writelane_b32 v1, s92, 52
-; FLATSCR-NEXT:    v_writelane_b32 v1, s93, 53
-; FLATSCR-NEXT:    v_writelane_b32 v1, s94, 54
-; FLATSCR-NEXT:    v_writelane_b32 v1, s95, 55
-; FLATSCR-NEXT:    v_writelane_b32 v1, s96, 56
-; FLATSCR-NEXT:    v_writelane_b32 v1, s97, 57
-; FLATSCR-NEXT:    v_writelane_b32 v1, s98, 58
-; FLATSCR-NEXT:    v_writelane_b32 v1, s99, 59
-; FLATSCR-NEXT:    v_writelane_b32 v1, s100, 60
+; FLATSCR-NEXT:    v_writelane_b32 v1, s48, 0
+; FLATSCR-NEXT:    v_writelane_b32 v1, s49, 1
+; FLATSCR-NEXT:    v_writelane_b32 v1, s50, 2
+; FLATSCR-NEXT:    v_writelane_b32 v1, s51, 3
+; FLATSCR-NEXT:    v_writelane_b32 v1, s52, 4
+; FLATSCR-NEXT:    v_writelane_b32 v1, s53, 5
+; FLATSCR-NEXT:    v_writelane_b32 v1, s54, 6
+; FLATSCR-NEXT:    v_writelane_b32 v1, s55, 7
+; FLATSCR-NEXT:    v_writelane_b32 v1, s64, 8
+; FLATSCR-NEXT:    v_writelane_b32 v1, s65, 9
+; FLATSCR-NEXT:    v_writelane_b32 v1, s66, 10
+; FLATSCR-NEXT:    v_writelane_b32 v1, s67, 11
+; FLATSCR-NEXT:    v_writelane_b32 v1, s68, 12
+; FLATSCR-NEXT:    v_writelane_b32 v1, s69, 13
+; FLATSCR-NEXT:    v_writelane_b32 v1, s70, 14
+; FLATSCR-NEXT:    v_writelane_b32 v1, s71, 15
+; FLATSCR-NEXT:    v_writelane_b32 v1, s80, 16
+; FLATSCR-NEXT:    v_writelane_b32 v1, s81, 17
+; FLATSCR-NEXT:    v_writelane_b32 v1, s82, 18
+; FLATSCR-NEXT:    v_writelane_b32 v1, s83, 19
+; FLATSCR-NEXT:    v_writelane_b32 v1, s84, 20
+; FLATSCR-NEXT:    v_writelane_b32 v1, s85, 21
+; FLATSCR-NEXT:    v_writelane_b32 v1, s86, 22
+; FLATSCR-NEXT:    v_writelane_b32 v1, s87, 23
+; FLATSCR-NEXT:    v_writelane_b32 v1, s96, 24
+; FLATSCR-NEXT:    v_writelane_b32 v1, s97, 25
+; FLATSCR-NEXT:    v_writelane_b32 v1, s98, 26
+; FLATSCR-NEXT:    v_writelane_b32 v1, s99, 27
+; FLATSCR-NEXT:    v_writelane_b32 v1, s100, 28
 ; FLATSCR-NEXT:    v_mov_b32_e32 v0, 0
 ; FLATSCR-NEXT:    scratch_store_dword off, v41, s33 ; 4-byte Folded Spill
-; FLATSCR-NEXT:    v_writelane_b32 v1, s101, 61
+; FLATSCR-NEXT:    v_writelane_b32 v1, s101, 29
 ; FLATSCR-NEXT:    scratch_store_dword off, v0, s33 offset:4
 ; FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; FLATSCR-NEXT:    ;;#ASMSTART
 ; FLATSCR-NEXT:    ; clobber v41
 ; FLATSCR-NEXT:    ;;#ASMEND
-; FLATSCR-NEXT:    v_writelane_b32 v1, s102, 62
+; FLATSCR-NEXT:    v_writelane_b32 v1, s102, 30
 ; FLATSCR-NEXT:    ;;#ASMSTART
 ; FLATSCR-NEXT:    ;;#ASMEND
 ; FLATSCR-NEXT:    scratch_load_dword v41, off, s33 ; 4-byte Folded Reload
 ; FLATSCR-NEXT:    s_add_i32 s32, s32, 16
-; FLATSCR-NEXT:    v_readlane_b32 s102, v1, 62
-; FLATSCR-NEXT:    v_readlane_b32 s101, v1, 61
-; FLATSCR-NEXT:    v_readlane_b32 s100, v1, 60
-; FLATSCR-NEXT:    v_readlane_b32 s99, v1, 59
-; FLATSCR-NEXT:    v_readlane_b32 s98, v1, 58
-; FLATSCR-NEXT:    v_readlane_b32 s97, v1, 57
-; FLATSCR-NEXT:    v_readlane_b32 s96, v1, 56
-; FLATSCR-NEXT:    v_readlane_b32 s95, v1, 55
-; FLATSCR-NEXT:    v_readlane_b32 s94, v1, 54
-; FLATSCR-NEXT:    v_readlane_b32 s93, v1, 53
-; FLATSCR-NEXT:    v_readlane_b32 s92, v1, 52
-; FLATSCR-NEXT:    v_readlane_b32 s91, v1, 51
-; FLATSCR-NEXT:    v_readlane_b32 s90, v1, 50
-; FLATSCR-NEXT:    v_readlane_b32 s89, v1, 49
-; FLATSCR-NEXT:    v_readlane_b32 s88, v1, 48
-; FLATSCR-NEXT:    v_readlane_b32 s87, v1, 47
-; FLATSCR-NEXT:    v_readlane_b32 s86, v1, 46
-; FLATSCR-NEXT:    v_readlane_b32 s85, v1, 45
-; FLATSCR-NEXT:    v_readlane_b32 s84, v1, 44
-; FLATSCR-NEXT:    v_readlane_b32 s83, v1, 43
-; FLATSCR-NEXT:    v_readlane_b32 s82, v1, 42
-; FLATSCR-NEXT:    v_readlane_b32 s81, v1, 41
-; FLATSCR-NEXT:    v_readlane_b32 s80, v1, 40
-; FLATSCR-NEXT:    v_readlane_b32 s79, v1, 39
-; FLATSCR-NEXT:    v_readlane_b32 s78, v1, 38
-; FLATSCR-NEXT:    v_readlane_b32 s77, v1, 37
-; FLATSCR-NEXT:    v_readlane_b32 s76, v1, 36
-; FLATSCR-NEXT:    v_readlane_b32 s75, v1, 35
-; FLATSCR-NEXT:    v_readlane_b32 s74, v1, 34
-; FLATSCR-NEXT:    v_readlane_b32 s73, v1, 33
-; FLATSCR-NEXT:    v_readlane_b32 s72, v1, 32
-; FLATSCR-NEXT:    v_readlane_b32 s71, v1, 31
-; FLATSCR-NEXT:    v_readlane_b32 s70, v1, 30
-; FLATSCR-NEXT:    v_readlane_b32 s69, v1, 29
-; FLATSCR-NEXT:    v_readlane_b32 s68, v1, 28
-; FLATSCR-NEXT:    v_readlane_b32 s67, v1, 27
-; FLATSCR-NEXT:    v_readlane_b32 s66, v1, 26
-; FLATSCR-NEXT:    v_readlane_b32 s65, v1, 25
-; FLATSCR-NEXT:    v_readlane_b32 s64, v1, 24
-; FLATSCR-NEXT:    v_readlane_b32 s63, v1, 23
-; FLATSCR-NEXT:    v_readlane_b32 s62, v1, 22
-; FLATSCR-NEXT:    v_readlane_b32 s61, v1, 21
-; FLATSCR-NEXT:    v_readlane_b32 s60, v1, 20
-; FLATSCR-NEXT:    v_readlane_b32 s59, v1, 19
-; FLATSCR-NEXT:    v_readlane_b32 s58, v1, 18
-; FLATSCR-NEXT:    v_readlane_b32 s57, v1, 17
-; FLATSCR-NEXT:    v_readlane_b32 s56, v1, 16
-; FLATSCR-NEXT:    v_readlane_b32 s55, v1, 15
-; FLATSCR-NEXT:    v_readlane_b32 s54, v1, 14
-; FLATSCR-NEXT:    v_readlane_b32 s53, v1, 13
-; FLATSCR-NEXT:    v_readlane_b32 s52, v1, 12
-; FLATSCR-NEXT:    v_readlane_b32 s51, v1, 11
-; FLATSCR-NEXT:    v_readlane_b32 s50, v1, 10
-; FLATSCR-NEXT:    v_readlane_b32 s49, v1, 9
-; FLATSCR-NEXT:    v_readlane_b32 s48, v1, 8
-; FLATSCR-NEXT:    v_readlane_b32 s47, v1, 7
-; FLATSCR-NEXT:    v_readlane_b32 s46, v1, 6
-; FLATSCR-NEXT:    v_readlane_b32 s45, v1, 5
-; FLATSCR-NEXT:    v_readlane_b32 s44, v1, 4
-; FLATSCR-NEXT:    v_readlane_b32 s43, v1, 3
-; FLATSCR-NEXT:    v_readlane_b32 s42, v1, 2
-; FLATSCR-NEXT:    v_readlane_b32 s41, v1, 1
-; FLATSCR-NEXT:    v_readlane_b32 s40, v1, 0
+; FLATSCR-NEXT:    v_readlane_b32 s102, v1, 30
+; FLATSCR-NEXT:    v_readlane_b32 s101, v1, 29
+; FLATSCR-NEXT:    v_readlane_b32 s100, v1, 28
+; FLATSCR-NEXT:    v_readlane_b32 s99, v1, 27
+; FLATSCR-NEXT:    v_readlane_b32 s98, v1, 26
+; FLATSCR-NEXT:    v_readlane_b32 s97, v1, 25
+; FLATSCR-NEXT:    v_readlane_b32 s96, v1, 24
+; FLATSCR-NEXT:    v_readlane_b32 s87, v1, 23
+; FLATSCR-NEXT:    v_readlane_b32 s86, v1, 22
+; FLATSCR-NEXT:    v_readlane_b32 s85, v1, 21
+; FLATSCR-NEXT:    v_readlane_b32 s84, v1, 20
+; FLATSCR-NEXT:    v_readlane_b32 s83, v1, 19
+; FLATSCR-NEXT:    v_readlane_b32 s82, v1, 18
+; FLATSCR-NEXT:    v_readlane_b32 s81, v1, 17
+; FLATSCR-NEXT:    v_readlane_b32 s80, v1, 16
+; FLATSCR-NEXT:    v_readlane_b32 s71, v1, 15
+; FLATSCR-NEXT:    v_readlane_b32 s70, v1, 14
+; FLATSCR-NEXT:    v_readlane_b32 s69, v1, 13
+; FLATSCR-NEXT:    v_readlane_b32 s68, v1, 12
+; FLATSCR-NEXT:    v_readlane_b32 s67, v1, 11
+; FLATSCR-NEXT:    v_readlane_b32 s66, v1, 10
+; FLATSCR-NEXT:    v_readlane_b32 s65, v1, 9
+; FLATSCR-NEXT:    v_readlane_b32 s64, v1, 8
+; FLATSCR-NEXT:    v_readlane_b32 s55, v1, 7
+; FLATSCR-NEXT:    v_readlane_b32 s54, v1, 6
+; FLATSCR-NEXT:    v_readlane_b32 s53, v1, 5
+; FLATSCR-NEXT:    v_readlane_b32 s52, v1, 4
+; FLATSCR-NEXT:    v_readlane_b32 s51, v1, 3
+; FLATSCR-NEXT:    v_readlane_b32 s50, v1, 2
+; FLATSCR-NEXT:    v_readlane_b32 s49, v1, 1
+; FLATSCR-NEXT:    v_readlane_b32 s48, v1, 0
 ; FLATSCR-NEXT:    s_mov_b32 s32, s33
 ; FLATSCR-NEXT:    s_xor_saveexec_b64 s[2:3], -1
 ; FLATSCR-NEXT:    scratch_load_dword v1, off, s33 offset:8 ; 4-byte Folded Reload
@@ -986,143 +732,79 @@ define void @no_new_vgpr_for_fp_csr() #1 {
 ; MUBUF-NEXT:    buffer_store_dword v1, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
 ; MUBUF-NEXT:    s_mov_b64 exec, s[6:7]
 ; MUBUF-NEXT:    v_writelane_b32 v1, s39, 0
-; MUBUF-NEXT:    v_writelane_b32 v1, s40, 1
-; MUBUF-NEXT:    v_writelane_b32 v1, s41, 2
-; MUBUF-NEXT:    v_writelane_b32 v1, s42, 3
-; MUBUF-NEXT:    v_writelane_b32 v1, s43, 4
-; MUBUF-NEXT:    v_writelane_b32 v1, s44, 5
-; MUBUF-NEXT:    v_writelane_b32 v1, s45, 6
-; MUBUF-NEXT:    v_writelane_b32 v1, s46, 7
-; MUBUF-NEXT:    v_writelane_b32 v1, s47, 8
-; MUBUF-NEXT:    v_writelane_b32 v1, s48, 9
-; MUBUF-NEXT:    v_writelane_b32 v1, s49, 10
-; MUBUF-NEXT:    v_writelane_b32 v1, s50, 11
-; MUBUF-NEXT:    v_writelane_b32 v1, s51, 12
-; MUBUF-NEXT:    v_writelane_b32 v1, s52, 13
-; MUBUF-NEXT:    v_writelane_b32 v1, s53, 14
-; MUBUF-NEXT:    v_writelane_b32 v1, s54, 15
-; MUBUF-NEXT:    v_writelane_b32 v1, s55, 16
-; MUBUF-NEXT:    v_writelane_b32 v1, s56, 17
-; MUBUF-NEXT:    v_writelane_b32 v1, s57, 18
-; MUBUF-NEXT:    v_writelane_b32 v1, s58, 19
-; MUBUF-NEXT:    v_writelane_b32 v1, s59, 20
-; MUBUF-NEXT:    v_writelane_b32 v1, s60, 21
-; MUBUF-NEXT:    v_writelane_b32 v1, s61, 22
-; MUBUF-NEXT:    v_writelane_b32 v1, s62, 23
-; MUBUF-NEXT:    v_writelane_b32 v1, s63, 24
-; MUBUF-NEXT:    v_writelane_b32 v1, s64, 25
-; MUBUF-NEXT:    v_writelane_b32 v1, s65, 26
-; MUBUF-NEXT:    v_writelane_b32 v1, s66, 27
-; MUBUF-NEXT:    v_writelane_b32 v1, s67, 28
-; MUBUF-NEXT:    v_writelane_b32 v1, s68, 29
-; MUBUF-NEXT:    v_writelane_b32 v1, s69, 30
-; MUBUF-NEXT:    v_writelane_b32 v1, s70, 31
-; MUBUF-NEXT:    v_writelane_b32 v1, s71, 32
-; MUBUF-NEXT:    v_writelane_b32 v1, s72, 33
-; MUBUF-NEXT:    v_writelane_b32 v1, s73, 34
-; MUBUF-NEXT:    v_writelane_b32 v1, s74, 35
-; MUBUF-NEXT:    v_writelane_b32 v1, s75, 36
-; MUBUF-NEXT:    v_writelane_b32 v1, s76, 37
-; MUBUF-NEXT:    v_writelane_b32 v1, s77, 38
-; MUBUF-NEXT:    v_writelane_b32 v1, s78, 39
-; MUBUF-NEXT:    v_writelane_b32 v1, s79, 40
-; MUBUF-NEXT:    v_writelane_b32 v1, s80, 41
-; MUBUF-NEXT:    v_writelane_b32 v1, s81, 42
-; MUBUF-NEXT:    v_writelane_b32 v1, s82, 43
-; MUBUF-NEXT:    v_writelane_b32 v1, s83, 44
-; MUBUF-NEXT:    v_writelane_b32 v1, s84, 45
-; MUBUF-NEXT:    v_writelane_b32 v1, s85, 46
-; MUBUF-NEXT:    v_writelane_b32 v1, s86, 47
-; MUBUF-NEXT:    v_writelane_b32 v1, s87, 48
-; MUBUF-NEXT:    v_writelane_b32 v1, s88, 49
-; MUBUF-NEXT:    v_writelane_b32 v1, s89, 50
-; MUBUF-NEXT:    v_writelane_b32 v1, s90, 51
-; MUBUF-NEXT:    v_writelane_b32 v1, s91, 52
-; MUBUF-NEXT:    v_writelane_b32 v1, s92, 53
-; MUBUF-NEXT:    v_writelane_b32 v1, s93, 54
-; MUBUF-NEXT:    v_writelane_b32 v1, s94, 55
-; MUBUF-NEXT:    v_writelane_b32 v1, s95, 56
-; MUBUF-NEXT:    v_writelane_b32 v1, s96, 57
-; MUBUF-NEXT:    v_writelane_b32 v1, s97, 58
-; MUBUF-NEXT:    v_writelane_b32 v1, s98, 59
-; MUBUF-NEXT:    v_writelane_b32 v1, s99, 60
-; MUBUF-NEXT:    v_writelane_b32 v1, s100, 61
+; MUBUF-NEXT:    v_writelane_b32 v1, s48, 1
+; MUBUF-NEXT:    v_writelane_b32 v1, s49, 2
+; MUBUF-NEXT:    v_writelane_b32 v1, s50, 3
+; MUBUF-NEXT:    v_writelane_b32 v1, s51, 4
+; MUBUF-NEXT:    v_writelane_b32 v1, s52, 5
+; MUBUF-NEXT:    v_writelane_b32 v1, s53, 6
+; MUBUF-NEXT:    v_writelane_b32 v1, s54, 7
+; MUBUF-NEXT:    v_writelane_b32 v1, s55, 8
+; MUBUF-NEXT:    v_writelane_b32 v1, s64, 9
+; MUBUF-NEXT:    v_writelane_b32 v1, s65, 10
+; MUBUF-NEXT:    v_writelane_b32 v1, s66, 11
+; MUBUF-NEXT:    v_writelane_b32 v1, s67, 12
+; MUBUF-NEXT:    v_writelane_b32 v1, s68, 13
+; MUBUF-NEXT:    v_writelane_b32 v1, s69, 14
+; MUBUF-NEXT:    v_writelane_b32 v1, s70, 15
+; MUBUF-NEXT:    v_writelane_b32 v1, s71, 16
+; MUBUF-NEXT:    v_writelane_b32 v1, s80, 17
+; MUBUF-NEXT:    v_writelane_b32 v1, s81, 18
+; MUBUF-NEXT:    v_writelane_b32 v1, s82, 19
+; MUBUF-NEXT:    v_writelane_b32 v1, s83, 20
+; MUBUF-NEXT:    v_writelane_b32 v1, s84, 21
+; MUBUF-NEXT:    v_writelane_b32 v1, s85, 22
+; MUBUF-NEXT:    v_writelane_b32 v1, s86, 23
+; MUBUF-NEXT:    v_writelane_b32 v1, s87, 24
+; MUBUF-NEXT:    v_writelane_b32 v1, s96, 25
+; MUBUF-NEXT:    v_writelane_b32 v1, s97, 26
+; MUBUF-NEXT:    v_writelane_b32 v1, s98, 27
+; MUBUF-NEXT:    v_writelane_b32 v1, s99, 28
+; MUBUF-NEXT:    v_writelane_b32 v1, s100, 29
 ; MUBUF-NEXT:    v_mov_b32_e32 v0, 0
 ; MUBUF-NEXT:    buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill
-; MUBUF-NEXT:    v_writelane_b32 v1, s101, 62
+; MUBUF-NEXT:    v_writelane_b32 v1, s101, 30
 ; MUBUF-NEXT:    buffer_store_dword v0, off, s[0:3], s33 offset:4
 ; MUBUF-NEXT:    s_waitcnt vmcnt(0)
 ; MUBUF-NEXT:    ;;#ASMSTART
 ; MUBUF-NEXT:    ; clobber v41
 ; MUBUF-NEXT:    ;;#ASMEND
-; MUBUF-NEXT:    v_writelane_b32 v1, s102, 63
+; MUBUF-NEXT:    v_writelane_b32 v1, s102, 31
 ; MUBUF-NEXT:    ;;#ASMSTART
 ; MUBUF-NEXT:    ;;#ASMEND
 ; MUBUF-NEXT:    buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload
 ; MUBUF-NEXT:    s_addk_i32 s32, 0x400
-; MUBUF-NEXT:    v_readlane_b32 s102, v1, 63
-; MUBUF-NEXT:    v_readlane_b32 s101, v1, 62
-; MUBUF-NEXT:    v_readlane_b32 s100, v1, 61
-; MUBUF-NEXT:    v_readlane_b32 s99, v1, 60
-; MUBUF-NEXT:    v_readlane_b32 s98, v1, 59
-; MUBUF-NEXT:    v_readlane_b32 s97, v1, 58
-; MUBUF-NEXT:    v_readlane_b32 s96, v1, 57
-; MUBUF-NEXT:    v_readlane_b32 s95, v1, 56
-; MUBUF-NEXT:    v_readlane_b32 s94, v1, 55
-; MUBUF-NEXT:    v_readlane_b32 s93, v1, 54
-; MUBUF-NEXT:    v_readlane_b32 s92, v1, 53
-; MUBUF-NEXT:    v_readlane_b32 s91, v1, 52
-; MUBUF-NEXT:    v_readlane_b32 s90, v1, 51
-; MUBUF-NEXT:    v_readlane_b32 s89, v1, 50
-; MUBUF-NEXT:    v_readlane_b32 s88, v1, 49
-; MUBUF-NEXT:    v_readlane_b32 s87, v1, 48
-; MUBUF-NEXT:    v_readlane_b32 s86, v1, 47
-; MUBUF-NEXT:    v_readlane_b32 s85, v1, 46
-; MUBUF-NEXT:    v_readlane_b32 s84, v1, 45
-; MUBUF-NEXT:    v_readlane_b32 s83, v1, 44
-; MUBUF-NEXT:    v_readlane_b32 s82, v1, 43
-; MUBUF-NEXT:    v_readlane_b32 s81, v1, 42
-; MUBUF-NEXT:    v_readlane_b32 s80, v1, 41
-; MUBUF-NEXT:    v_readlane_b32 s79, v1, 40
-; MUBUF-NEXT:    v_readlane_b32 s78, v1, 39
-; MUBUF-NEXT:    v_readlane_b32 s77, v1, 38
-; MUBUF-NEXT:    v_readlane_b32 s76, v1, 37
-; MUBUF-NEXT:    v_readlane_b32 s75, v1, 36
-; MUBUF-NEXT:    v_readlane_b32 s74, v1, 35
-; MUBUF-NEXT:    v_readlane_b32 s73, v1, 34
-; MUBUF-NEXT:    v_readlane_b32 s72, v1, 33
-; MUBUF-NEXT:    v_readlane_b32 s71, v1, 32
-; MUBUF-NEXT:    v_readlane_b32 s70, v1, 31
-; MUBUF-NEXT:    v_readlane_b32 s69, v1, 30
-; MUBUF-NEXT:    v_readlane_b32 s68, v1, 29
-; MUBUF-NEXT:    v_readlane_b32 s67, v1, 28
-; MUBUF-NEXT:    v_readlane_b32 s66, v1, 27
-; MUBUF-NEXT:    v_readlane_b32 s65, v1, 26
-; MUBUF-NEXT:    v_readlane_b32 s64, v1, 25
-; MUBUF-NEXT:    v_readlane_b32 s63, v1, 24
-; MUBUF-NEXT:    v_readlane_b32 s62, v1, 23
-; MUBUF-NEXT:    v_readlane_b32 s61, v1, 22
-; MUBUF-NEXT:    v_readlane_b32 s60, v1, 21
-; MUBUF-NEXT:    v_readlane_b32 s59, v1, 20
-; MUBUF-NEXT:    v_readlane_b32 s58, v1, 19
-; MUBUF-NEXT:    v_readlane_b32 s57, v1, 18
-; MUBUF-NEXT:    v_readlane_b32 s56, v1, 17
-; MUBUF-NEXT:    v_readlane_b32 s55, v1, 16
-; MUBUF-NEXT:    v_readlane_b32 s54, v1, 15
-; MUBUF-NEXT:    v_readlane_b32 s53, v1, 14
-; MUBUF-NEXT:    v_readlane_b32 s52, v1, 13
-; MUBUF-NEXT:    v_readlane_b32 s51, v1, 12
-; MUBUF-NEXT:    v_readlane_b32 s50, v1, 11
-; MUBUF-NEXT:    v_readlane_b32 s49, v1, 10
-; MUBUF-NEXT:    v_readlane_b32 s48, v1, 9
-; MUBUF-NEXT:    v_readlane_b32 s47, v1, 8
-; MUBUF-NEXT:    v_readlane_b32 s46, v1, 7
-; MUBUF-NEXT:    v_readlane_b32 s45, v1, 6
-; MUBUF-NEXT:    v_readlane_b32 s44, v1, 5
-; MUBUF-NEXT:    v_readlane_b32 s43, v1, 4
-; MUBUF-NEXT:    v_readlane_b32 s42, v1, 3
-; MUBUF-NEXT:    v_readlane_b32 s41, v1, 2
-; MUBUF-NEXT:    v_readlane_b32 s40, v1, 1
+; MUBUF-NEXT:    v_readlane_b32 s102, v1, 31
+; MUBUF-NEXT:    v_readlane_b32 s101, v1, 30
+; MUBUF-NEXT:    v_readlane_b32 s100, v1, 29
+; MUBUF-NEXT:    v_readlane_b32 s99, v1, 28
+; MUBUF-NEXT:    v_readlane_b32 s98, v1, 27
+; MUBUF-NEXT:    v_readlane_b32 s97, v1, 26
+; MUBUF-NEXT:    v_readlane_b32 s96, v1, 25
+; MUBUF-NEXT:    v_readlane_b32 s87, v1, 24
+; MUBUF-NEXT:    v_readlane_b32 s86, v1, 23
+; MUBUF-NEXT:    v_readlane_b32 s85, v1, 22
+; MUBUF-NEXT:    v_readlane_b32 s84, v1, 21
+; MUBUF-NEXT:    v_readlane_b32 s83, v1, 20
+; MUBUF-NEXT:    v_readlane_b32 s82, v1, 19
+; MUBUF-NEXT:    v_readlane_b32 s81, v1, 18
+; MUBUF-NEXT:    v_readlane_b32 s80, v1, 17
+; MUBUF-NEXT:    v_readlane_b32 s71, v1, 16
+; MUBUF-NEXT:    v_readlane_b32 s70, v1, 15
+; MUBUF-NEXT:    v_readlane_b32 s69, v1, 14
+; MUBUF-NEXT:    v_readlane_b32 s68, v1, 13
+; MUBUF-NEXT:    v_readlane_b32 s67, v1, 12
+; MUBUF-NEXT:    v_readlane_b32 s66, v1, 11
+; MUBUF-NEXT:    v_readlane_b32 s65, v1, 10
+; MUBUF-NEXT:    v_readlane_b32 s64, v1, 9
+; MUBUF-NEXT:    v_readlane_b32 s55, v1, 8
+; MUBUF-NEXT:    v_readlane_b32 s54, v1, 7
+; MUBUF-NEXT:    v_readlane_b32 s53, v1, 6
+; MUBUF-NEXT:    v_readlane_b32 s52, v1, 5
+; MUBUF-NEXT:    v_readlane_b32 s51, v1, 4
+; MUBUF-NEXT:    v_readlane_b32 s50, v1, 3
+; MUBUF-NEXT:    v_readlane_b32 s49, v1, 2
+; MUBUF-NEXT:    v_readlane_b32 s48, v1, 1
 ; MUBUF-NEXT:    v_readlane_b32 s39, v1, 0
 ; MUBUF-NEXT:    s_mov_b32 s32, s33
 ; MUBUF-NEXT:    s_xor_saveexec_b64 s[6:7], -1
@@ -1141,143 +823,79 @@ define void @no_new_vgpr_for_fp_csr() #1 {
 ; FLATSCR-NEXT:    scratch_store_dword off, v1, s33 offset:8 ; 4-byte Folded Spill
 ; FLATSCR-NEXT:    s_mov_b64 exec, s[2:3]
 ; FLATSCR-NEXT:    v_writelane_b32 v1, s39, 0
-; FLATSCR-NEXT:    v_writelane_b32 v1, s40, 1
-; FLATSCR-NEXT:    v_writelane_b32 v1, s41, 2
-; FLATSCR-NEXT:    v_writelane_b32 v1, s42, 3
-; FLATSCR-NEXT:    v_writelane_b32 v1, s43, 4
-; FLATSCR-NEXT:    v_writelane_b32 v1, s44, 5
-; FLATSCR-NEXT:    v_writelane_b32 v1, s45, 6
-; FLATSCR-NEXT:    v_writelane_b32 v1, s46, 7
-; FLATSCR-NEXT:    v_writelane_b32 v1, s47, 8
-; FLATSCR-NEXT:    v_writelane_b32 v1, s48, 9
-; FLATSCR-NEXT:    v_writelane_b32 v1, s49, 10
-; FLATSCR-NEXT:    v_writelane_b32 v1, s50, 11
-; FLATSCR-NEXT:    v_writelane_b32 v1, s51, 12
-; FLATSCR-NEXT:    v_writelane_b32 v1, s52, 13
-; FLATSCR-NEXT:    v_writelane_b32 v1, s53, 14
-; FLATSCR-NEXT:    v_writelane_b32 v1, s54, 15
-; FLATSCR-NEXT:    v_writelane_b32 v1, s55, 16
-; FLATSCR-NEXT:    v_writelane_b32 v1, s56, 17
-; FLATSCR-NEXT:    v_writelane_b32 v1, s57, 18
-; FLATSCR-NEXT:    v_writelane_b32 v1, s58, 19
-; FLATSCR-NEXT:    v_writelane_b32 v1, s59, 20
-; FLATSCR-NEXT:    v_writelane_b32 v1, s60, 21
-; FLATSCR-NEXT:    v_writelane_b32 v1, s61, 22
-; FLATSCR-NEXT:    v_writelane_b32 v1, s62, 23
-; FLATSCR-NEXT:    v_writelane_b32 v1, s63, 24
-; FLATSCR-NEXT:    v_writelane_b32 v1, s64, 25
-; FLATSCR-NEXT:    v_writelane_b32 v1, s65, 26
-; FLATSCR-NEXT:    v_writelane_b32 v1, s66, 27
-; FLATSCR-NEXT:    v_writelane_b32 v1, s67, 28
-; FLATSCR-NEXT:    v_writelane_b32 v1, s68, 29
-; FLATSCR-NEXT:    v_writelane_b32 v1, s69, 30
-; FLATSCR-NEXT:    v_writelane_b32 v1, s70, 31
-; FLATSCR-NEXT:    v_writelane_b32 v1, s71, 32
-; FLATSCR-NEXT:    v_writelane_b32 v1, s72, 33
-; FLATSCR-NEXT:    v_writelane_b32 v1, s73, 34
-; FLATSCR-NEXT:    v_writelane_b32 v1, s74, 35
-; FLATSCR-NEXT:    v_writelane_b32 v1, s75, 36
-; FLATSCR-NEXT:    v_writelane_b32 v1, s76, 37
-; FLATSCR-NEXT:    v_writelane_b32 v1, s77, 38
-; FLATSCR-NEXT:    v_writelane_b32 v1, s78, 39
-; FLATSCR-NEXT:    v_writelane_b32 v1, s79, 40
-; FLATSCR-NEXT:    v_writelane_b32 v1, s80, 41
-; FLATSCR-NEXT:    v_writelane_b32 v1, s81, 42
-; FLATSCR-NEXT:    v_writelane_b32 v1, s82, 43
-; FLATSCR-NEXT:    v_writelane_b32 v1, s83, 44
-; FLATSCR-NEXT:    v_writelane_b32 v1, s84, 45
-; FLATSCR-NEXT:    v_writelane_b32 v1, s85, 46
-; FLATSCR-NEXT:    v_writelane_b32 v1, s86, 47
-; FLATSCR-NEXT:    v_writelane_b32 v1, s87, 48
-; FLATSCR-NEXT:    v_writelane_b32 v1, s88, 49
-; FLATSCR-NEXT:    v_writelane_b32 v1, s89, 50
-; FLATSCR-NEXT:    v_writelane_b32 v1, s90, 51
-; FLATSCR-NEXT:    v_writelane_b32 v1, s91, 52
-; FLATSCR-NEXT:    v_writelane_b32 v1, s92, 53
-; FLATSCR-NEXT:    v_writelane_b32 v1, s93, 54
-; FLATSCR-NEXT:    v_writelane_b32 v1, s94, 55
-; FLATSCR-NEXT:    v_writelane_b32 v1, s95, 56
-; FLATSCR-NEXT:    v_writelane_b32 v1, s96, 57
-; FLATSCR-NEXT:    v_writelane_b32 v1, s97, 58
-; FLATSCR-NEXT:    v_writelane_b32 v1, s98, 59
-; FLATSCR-NEXT:    v_writelane_b32 v1, s99, 60
-; FLATSCR-NEXT:    v_writelane_b32 v1, s100, 61
+; FLATSCR-NEXT:    v_writelane_b32 v1, s48, 1
+; FLATSCR-NEXT:    v_writelane_b32 v1, s49, 2
+; FLATSCR-NEXT:    v_writelane_b32 v1, s50, 3
+; FLATSCR-NEXT:    v_writelane_b32 v1, s51, 4
+; FLATSCR-NEXT:    v_writelane_b32 v1, s52, 5
+; FLATSCR-NEXT:    v_writelane_b32 v1, s53, 6
+; FLATSCR-NEXT:    v_writelane_b32 v1, s54, 7
+; FLATSCR-NEXT:    v_writelane_b32 v1, s55, 8
+; FLATSCR-NEXT:    v_writelane_b32 v1, s64, 9
+; FLATSCR-NEXT:    v_writelane_b32 v1, s65, 10
+; FLATSCR-NEXT:    v_writelane_b32 v1, s66, 11
+; FLATSCR-NEXT:    v_writelane_b32 v1, s67, 12
+; FLATSCR-NEXT:    v_writelane_b32 v1, s68, 13
+; FLATSCR-NEXT:    v_writelane_b32 v1, s69, 14
+; FLATSCR-NEXT:    v_writelane_b32 v1, s70, 15
+; FLATSCR-NEXT:    v_writelane_b32 v1, s71, 16
+; FLATSCR-NEXT:    v_writelane_b32 v1, s80, 17
+; FLATSCR-NEXT:    v_writelane_b32 v1, s81, 18
+; FLATSCR-NEXT:    v_writelane_b32 v1, s82, 19
+; FLATSCR-NEXT:    v_writelane_b32 v1, s83, 20
+; FLATSCR-NEXT:    v_writelane_b32 v1, s84, 21
+; FLATSCR-NEXT:    v_writelane_b32 v1, s85, 22
+; FLATSCR-NEXT:    v_writelane_b32 v1, s86, 23
+; FLATSCR-NEXT:    v_writelane_b32 v1, s87, 24
+; FLATSCR-NEXT:    v_writelane_b32 v1, s96, 25
+; FLATSCR-NEXT:    v_writelane_b32 v1, s97, 26
+; FLATSCR-NEXT:    v_writelane_b32 v1, s98, 27
+; FLATSCR-NEXT:    v_writelane_b32 v1, s99, 28
+; FLATSCR-NEXT:    v_writelane_b32 v1, s100, 29
 ; FLATSCR-NEXT:    v_mov_b32_e32 v0, 0
 ; FLATSCR-NEXT:    scratch_store_dword off, v41, s33 ; 4-byte Folded Spill
-; FLATSCR-NEXT:    v_writelane_b32 v1, s101, 62
+; FLATSCR-NEXT:    v_writelane_b32 v1, s101, 30
 ; FLATSCR-NEXT:    scratch_store_dword off, v0, s33 offset:4
 ; FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; FLATSCR-NEXT:    ;;#ASMSTART
 ; FLATSCR-NEXT:    ; clobber v41
 ; FLATSCR-NEXT:    ;;#ASMEND
-; FLATSCR-NEXT:    v_writelane_b32 v1, s102, 63
+; FLATSCR-NEXT:    v_writelane_b32 v1, s102, 31
 ; FLATSCR-NEXT:    ;;#ASMSTART
 ; FLATSCR-NEXT:    ;;#ASMEND
 ; FLATSCR-NEXT:    scratch_load_dword v41, off, s33 ; 4-byte Folded Reload
 ; FLATSCR-NEXT:    s_add_i32 s32, s32, 16
-; FLATSCR-NEXT:    v_readlane_b32 s102, v1, 63
-; FLATSCR-NEXT:    v_readlane_b32 s101, v1, 62
-; FLATSCR-NEXT:    v_readlane_b32 s100, v1, 61
-; FLATSCR-NEXT:    v_readlane_b32 s99, v1, 60
-; FLATSCR-NEXT:    v_readlane_b32 s98, v1, 59
-; FLATSCR-NEXT:    v_readlane_b32 s97, v1, 58
-; FLATSCR-NEXT:    v_readlane_b32 s96, v1, 57
-; FLATSCR-NEXT:    v_readlane_b32 s95, v1, 56
-; FLATSCR-NEXT:    v_readlane_b32 s94, v1, 55
-; FLATSCR-NEXT:    v_readlane_b32 s93, v1, 54
-; FLATSCR-NEXT:    v_readlane_b32 s92, v1, 53
-; FLATSCR-NEXT:    v_readlane_b32 s91, v1, 52
-; FLATSCR-NEXT:    v_readlane_b32 s90, v1, 51
-; FLATSCR-NEXT:    v_readlane_b32 s89, v1, 50
-; FLATSCR-NEXT:    v_readlane_b32 s88, v1, 49
-; FLATSCR-NEXT:    v_readlane_b32 s87, v1, 48
-; FLATSCR-NEXT:    v_readlane_b32 s86, v1, 47
-; FLATSCR-NEXT:    v_readlane_b32 s85, v1, 46
-; FLATSCR-NEXT:    v_readlane_b32 s84, v1, 45
-; FLATSCR-NEXT:    v_readlane_b32 s83, v1, 44
-; FLATSCR-NEXT:    v_readlane_b32 s82, v1, 43
-; FLATSCR-NEXT:    v_readlane_b32 s81, v1, 42
-; FLATSCR-NEXT:    v_readlane_b32 s80, v1, 41
-; FLATSCR-NEXT:    v_readlane_b32 s79, v1, 40
-; FLATSCR-NEXT:    v_readlane_b32 s78, v1, 39
-; FLATSCR-NEXT:    v_readlane_b32 s77, v1, 38
-; FLATSCR-NEXT:    v_readlane_b32 s76, v1, 37
-; FLATSCR-NEXT:    v_readlane_b32 s75, v1, 36
-; FLATSCR-NEXT:    v_readlane_b32 s74, v1, 35
-; FLATSCR-NEXT:    v_readlane_b32 s73, v1, 34
-; FLATSCR-NEXT:    v_readlane_b32 s72, v1, 33
-; FLATSCR-NEXT:    v_readlane_b32 s71, v1, 32
-; FLATSCR-NEXT:    v_readlane_b32 s70, v1, 31
-; FLATSCR-NEXT:    v_readlane_b32 s69, v1, 30
-; FLATSCR-NEXT:    v_readlane_b32 s68, v1, 29
-; FLATSCR-NEXT:    v_readlane_b32 s67, v1, 28
-; FLATSCR-NEXT:    v_readlane_b32 s66, v1, 27
-; FLATSCR-NEXT:    v_readlane_b32 s65, v1, 26
-; FLATSCR-NEXT:    v_readlane_b32 s64, v1, 25
-; FLATSCR-NEXT:    v_readlane_b32 s63, v1, 24
-; FLATSCR-NEXT:    v_readlane_b32 s62, v1, 23
-; FLATSCR-NEXT:    v_readlane_b32 s61, v1, 22
-; FLATSCR-NEXT:    v_readlane_b32 s60, v1, 21
-; FLATSCR-NEXT:    v_readlane_b32 s59, v1, 20
-; FLATSCR-NEXT:    v_readlane_b32 s58, v1, 19
-; FLATSCR-NEXT:    v_readlane_b32 s57, v1, 18
-; FLATSCR-NEXT:    v_readlane_b32 s56, v1, 17
-; FLATSCR-NEXT:    v_readlane_b32 s55, v1, 16
-; FLATSCR-NEXT:    v_readlane_b32 s54, v1, 15
-; FLATSCR-NEXT:    v_readlane_b32 s53, v1, 14
-; FLATSCR-NEXT:    v_readlane_b32 s52, v1, 13
-; FLATSCR-NEXT:    v_readlane_b32 s51, v1, 12
-; FLATSCR-NEXT:    v_readlane_b32 s50, v1, 11
-; FLATSCR-NEXT:    v_readlane_b32 s49, v1, 10
-; FLATSCR-NEXT:    v_readlane_b32 s48, v1, 9
-; FLATSCR-NEXT:    v_readlane_b32 s47, v1, 8
-; FLATSCR-NEXT:    v_readlane_b32 s46, v1, 7
-; FLATSCR-NEXT:    v_readlane_b32 s45, v1, 6
-; FLATSCR-NEXT:    v_readlane_b32 s44, v1, 5
-; FLATSCR-NEXT:    v_readlane_b32 s43, v1, 4
-; FLATSCR-NEXT:    v_readlane_b32 s42, v1, 3
-; FLATSCR-NEXT:    v_readlane_b32 s41, v1, 2
-; FLATSCR-NEXT:    v_readlane_b32 s40, v1, 1
+; FLATSCR-NEXT:    v_readlane_b32 s102, v1, 31
+; FLATSCR-NEXT:    v_readlane_b32 s101, v1, 30
+; FLATSCR-NEXT:    v_readlane_b32 s100, v1, 29
+; FLATSCR-NEXT:    v_readlane_b32 s99, v1, 28
+; FLATSCR-NEXT:    v_readlane_b32 s98, v1, 27
+; FLATSCR-NEXT:    v_readlane_b32 s97, v1, 26
+; FLATSCR-NEXT:    v_readlane_b32 s96, v1, 25
+; FLATSCR-NEXT:    v_readlane_b32 s87, v1, 24
+; FLATSCR-NEXT:    v_readlane_b32 s86, v1, 23
+; FLATSCR-NEXT:    v_readlane_b32 s85, v1, 22
+; FLATSCR-NEXT:    v_readlane_b32 s84, v1, 21
+; FLATSCR-NEXT:    v_readlane_b32 s83, v1, 20
+; FLATSCR-NEXT:    v_readlane_b32 s82, v1, 19
+; FLATSCR-NEXT:    v_readlane_b32 s81, v1, 18
+; FLATSCR-NEXT:    v_readlane_b32 s80, v1, 17
+; FLATSCR-NEXT:    v_readlane_b32 s71, v1, 16
+; FLATSCR-NEXT:    v_readlane_b32 s70, v1, 15
+; FLATSCR-NEXT:    v_readlane_b32 s69, v1, 14
+; FLATSCR-NEXT:    v_readlane_b32 s68, v1, 13
+; FLATSCR-NEXT:    v_readlane_b32 s67, v1, 12
+; FLATSCR-NEXT:    v_readlane_b32 s66, v1, 11
+; FLATSCR-NEXT:    v_readlane_b32 s65, v1, 10
+; FLATSCR-NEXT:    v_readlane_b32 s64, v1, 9
+; FLATSCR-NEXT:    v_readlane_b32 s55, v1, 8
+; FLATSCR-NEXT:    v_readlane_b32 s54, v1, 7
+; FLATSCR-NEXT:    v_readlane_b32 s53, v1, 6
+; FLATSCR-NEXT:    v_readlane_b32 s52, v1, 5
+; FLATSCR-NEXT:    v_readlane_b32 s51, v1, 4
+; FLATSCR-NEXT:    v_readlane_b32 s50, v1, 3
+; FLATSCR-NEXT:    v_readlane_b32 s49, v1, 2
+; FLATSCR-NEXT:    v_readlane_b32 s48, v1, 1
 ; FLATSCR-NEXT:    v_readlane_b32 s39, v1, 0
 ; FLATSCR-NEXT:    s_mov_b32 s32, s33
 ; FLATSCR-NEXT:    s_xor_saveexec_b64 s[2:3], -1
@@ -1346,7 +964,7 @@ define void @no_unused_non_csr_sgpr_for_fp() #1 {
 ; MUBUF-LABEL: no_unused_non_csr_sgpr_for_fp:
 ; MUBUF:       ; %bb.0:
 ; MUBUF-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; MUBUF-NEXT:    s_mov_b32 vcc_lo, s33
+; MUBUF-NEXT:    s_mov_b32 s40, s33
 ; MUBUF-NEXT:    s_mov_b32 s33, s32
 ; MUBUF-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; MUBUF-NEXT:    buffer_store_dword v1, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
@@ -1365,14 +983,14 @@ define void @no_unused_non_csr_sgpr_for_fp() #1 {
 ; MUBUF-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; MUBUF-NEXT:    buffer_load_dword v1, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
 ; MUBUF-NEXT:    s_mov_b64 exec, s[4:5]
-; MUBUF-NEXT:    s_mov_b32 s33, vcc_lo
+; MUBUF-NEXT:    s_mov_b32 s33, s40
 ; MUBUF-NEXT:    s_waitcnt vmcnt(0)
 ; MUBUF-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; FLATSCR-LABEL: no_unused_non_csr_sgpr_for_fp:
 ; FLATSCR:       ; %bb.0:
 ; FLATSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; FLATSCR-NEXT:    s_mov_b32 vcc_lo, s33
+; FLATSCR-NEXT:    s_mov_b32 s40, s33
 ; FLATSCR-NEXT:    s_mov_b32 s33, s32
 ; FLATSCR-NEXT:    s_xor_saveexec_b64 s[0:1], -1
 ; FLATSCR-NEXT:    scratch_store_dword off, v1, s33 offset:4 ; 4-byte Folded Spill
@@ -1391,7 +1009,7 @@ define void @no_unused_non_csr_sgpr_for_fp() #1 {
 ; FLATSCR-NEXT:    s_xor_saveexec_b64 s[0:1], -1
 ; FLATSCR-NEXT:    scratch_load_dword v1, off, s33 offset:4 ; 4-byte Folded Reload
 ; FLATSCR-NEXT:    s_mov_b64 exec, s[0:1]
-; FLATSCR-NEXT:    s_mov_b32 s33, vcc_lo
+; FLATSCR-NEXT:    s_mov_b32 s33, s40
 ; FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; FLATSCR-NEXT:    s_setpc_b64 s[30:31]
   %alloca = alloca i32, addrspace(5)
@@ -1412,7 +1030,7 @@ define void @no_unused_non_csr_sgpr_for_fp_no_scratch_vgpr() #1 {
 ; MUBUF-LABEL: no_unused_non_csr_sgpr_for_fp_no_scratch_vgpr:
 ; MUBUF:       ; %bb.0:
 ; MUBUF-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; MUBUF-NEXT:    s_mov_b32 vcc_lo, s33
+; MUBUF-NEXT:    s_mov_b32 s40, s33
 ; MUBUF-NEXT:    s_mov_b32 s33, s32
 ; MUBUF-NEXT:    s_or_saveexec_b64 s[4:5], -1
 ; MUBUF-NEXT:    buffer_store_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
@@ -1434,14 +1052,14 @@ define void @no_unused_non_csr_sgpr_for_fp_no_scratch_vgpr() #1 {
 ; MUBUF-NEXT:    s_or_saveexec_b64 s[4:5], -1
 ; MUBUF-NEXT:    buffer_load_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
 ; MUBUF-NEXT:    s_mov_b64 exec, s[4:5]
-; MUBUF-NEXT:    s_mov_b32 s33, vcc_lo
+; MUBUF-NEXT:    s_mov_b32 s33, s40
 ; MUBUF-NEXT:    s_waitcnt vmcnt(0)
 ; MUBUF-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; FLATSCR-LABEL: no_unused_non_csr_sgpr_for_fp_no_scratch_vgpr:
 ; FLATSCR:       ; %bb.0:
 ; FLATSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; FLATSCR-NEXT:    s_mov_b32 vcc_lo, s33
+; FLATSCR-NEXT:    s_mov_b32 s40, s33
 ; FLATSCR-NEXT:    s_mov_b32 s33, s32
 ; FLATSCR-NEXT:    s_or_saveexec_b64 s[0:1], -1
 ; FLATSCR-NEXT:    scratch_store_dword off, v40, s33 offset:4 ; 4-byte Folded Spill
@@ -1463,7 +1081,7 @@ define void @no_unused_non_csr_sgpr_for_fp_no_scratch_vgpr() #1 {
 ; FLATSCR-NEXT:    s_or_saveexec_b64 s[0:1], -1
 ; FLATSCR-NEXT:    scratch_load_dword v40, off, s33 offset:4 ; 4-byte Folded Reload
 ; FLATSCR-NEXT:    s_mov_b64 exec, s[0:1]
-; FLATSCR-NEXT:    s_mov_b32 s33, vcc_lo
+; FLATSCR-NEXT:    s_mov_b32 s33, s40
 ; FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; FLATSCR-NEXT:    s_setpc_b64 s[30:31]
   %alloca = alloca i32, addrspace(5)
@@ -1491,7 +1109,7 @@ define void @scratch_reg_needed_mubuf_offset(ptr addrspace(5) byval([4096 x i8])
 ; MUBUF-LABEL: scratch_reg_needed_mubuf_offset:
 ; MUBUF:       ; %bb.0:
 ; MUBUF-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; MUBUF-NEXT:    s_mov_b32 vcc_lo, s33
+; MUBUF-NEXT:    s_mov_b32 s40, s33
 ; MUBUF-NEXT:    s_mov_b32 s33, s32
 ; MUBUF-NEXT:    s_or_saveexec_b64 s[4:5], -1
 ; MUBUF-NEXT:    s_add_i32 s6, s33, 0x40100
@@ -1517,14 +1135,14 @@ define void @scratch_reg_needed_mubuf_offset(ptr addrspace(5) byval([4096 x i8])
 ; MUBUF-NEXT:    s_add_i32 s6, s33, 0x40100
 ; MUBUF-NEXT:    buffer_load_dword v40, off, s[0:3], s6 ; 4-byte Folded Reload
 ; MUBUF-NEXT:    s_mov_b64 exec, s[4:5]
-; MUBUF-NEXT:    s_mov_b32 s33, vcc_lo
+; MUBUF-NEXT:    s_mov_b32 s33, s40
 ; MUBUF-NEXT:    s_waitcnt vmcnt(0)
 ; MUBUF-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; FLATSCR-LABEL: scratch_reg_needed_mubuf_offset:
 ; FLATSCR:       ; %bb.0:
 ; FLATSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; FLATSCR-NEXT:    s_mov_b32 vcc_lo, s33
+; FLATSCR-NEXT:    s_mov_b32 s40, s33
 ; FLATSCR-NEXT:    s_mov_b32 s33, s32
 ; FLATSCR-NEXT:    s_or_saveexec_b64 s[0:1], -1
 ; FLATSCR-NEXT:    s_add_i32 s2, s33, 0x1004
@@ -1550,7 +1168,7 @@ define void @scratch_reg_needed_mubuf_offset(ptr addrspace(5) byval([4096 x i8])
 ; FLATSCR-NEXT:    s_add_i32 s2, s33, 0x1004
 ; FLATSCR-NEXT:    scratch_load_dword v40, off, s2 ; 4-byte Folded Reload
 ; FLATSCR-NEXT:    s_mov_b64 exec, s[0:1]
-; FLATSCR-NEXT:    s_mov_b32 s33, vcc_lo
+; FLATSCR-NEXT:    s_mov_b32 s33, s40
 ; FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; FLATSCR-NEXT:    s_setpc_b64 s[30:31]
   %alloca = alloca i32, addrspace(5)
@@ -1650,22 +1268,15 @@ define void @callee_need_to_spill_fp_to_memory() #3 {
 ; MUBUF-LABEL: callee_need_to_spill_fp_to_memory:
 ; MUBUF:       ; %bb.0:
 ; MUBUF-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; MUBUF-NEXT:    s_mov_b32 s4, s33
+; MUBUF-NEXT:    s_mov_b32 s40, s33
 ; MUBUF-NEXT:    s_mov_b32 s33, s32
-; MUBUF-NEXT:    v_mov_b32_e32 v0, s4
-; MUBUF-NEXT:    buffer_store_dword v0, off, s[0:3], s33 ; 4-byte Folded Spill
 ; MUBUF-NEXT:    ;;#ASMSTART
 ; MUBUF-NEXT:    ; clobber nonpreserved SGPRs
 ; MUBUF-NEXT:    ;;#ASMEND
 ; MUBUF-NEXT:    ;;#ASMSTART
 ; MUBUF-NEXT:    ; clobber all VGPRs
 ; MUBUF-NEXT:    ;;#ASMEND
-; MUBUF-NEXT:    buffer_load_dword v0, off, s[0:3], s33 ; 4-byte Folded Reload
-; MUBUF-NEXT:    s_addk_i32 s32, 0x200
-; MUBUF-NEXT:    s_mov_b32 s32, s33
-; MUBUF-NEXT:    s_waitcnt vmcnt(0)
-; MUBUF-NEXT:    v_readfirstlane_b32 s4, v0
-; MUBUF-NEXT:    s_mov_b32 s33, s4
+; MUBUF-NEXT:    s_mov_b32 s33, s40
 ; MUBUF-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; FLATSCR-LABEL: callee_need_to_spill_fp_to_memory:
@@ -1707,147 +1318,80 @@ define void @callee_need_to_spill_fp_to_memory_full_reserved_vgpr() #3 {
 ; MUBUF-NEXT:    s_xor_saveexec_b64 s[6:7], -1
 ; MUBUF-NEXT:    buffer_store_dword v39, off, s[0:3], s33 ; 4-byte Folded Spill
 ; MUBUF-NEXT:    s_mov_b64 exec, s[6:7]
+; MUBUF-NEXT:    v_writelane_b32 v39, s4, 32
 ; MUBUF-NEXT:    v_writelane_b32 v39, s39, 0
-; MUBUF-NEXT:    v_writelane_b32 v39, s40, 1
-; MUBUF-NEXT:    v_writelane_b32 v39, s41, 2
-; MUBUF-NEXT:    v_writelane_b32 v39, s42, 3
-; MUBUF-NEXT:    v_writelane_b32 v39, s43, 4
-; MUBUF-NEXT:    v_writelane_b32 v39, s44, 5
-; MUBUF-NEXT:    v_writelane_b32 v39, s45, 6
-; MUBUF-NEXT:    v_writelane_b32 v39, s46, 7
-; MUBUF-NEXT:    v_writelane_b32 v39, s47, 8
-; MUBUF-NEXT:    v_writelane_b32 v39, s48, 9
-; MUBUF-NEXT:    v_writelane_b32 v39, s49, 10
-; MUBUF-NEXT:    v_writelane_b32 v39, s50, 11
-; MUBUF-NEXT:    v_writelane_b32 v39, s51, 12
-; MUBUF-NEXT:    v_writelane_b32 v39, s52, 13
-; MUBUF-NEXT:    v_writelane_b32 v39, s53, 14
-; MUBUF-NEXT:    v_writelane_b32 v39, s54, 15
-; MUBUF-NEXT:    v_writelane_b32 v39, s55, 16
-; MUBUF-NEXT:    v_writelane_b32 v39, s56, 17
-; MUBUF-NEXT:    v_writelane_b32 v39, s57, 18
-; MUBUF-NEXT:    v_writelane_b32 v39, s58, 19
-; MUBUF-NEXT:    v_writelane_b32 v39, s59, 20
-; MUBUF-NEXT:    v_writelane_b32 v39, s60, 21
-; MUBUF-NEXT:    v_writelane_b32 v39, s61, 22
-; MUBUF-NEXT:    v_writelane_b32 v39, s62, 23
-; MUBUF-NEXT:    v_writelane_b32 v39, s63, 24
-; MUBUF-NEXT:    v_writelane_b32 v39, s64, 25
-; MUBUF-NEXT:    v_writelane_b32 v39, s65, 26
-; MUBUF-NEXT:    v_writelane_b32 v39, s66, 27
-; MUBUF-NEXT:    v_writelane_b32 v39, s67, 28
-; MUBUF-NEXT:    v_writelane_b32 v39, s68, 29
-; MUBUF-NEXT:    v_writelane_b32 v39, s69, 30
-; MUBUF-NEXT:    v_writelane_b32 v39, s70, 31
-; MUBUF-NEXT:    v_writelane_b32 v39, s71, 32
-; MUBUF-NEXT:    v_writelane_b32 v39, s72, 33
-; MUBUF-NEXT:    v_writelane_b32 v39, s73, 34
-; MUBUF-NEXT:    v_writelane_b32 v39, s74, 35
-; MUBUF-NEXT:    v_writelane_b32 v39, s75, 36
-; MUBUF-NEXT:    v_writelane_b32 v39, s76, 37
-; MUBUF-NEXT:    v_writelane_b32 v39, s77, 38
-; MUBUF-NEXT:    v_writelane_b32 v39, s78, 39
-; MUBUF-NEXT:    v_writelane_b32 v39, s79, 40
-; MUBUF-NEXT:    v_writelane_b32 v39, s80, 41
-; MUBUF-NEXT:    v_writelane_b32 v39, s81, 42
-; MUBUF-NEXT:    v_writelane_b32 v39, s82, 43
-; MUBUF-NEXT:    v_writelane_b32 v39, s83, 44
-; MUBUF-NEXT:    v_writelane_b32 v39, s84, 45
-; MUBUF-NEXT:    v_writelane_b32 v39, s85, 46
-; MUBUF-NEXT:    v_writelane_b32 v39, s86, 47
-; MUBUF-NEXT:    v_writelane_b32 v39, s87, 48
-; MUBUF-NEXT:    v_writelane_b32 v39, s88, 49
-; MUBUF-NEXT:    v_writelane_b32 v39, s89, 50
-; MUBUF-NEXT:    v_writelane_b32 v39, s90, 51
-; MUBUF-NEXT:    v_writelane_b32 v39, s91, 52
-; MUBUF-NEXT:    v_writelane_b32 v39, s92, 53
-; MUBUF-NEXT:    v_writelane_b32 v39, s93, 54
-; MUBUF-NEXT:    v_writelane_b32 v39, s94, 55
-; MUBUF-NEXT:    v_writelane_b32 v39, s95, 56
-; MUBUF-NEXT:    v_writelane_b32 v39, s96, 57
-; MUBUF-NEXT:    v_writelane_b32 v39, s97, 58
-; MUBUF-NEXT:    v_writelane_b32 v39, s98, 59
-; MUBUF-NEXT:    v_writelane_b32 v39, s99, 60
-; MUBUF-NEXT:    v_writelane_b32 v39, s100, 61
-; MUBUF-NEXT:    v_mov_b32_e32 v0, s4
-; MUBUF-NEXT:    v_writelane_b32 v39, s101, 62
-; MUBUF-NEXT:    buffer_store_dword v0, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
-; MUBUF-NEXT:    v_writelane_b32 v39, s102, 63
+; MUBUF-NEXT:    v_writelane_b32 v39, s48, 1
+; MUBUF-NEXT:    v_writelane_b32 v39, s49, 2
+; MUBUF-NEXT:    v_writelane_b32 v39, s50, 3
+; MUBUF-NEXT:    v_writelane_b32 v39, s51, 4
+; MUBUF-NEXT:    v_writelane_b32 v39, s52, 5
+; MUBUF-NEXT:    v_writelane_b32 v39, s53, 6
+; MUBUF-NEXT:    v_writelane_b32 v39, s54, 7
+; MUBUF-NEXT:    v_writelane_b32 v39, s55, 8
+; MUBUF-NEXT:    v_writelane_b32 v39, s64, 9
+; MUBUF-NEXT:    v_writelane_b32 v39, s65, 10
+; MUBUF-NEXT:    v_writelane_b32 v39, s66, 11
+; MUBUF-NEXT:    v_writelane_b32 v39, s67, 12
+; MUBUF-NEXT:    v_writelane_b32 v39, s68, 13
+; MUBUF-NEXT:    v_writelane_b32 v39, s69, 14
+; MUBUF-NEXT:    v_writelane_b32 v39, s70, 15
+; MUBUF-NEXT:    v_writelane_b32 v39, s71, 16
+; MUBUF-NEXT:    v_writelane_b32 v39, s80, 17
+; MUBUF-NEXT:    v_writelane_b32 v39, s81, 18
+; MUBUF-NEXT:    v_writelane_b32 v39, s82, 19
+; MUBUF-NEXT:    v_writelane_b32 v39, s83, 20
+; MUBUF-NEXT:    v_writelane_b32 v39, s84, 21
+; MUBUF-NEXT:    v_writelane_b32 v39, s85, 22
+; MUBUF-NEXT:    v_writelane_b32 v39, s86, 23
+; MUBUF-NEXT:    v_writelane_b32 v39, s87, 24
+; MUBUF-NEXT:    v_writelane_b32 v39, s96, 25
+; MUBUF-NEXT:    v_writelane_b32 v39, s97, 26
+; MUBUF-NEXT:    v_writelane_b32 v39, s98, 27
+; MUBUF-NEXT:    v_writelane_b32 v39, s99, 28
+; MUBUF-NEXT:    v_writelane_b32 v39, s100, 29
+; MUBUF-NEXT:    v_writelane_b32 v39, s101, 30
+; MUBUF-NEXT:    s_addk_i32 s32, 0x200
+; MUBUF-NEXT:    v_writelane_b32 v39, s102, 31
 ; MUBUF-NEXT:    ;;#ASMSTART
 ; MUBUF-NEXT:    ; clobber nonpreserved SGPRs and 64 CSRs
 ; MUBUF-NEXT:    ;;#ASMEND
 ; MUBUF-NEXT:    ;;#ASMSTART
 ; MUBUF-NEXT:    ; clobber all VGPRs except CSR v40
 ; MUBUF-NEXT:    ;;#ASMEND
-; MUBUF-NEXT:    buffer_load_dword v0, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
-; MUBUF-NEXT:    s_addk_i32 s32, 0x300
-; MUBUF-NEXT:    v_readlane_b32 s102, v39, 63
-; MUBUF-NEXT:    v_readlane_b32 s101, v39, 62
-; MUBUF-NEXT:    v_readlane_b32 s100, v39, 61
-; MUBUF-NEXT:    v_readlane_b32 s99, v39, 60
-; MUBUF-NEXT:    v_readlane_b32 s98, v39, 59
-; MUBUF-NEXT:    v_readlane_b32 s97, v39, 58
-; MUBUF-NEXT:    v_readlane_b32 s96, v39, 57
-; MUBUF-NEXT:    v_readlane_b32 s95, v39, 56
-; MUBUF-NEXT:    v_readlane_b32 s94, v39, 55
-; MUBUF-NEXT:    v_readlane_b32 s93, v39, 54
-; MUBUF-NEXT:    v_readlane_b32 s92, v39, 53
-; MUBUF-NEXT:    v_readlane_b32 s91, v39, 52
-; MUBUF-NEXT:    v_readlane_b32 s90, v39, 51
-; MUBUF-NEXT:    v_readlane_b32 s89, v39, 50
-; MUBUF-NEXT:    v_readlane_b32 s88, v39, 49
-; MUBUF-NEXT:    v_readlane_b32 s87, v39, 48
-; MUBUF-NEXT:    v_readlane_b32 s86, v39, 47
-; MUBUF-NEXT:    v_readlane_b32 s85, v39, 46
-; MUBUF-NEXT:    v_readlane_b32 s84, v39, 45
-; MUBUF-NEXT:    v_readlane_b32 s83, v39, 44
-; MUBUF-NEXT:    v_readlane_b32 s82, v39, 43
-; MUBUF-NEXT:    v_readlane_b32 s81, v39, 42
-; MUBUF-NEXT:    v_readlane_b32 s80, v39, 41
-; MUBUF-NEXT:    v_readlane_b32 s79, v39, 40
-; MUBUF-NEXT:    v_readlane_b32 s78, v39, 39
-; MUBUF-NEXT:    v_readlane_b32 s77, v39, 38
-; MUBUF-NEXT:    v_readlane_b32 s76, v39, 37
-; MUBUF-NEXT:    v_readlane_b32 s75, v39, 36
-; MUBUF-NEXT:    v_readlane_b32 s74, v39, 35
-; MUBUF-NEXT:    v_readlane_b32 s73, v39, 34
-; MUBUF-NEXT:    v_readlane_b32 s72, v39, 33
-; MUBUF-NEXT:    v_readlane_b32 s71, v39, 32
-; MUBUF-NEXT:    v_readlane_b32 s70, v39, 31
-; MUBUF-NEXT:    v_readlane_b32 s69, v39, 30
-; MUBUF-NEXT:    v_readlane_b32 s68, v39, 29
-; MUBUF-NEXT:    v_readlane_b32 s67, v39, 28
-; MUBUF-NEXT:    v_readlane_b32 s66, v39, 27
-; MUBUF-NEXT:    v_readlane_b32 s65, v39, 26
-; MUBUF-NEXT:    v_readlane_b32 s64, v39, 25
-; MUBUF-NEXT:    v_readlane_b32 s63, v39, 24
-; MUBUF-NEXT:    v_readlane_b32 s62, v39, 23
-; MUBUF-NEXT:    v_readlane_b32 s61, v39, 22
-; MUBUF-NEXT:    v_readlane_b32 s60, v39, 21
-; MUBUF-NEXT:    v_readlane_b32 s59, v39, 20
-; MUBUF-NEXT:    v_readlane_b32 s58, v39, 19
-; MUBUF-NEXT:    v_readlane_b32 s57, v39, 18
-; MUBUF-NEXT:    v_readlane_b32 s56, v39, 17
-; MUBUF-NEXT:    v_readlane_b32 s55, v39, 16
-; MUBUF-NEXT:    v_readlane_b32 s54, v39, 15
-; MUBUF-NEXT:    v_readlane_b32 s53, v39, 14
-; MUBUF-NEXT:    v_readlane_b32 s52, v39, 13
-; MUBUF-NEXT:    v_readlane_b32 s51, v39, 12
-; MUBUF-NEXT:    v_readlane_b32 s50, v39, 11
-; MUBUF-NEXT:    v_readlane_b32 s49, v39, 10
-; MUBUF-NEXT:    v_readlane_b32 s48, v39, 9
-; MUBUF-NEXT:    v_readlane_b32 s47, v39, 8
-; MUBUF-NEXT:    v_readlane_b32 s46, v39, 7
-; MUBUF-NEXT:    v_readlane_b32 s45, v39, 6
-; MUBUF-NEXT:    v_readlane_b32 s44, v39, 5
-; MUBUF-NEXT:    v_readlane_b32 s43, v39, 4
-; MUBUF-NEXT:    v_readlane_b32 s42, v39, 3
-; MUBUF-NEXT:    v_readlane_b32 s41, v39, 2
-; MUBUF-NEXT:    v_readlane_b32 s40, v39, 1
+; MUBUF-NEXT:    v_readlane_b32 s102, v39, 31
+; MUBUF-NEXT:    v_readlane_b32 s101, v39, 30
+; MUBUF-NEXT:    v_readlane_b32 s100, v39, 29
+; MUBUF-NEXT:    v_readlane_b32 s99, v39, 28
+; MUBUF-NEXT:    v_readlane_b32 s98, v39, 27
+; MUBUF-NEXT:    v_readlane_b32 s97, v39, 26
+; MUBUF-NEXT:    v_readlane_b32 s96, v39, 25
+; MUBUF-NEXT:    v_readlane_b32 s87, v39, 24
+; MUBUF-NEXT:    v_readlane_b32 s86, v39, 23
+; MUBUF-NEXT:    v_readlane_b32 s85, v39, 22
+; MUBUF-NEXT:    v_readlane_b32 s84, v39, 21
+; MUBUF-NEXT:    v_readlane_b32 s83, v39, 20
+; MUBUF-NEXT:    v_readlane_b32 s82, v39, 19
+; MUBUF-NEXT:    v_readlane_b32 s81, v39, 18
+; MUBUF-NEXT:    v_readlane_b32 s80, v39, 17
+; MUBUF-NEXT:    v_readlane_b32 s71, v39, 16
+; MUBUF-NEXT:    v_readlane_b32 s70, v39, 15
+; MUBUF-NEXT:    v_readlane_b32 s69, v39, 14
+; MUBUF-NEXT:    v_readlane_b32 s68, v39, 13
+; MUBUF-NEXT:    v_readlane_b32 s67, v39, 12
+; MUBUF-NEXT:    v_readlane_b32 s66, v39, 11
+; MUBUF-NEXT:    v_readlane_b32 s65, v39, 10
+; MUBUF-NEXT:    v_readlane_b32 s64, v39, 9
+; MUBUF-NEXT:    v_readlane_b32 s55, v39, 8
+; MUBUF-NEXT:    v_readlane_b32 s54, v39, 7
+; MUBUF-NEXT:    v_readlane_b32 s53, v39, 6
+; MUBUF-NEXT:    v_readlane_b32 s52, v39, 5
+; MUBUF-NEXT:    v_readlane_b32 s51, v39, 4
+; MUBUF-NEXT:    v_readlane_b32 s50, v39, 3
+; MUBUF-NEXT:    v_readlane_b32 s49, v39, 2
+; MUBUF-NEXT:    v_readlane_b32 s48, v39, 1
 ; MUBUF-NEXT:    v_readlane_b32 s39, v39, 0
 ; MUBUF-NEXT:    s_mov_b32 s32, s33
-; MUBUF-NEXT:    s_waitcnt vmcnt(0)
-; MUBUF-NEXT:    v_readfirstlane_b32 s4, v0
+; MUBUF-NEXT:    v_readlane_b32 s4, v39, 32
 ; MUBUF-NEXT:    s_xor_saveexec_b64 s[6:7], -1
 ; MUBUF-NEXT:    buffer_load_dword v39, off, s[0:3], s33 ; 4-byte Folded Reload
 ; MUBUF-NEXT:    s_mov_b64 exec, s[6:7]
@@ -1864,139 +1408,75 @@ define void @callee_need_to_spill_fp_to_memory_full_reserved_vgpr() #3 {
 ; FLATSCR-NEXT:    scratch_store_dword off, v39, s33 ; 4-byte Folded Spill
 ; FLATSCR-NEXT:    s_mov_b64 exec, s[2:3]
 ; FLATSCR-NEXT:    v_writelane_b32 v39, s39, 0
-; FLATSCR-NEXT:    v_writelane_b32 v39, s40, 1
-; FLATSCR-NEXT:    v_writelane_b32 v39, s41, 2
-; FLATSCR-NEXT:    v_writelane_b32 v39, s42, 3
-; FLATSCR-NEXT:    v_writelane_b32 v39, s43, 4
-; FLATSCR-NEXT:    v_writelane_b32 v39, s44, 5
-; FLATSCR-NEXT:    v_writelane_b32 v39, s45, 6
-; FLATSCR-NEXT:    v_writelane_b32 v39, s46, 7
-; FLATSCR-NEXT:    v_writelane_b32 v39, s47, 8
-; FLATSCR-NEXT:    v_writelane_b32 v39, s48, 9
-; FLATSCR-NEXT:    v_writelane_b32 v39, s49, 10
-; FLATSCR-NEXT:    v_writelane_b32 v39, s50, 11
-; FLATSCR-NEXT:    v_writelane_b32 v39, s51, 12
-; FLATSCR-NEXT:    v_writelane_b32 v39, s52, 13
-; FLATSCR-NEXT:    v_writelane_b32 v39, s53, 14
-; FLATSCR-NEXT:    v_writelane_b32 v39, s54, 15
-; FLATSCR-NEXT:    v_writelane_b32 v39, s55, 16
-; FLATSCR-NEXT:    v_writelane_b32 v39, s56, 17
-; FLATSCR-NEXT:    v_writelane_b32 v39, s57, 18
-; FLATSCR-NEXT:    v_writelane_b32 v39, s58, 19
-; FLATSCR-NEXT:    v_writelane_b32 v39, s59, 20
-; FLATSCR-NEXT:    v_writelane_b32 v39, s60, 21
-; FLATSCR-NEXT:    v_writelane_b32 v39, s61, 22
-; FLATSCR-NEXT:    v_writelane_b32 v39, s62, 23
-; FLATSCR-NEXT:    v_writelane_b32 v39, s63, 24
-; FLATSCR-NEXT:    v_writelane_b32 v39, s64, 25
-; FLATSCR-NEXT:    v_writelane_b32 v39, s65, 26
-; FLATSCR-NEXT:    v_writelane_b32 v39, s66, 27
-; FLATSCR-NEXT:    v_writelane_b32 v39, s67, 28
-; FLATSCR-NEXT:    v_writelane_b32 v39, s68, 29
-; FLATSCR-NEXT:    v_writelane_b32 v39, s69, 30
-; FLATSCR-NEXT:    v_writelane_b32 v39, s70, 31
-; FLATSCR-NEXT:    v_writelane_b32 v39, s71, 32
-; FLATSCR-NEXT:    v_writelane_b32 v39, s72, 33
-; FLATSCR-NEXT:    v_writelane_b32 v39, s73, 34
-; FLATSCR-NEXT:    v_writelane_b32 v39, s74, 35
-; FLATSCR-NEXT:    v_writelane_b32 v39, s75, 36
-; FLATSCR-NEXT:    v_writelane_b32 v39, s76, 37
-; FLATSCR-NEXT:    v_writelane_b32 v39, s77, 38
-; FLATSCR-NEXT:    v_writelane_b32 v39, s78, 39
-; FLATSCR-NEXT:    v_writelane_b32 v39, s79, 40
-; FLATSCR-NEXT:    v_writelane_b32 v39, s80, 41
-; FLATSCR-NEXT:    v_writelane_b32 v39, s81, 42
-; FLATSCR-NEXT:    v_writelane_b32 v39, s82, 43
-; FLATSCR-NEXT:    v_writelane_b32 v39, s83, 44
-; FLATSCR-NEXT:    v_writelane_b32 v39, s84, 45
-; FLATSCR-NEXT:    v_writelane_b32 v39, s85, 46
-; FLATSCR-NEXT:    v_writelane_b32 v39, s86, 47
-; FLATSCR-NEXT:    v_writelane_b32 v39, s87, 48
-; FLATSCR-NEXT:    v_writelane_b32 v39, s88, 49
-; FLATSCR-NEXT:    v_writelane_b32 v39, s89, 50
-; FLATSCR-NEXT:    v_writelane_b32 v39, s90, 51
-; FLATSCR-NEXT:    v_writelane_b32 v39, s91, 52
-; FLATSCR-NEXT:    v_writelane_b32 v39, s92, 53
-; FLATSCR-NEXT:    v_writelane_b32 v39, s93, 54
-; FLATSCR-NEXT:    v_writelane_b32 v39, s94, 55
-; FLATSCR-NEXT:    v_writelane_b32 v39, s95, 56
-; FLATSCR-NEXT:    v_writelane_b32 v39, s96, 57
-; FLATSCR-NEXT:    v_writelane_b32 v39, s97, 58
-; FLATSCR-NEXT:    v_writelane_b32 v39, s98, 59
-; FLATSCR-NEXT:    v_writelane_b32 v39, s99, 60
-; FLATSCR-NEXT:    v_writelane_b32 v39, s100, 61
-; FLATSCR-NEXT:    v_writelane_b32 v39, s101, 62
+; FLATSCR-NEXT:    v_writelane_b32 v39, s48, 1
+; FLATSCR-NEXT:    v_writelane_b32 v39, s49, 2
+; FLATSCR-NEXT:    v_writelane_b32 v39, s50, 3
+; FLATSCR-NEXT:    v_writelane_b32 v39, s51, 4
+; FLATSCR-NEXT:    v_writelane_b32 v39, s52, 5
+; FLATSCR-NEXT:    v_writelane_b32 v39, s53, 6
+; FLATSCR-NEXT:    v_writelane_b32 v39, s54, 7
+; FLATSCR-NEXT:    v_writelane_b32 v39, s55, 8
+; FLATSCR-NEXT:    v_writelane_b32 v39, s64, 9
+; FLATSCR-NEXT:    v_writelane_b32 v39, s65, 10
+; FLATSCR-NEXT:    v_writelane_b32 v39, s66, 11
+; FLATSCR-NEXT:    v_writelane_b32 v39, s67, 12
+; FLATSCR-NEXT:    v_writelane_b32 v39, s68, 13
+; FLATSCR-NEXT:    v_writelane_b32 v39, s69, 14
+; FLATSCR-NEXT:    v_writelane_b32 v39, s70, 15
+; FLATSCR-NEXT:    v_writelane_b32 v39, s71, 16
+; FLATSCR-NEXT:    v_writelane_b32 v39, s80, 17
+; FLATSCR-NEXT:    v_writelane_b32 v39, s81, 18
+; FLATSCR-NEXT:    v_writelane_b32 v39, s82, 19
+; FLATSCR-NEXT:    v_writelane_b32 v39, s83, 20
+; FLATSCR-NEXT:    v_writelane_b32 v39, s84, 21
+; FLATSCR-NEXT:    v_writelane_b32 v39, s85, 22
+; FLATSCR-NEXT:    v_writelane_b32 v39, s86, 23
+; FLATSCR-NEXT:    v_writelane_b32 v39, s87, 24
+; FLATSCR-NEXT:    v_writelane_b32 v39, s96, 25
+; FLATSCR-NEXT:    v_writelane_b32 v39, s97, 26
+; FLATSCR-NEXT:    v_writelane_b32 v39, s98, 27
+; FLATSCR-NEXT:    v_writelane_b32 v39, s99, 28
+; FLATSCR-NEXT:    v_writelane_b32 v39, s100, 29
+; FLATSCR-NEXT:    v_writelane_b32 v39, s101, 30
 ; FLATSCR-NEXT:    s_add_i32 s32, s32, 8
-; FLATSCR-NEXT:    v_writelane_b32 v39, s102, 63
+; FLATSCR-NEXT:    v_writelane_b32 v39, s102, 31
 ; FLATSCR-NEXT:    ;;#ASMSTART
 ; FLATSCR-NEXT:    ; clobber nonpreserved SGPRs and 64 CSRs
 ; FLATSCR-NEXT:    ;;#ASMEND
 ; FLATSCR-NEXT:    ;;#ASMSTART
 ; FLATSCR-NEXT:    ; clobber all VGPRs except CSR v40
 ; FLATSCR-NEXT:    ;;#ASMEND
-; FLATSCR-NEXT:    v_readlane_b32 s102, v39, 63
-; FLATSCR-NEXT:    v_readlane_b32 s101, v39, 62
-; FLATSCR-NEXT:    v_readlane_b32 s100, v39, 61
-; FLATSCR-NEXT:    v_readlane_b32 s99, v39, 60
-; FLATSCR-NEXT:    v_readlane_b32 s98, v39, 59
-; FLATSCR-NEXT:    v_readlane_b32 s97, v39, 58
-; FLATSCR-NEXT:    v_readlane_b32 s96, v39, 57
-; FLATSCR-NEXT:    v_readlane_b32 s95, v39, 56
-; FLATSCR-NEXT:    v_readlane_b32 s94, v39, 55
-; FLATSCR-NEXT:    v_readlane_b32 s93, v39, 54
-; FLATSCR-NEXT:    v_readlane_b32 s92, v39, 53
-; FLATSCR-NEXT:    v_readlane_b32 s91, v39, 52
-; FLATSCR-NEXT:    v_readlane_b32 s90, v39, 51
-; FLATSCR-NEXT:    v_readlane_b32 s89, v39, 50
-; FLATSCR-NEXT:    v_readlane_b32 s88, v39, 49
-; FLATSCR-NEXT:    v_readlane_b32 s87, v39, 48
-; FLATSCR-NEXT:    v_readlane_b32 s86, v39, 47
-; FLATSCR-NEXT:    v_readlane_b32 s85, v39, 46
-; FLATSCR-NEXT:    v_readlane_b32 s84, v39, 45
-; FLATSCR-NEXT:    v_readlane_b32 s83, v39, 44
-; FLATSCR-NEXT:    v_readlane_b32 s82, v39, 43
-; FLATSCR-NEXT:    v_readlane_b32 s81, v39, 42
-; FLATSCR-NEXT:    v_readlane_b32 s80, v39, 41
-; FLATSCR-NEXT:    v_readlane_b32 s79, v39, 40
-; FLATSCR-NEXT:    v_readlane_b32 s78, v39, 39
-; FLATSCR-NEXT:    v_readlane_b32 s77, v39, 38
-; FLATSCR-NEXT:    v_readlane_b32 s76, v39, 37
-; FLATSCR-NEXT:    v_readlane_b32 s75, v39, 36
-; FLATSCR-NEXT:    v_readlane_b32 s74, v39, 35
-; FLATSCR-NEXT:    v_readlane_b32 s73, v39, 34
-; FLATSCR-NEXT:    v_readlane_b32 s72, v39, 33
-; FLATSCR-NEXT:    v_readlane_b32 s71, v39, 32
-; FLATSCR-NEXT:    v_readlane_b32 s70, v39, 31
-; FLATSCR-NEXT:    v_readlane_b32 s69, v39, 30
-; FLATSCR-NEXT:    v_readlane_b32 s68, v39, 29
-; FLATSCR-NEXT:    v_readlane_b32 s67, v39, 28
-; FLATSCR-NEXT:    v_readlane_b32 s66, v39, 27
-; FLATSCR-NEXT:    v_readlane_b32 s65, v39, 26
-; FLATSCR-NEXT:    v_readlane_b32 s64, v39, 25
-; FLATSCR-NEXT:    v_readlane_b32 s63, v39, 24
-; FLATSCR-NEXT:    v_readlane_b32 s62, v39, 23
-; FLATSCR-NEXT:    v_readlane_b32 s61, v39, 22
-; FLATSCR-NEXT:    v_readlane_b32 s60, v39, 21
-; FLATSCR-NEXT:    v_readlane_b32 s59, v39, 20
-; FLATSCR-NEXT:    v_readlane_b32 s58, v39, 19
-; FLATSCR-NEXT:    v_readlane_b32 s57, v39, 18
-; FLATSCR-NEXT:    v_readlane_b32 s56, v39, 17
-; FLATSCR-NEXT:    v_readlane_b32 s55, v39, 16
-; FLATSCR-NEXT:    v_readlane_b32 s54, v39, 15
-; FLATSCR-NEXT:    v_readlane_b32 s53, v39, 14
-; FLATSCR-NEXT:    v_readlane_b32 s52, v39, 13
-; FLATSCR-NEXT:    v_readlane_b32 s51, v39, 12
-; FLATSCR-NEXT:    v_readlane_b32 s50, v39, 11
-; FLATSCR-NEXT:    v_readlane_b32 s49, v39, 10
-; FLATSCR-NEXT:    v_readlane_b32 s48, v39, 9
-; FLATSCR-NEXT:    v_readlane_b32 s47, v39, 8
-; FLATSCR-NEXT:    v_readlane_b32 s46, v39, 7
-; FLATSCR-NEXT:    v_readlane_b32 s45, v39, 6
-; FLATSCR-NEXT:    v_readlane_b32 s44, v39, 5
-; FLATSCR-NEXT:    v_readlane_b32 s43, v39, 4
-; FLATSCR-NEXT:    v_readlane_b32 s42, v39, 3
-; FLATSCR-NEXT:    v_readlane_b32 s41, v39, 2
-; FLATSCR-NEXT:    v_readlane_b32 s40, v39, 1
+; FLATSCR-NEXT:    v_readlane_b32 s102, v39, 31
+; FLATSCR-NEXT:    v_readlane_b32 s101, v39, 30
+; FLATSCR-NEXT:    v_readlane_b32 s100, v39, 29
+; FLATSCR-NEXT:    v_readlane_b32 s99, v39, 28
+; FLATSCR-NEXT:    v_readlane_b32 s98, v39, 27
+; FLATSCR-NEXT:    v_readlane_b32 s97, v39, 26
+; FLATSCR-NEXT:    v_readlane_b32 s96, v39, 25
+; FLATSCR-NEXT:    v_readlane_b32 s87, v39, 24
+; FLATSCR-NEXT:    v_readlane_b32 s86, v39, 23
+; FLATSCR-NEXT:    v_readlane_b32 s85, v39, 22
+; FLATSCR-NEXT:    v_readlane_b32 s84, v39, 21
+; FLATSCR-NEXT:    v_readlane_b32 s83, v39, 20
+; FLATSCR-NEXT:    v_readlane_b32 s82, v39, 19
+; FLATSCR-NEXT:    v_readlane_b32 s81, v39, 18
+; FLATSCR-NEXT:    v_readlane_b32 s80, v39, 17
+; FLATSCR-NEXT:    v_readlane_b32 s71, v39, 16
+; FLATSCR-NEXT:    v_readlane_b32 s70, v39, 15
+; FLATSCR-NEXT:    v_readlane_b32 s69, v39, 14
+; FLATSCR-NEXT:    v_readlane_b32 s68, v39, 13
+; FLATSCR-NEXT:    v_readlane_b32 s67, v39, 12
+; FLATSCR-NEXT:    v_readlane_b32 s66, v39, 11
+; FLATSCR-NEXT:    v_readlane_b32 s65, v39, 10
+; FLATSCR-NEXT:    v_readlane_b32 s64, v39, 9
+; FLATSCR-NEXT:    v_readlane_b32 s55, v39, 8
+; FLATSCR-NEXT:    v_readlane_b32 s54, v39, 7
+; FLATSCR-NEXT:    v_readlane_b32 s53, v39, 6
+; FLATSCR-NEXT:    v_readlane_b32 s52, v39, 5
+; FLATSCR-NEXT:    v_readlane_b32 s51, v39, 4
+; FLATSCR-NEXT:    v_readlane_b32 s50, v39, 3
+; FLATSCR-NEXT:    v_readlane_b32 s49, v39, 2
+; FLATSCR-NEXT:    v_readlane_b32 s48, v39, 1
 ; FLATSCR-NEXT:    v_readlane_b32 s39, v39, 0
 ; FLATSCR-NEXT:    s_mov_b32 s32, s33
 ; FLATSCR-NEXT:    s_xor_saveexec_b64 s[2:3], -1
@@ -2037,149 +1517,83 @@ define void @callee_need_to_spill_fp_to_reg() #1 {
 ; MUBUF-NEXT:    s_mov_b32 s33, s32
 ; MUBUF-NEXT:    s_or_saveexec_b64 s[6:7], -1
 ; MUBUF-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
-; MUBUF-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
 ; MUBUF-NEXT:    s_mov_b64 exec, s[6:7]
+; MUBUF-NEXT:    v_writelane_b32 v40, s4, 32
 ; MUBUF-NEXT:    v_writelane_b32 v40, s39, 0
-; MUBUF-NEXT:    v_writelane_b32 v40, s40, 1
-; MUBUF-NEXT:    v_writelane_b32 v40, s41, 2
-; MUBUF-NEXT:    v_writelane_b32 v40, s42, 3
-; MUBUF-NEXT:    v_writelane_b32 v40, s43, 4
-; MUBUF-NEXT:    v_writelane_b32 v40, s44, 5
-; MUBUF-NEXT:    v_writelane_b32 v40, s45, 6
-; MUBUF-NEXT:    v_writelane_b32 v40, s46, 7
-; MUBUF-NEXT:    v_writelane_b32 v40, s47, 8
-; MUBUF-NEXT:    v_writelane_b32 v40, s48, 9
-; MUBUF-NEXT:    v_writelane_b32 v40, s49, 10
-; MUBUF-NEXT:    v_writelane_b32 v40, s50, 11
-; MUBUF-NEXT:    v_writelane_b32 v40, s51, 12
-; MUBUF-NEXT:    v_writelane_b32 v40, s52, 13
-; MUBUF-NEXT:    v_writelane_b32 v40, s53, 14
-; MUBUF-NEXT:    v_writelane_b32 v40, s54, 15
-; MUBUF-NEXT:    v_writelane_b32 v40, s55, 16
-; MUBUF-NEXT:    v_writelane_b32 v40, s56, 17
-; MUBUF-NEXT:    v_writelane_b32 v40, s57, 18
-; MUBUF-NEXT:    v_writelane_b32 v40, s58, 19
-; MUBUF-NEXT:    v_writelane_b32 v40, s59, 20
-; MUBUF-NEXT:    v_writelane_b32 v40, s60, 21
-; MUBUF-NEXT:    v_writelane_b32 v40, s61, 22
-; MUBUF-NEXT:    v_writelane_b32 v40, s62, 23
-; MUBUF-NEXT:    v_writelane_b32 v40, s63, 24
-; MUBUF-NEXT:    v_writelane_b32 v40, s64, 25
-; MUBUF-NEXT:    v_writelane_b32 v40, s65, 26
-; MUBUF-NEXT:    v_writelane_b32 v40, s66, 27
-; MUBUF-NEXT:    v_writelane_b32 v40, s67, 28
-; MUBUF-NEXT:    v_writelane_b32 v40, s68, 29
-; MUBUF-NEXT:    v_writelane_b32 v40, s69, 30
-; MUBUF-NEXT:    v_writelane_b32 v40, s70, 31
-; MUBUF-NEXT:    v_writelane_b32 v40, s71, 32
-; MUBUF-NEXT:    v_writelane_b32 v40, s72, 33
-; MUBUF-NEXT:    v_writelane_b32 v40, s73, 34
-; MUBUF-NEXT:    v_writelane_b32 v40, s74, 35
-; MUBUF-NEXT:    v_writelane_b32 v40, s75, 36
-; MUBUF-NEXT:    v_writelane_b32 v40, s76, 37
-; MUBUF-NEXT:    v_writelane_b32 v40, s77, 38
-; MUBUF-NEXT:    v_writelane_b32 v40, s78, 39
-; MUBUF-NEXT:    v_writelane_b32 v40, s79, 40
-; MUBUF-NEXT:    v_writelane_b32 v40, s80, 41
-; MUBUF-NEXT:    v_writelane_b32 v40, s81, 42
-; MUBUF-NEXT:    v_writelane_b32 v40, s82, 43
-; MUBUF-NEXT:    v_writelane_b32 v40, s83, 44
-; MUBUF-NEXT:    v_writelane_b32 v40, s84, 45
-; MUBUF-NEXT:    v_writelane_b32 v40, s85, 46
-; MUBUF-NEXT:    v_writelane_b32 v40, s86, 47
-; MUBUF-NEXT:    v_writelane_b32 v40, s87, 48
-; MUBUF-NEXT:    v_writelane_b32 v40, s88, 49
-; MUBUF-NEXT:    v_writelane_b32 v40, s89, 50
-; MUBUF-NEXT:    v_writelane_b32 v40, s90, 51
-; MUBUF-NEXT:    v_writelane_b32 v40, s91, 52
-; MUBUF-NEXT:    v_writelane_b32 v40, s92, 53
-; MUBUF-NEXT:    v_writelane_b32 v40, s93, 54
-; MUBUF-NEXT:    v_writelane_b32 v40, s94, 55
-; MUBUF-NEXT:    v_writelane_b32 v40, s95, 56
-; MUBUF-NEXT:    v_writelane_b32 v40, s96, 57
-; MUBUF-NEXT:    v_writelane_b32 v40, s97, 58
-; MUBUF-NEXT:    v_writelane_b32 v40, s98, 59
-; MUBUF-NEXT:    v_writelane_b32 v40, s99, 60
-; MUBUF-NEXT:    v_writelane_b32 v40, s100, 61
-; MUBUF-NEXT:    v_writelane_b32 v40, s101, 62
-; MUBUF-NEXT:    v_writelane_b32 v41, s4, 0
-; MUBUF-NEXT:    s_addk_i32 s32, 0x300
-; MUBUF-NEXT:    v_writelane_b32 v40, s102, 63
+; MUBUF-NEXT:    v_writelane_b32 v40, s48, 1
+; MUBUF-NEXT:    v_writelane_b32 v40, s49, 2
+; MUBUF-NEXT:    v_writelane_b32 v40, s50, 3
+; MUBUF-NEXT:    v_writelane_b32 v40, s51, 4
+; MUBUF-NEXT:    v_writelane_b32 v40, s52, 5
+; MUBUF-NEXT:    v_writelane_b32 v40, s53, 6
+; MUBUF-NEXT:    v_writelane_b32 v40, s54, 7
+; MUBUF-NEXT:    v_writelane_b32 v40, s55, 8
+; MUBUF-NEXT:    v_writelane_b32 v40, s64, 9
+; MUBUF-NEXT:    v_writelane_b32 v40, s65, 10
+; MUBUF-NEXT:    v_writelane_b32 v40, s66, 11
+; MUBUF-NEXT:    v_writelane_b32 v40, s67, 12
+; MUBUF-NEXT:    v_writelane_b32 v40, s68, 13
+; MUBUF-NEXT:    v_writelane_b32 v40, s69, 14
+; MUBUF-NEXT:    v_writelane_b32 v40, s70, 15
+; MUBUF-NEXT:    v_writelane_b32 v40, s71, 16
+; MUBUF-NEXT:    v_writelane_b32 v40, s80, 17
+; MUBUF-NEXT:    v_writelane_b32 v40, s81, 18
+; MUBUF-NEXT:    v_writelane_b32 v40, s82, 19
+; MUBUF-NEXT:    v_writelane_b32 v40, s83, 20
+; MUBUF-NEXT:    v_writelane_b32 v40, s84, 21
+; MUBUF-NEXT:    v_writelane_b32 v40, s85, 22
+; MUBUF-NEXT:    v_writelane_b32 v40, s86, 23
+; MUBUF-NEXT:    v_writelane_b32 v40, s87, 24
+; MUBUF-NEXT:    v_writelane_b32 v40, s96, 25
+; MUBUF-NEXT:    v_writelane_b32 v40, s97, 26
+; MUBUF-NEXT:    v_writelane_b32 v40, s98, 27
+; MUBUF-NEXT:    v_writelane_b32 v40, s99, 28
+; MUBUF-NEXT:    v_writelane_b32 v40, s100, 29
+; MUBUF-NEXT:    v_writelane_b32 v40, s101, 30
+; MUBUF-NEXT:    s_addk_i32 s32, 0x200
+; MUBUF-NEXT:    v_writelane_b32 v40, s102, 31
 ; MUBUF-NEXT:    ;;#ASMSTART
 ; MUBUF-NEXT:    ; clobber nonpreserved SGPRs and 64 CSRs
 ; MUBUF-NEXT:    ;;#ASMEND
 ; MUBUF-NEXT:    ;;#ASMSTART
 ; MUBUF-NEXT:    ; clobber all VGPRs except CSR v40
 ; MUBUF-NEXT:    ;;#ASMEND
-; MUBUF-NEXT:    v_readlane_b32 s102, v40, 63
-; MUBUF-NEXT:    v_readlane_b32 s101, v40, 62
-; MUBUF-NEXT:    v_readlane_b32 s100, v40, 61
-; MUBUF-NEXT:    v_readlane_b32 s99, v40, 60
-; MUBUF-NEXT:    v_readlane_b32 s98, v40, 59
-; MUBUF-NEXT:    v_readlane_b32 s97, v40, 58
-; MUBUF-NEXT:    v_readlane_b32 s96, v40, 57
-; MUBUF-NEXT:    v_readlane_b32 s95, v40, 56
-; MUBUF-NEXT:    v_readlane_b32 s94, v40, 55
-; MUBUF-NEXT:    v_readlane_b32 s93, v40, 54
-; MUBUF-NEXT:    v_readlane_b32 s92, v40, 53
-; MUBUF-NEXT:    v_readlane_b32 s91, v40, 52
-; MUBUF-NEXT:    v_readlane_b32 s90, v40, 51
-; MUBUF-NEXT:    v_readlane_b32 s89, v40, 50
-; MUBUF-NEXT:    v_readlane_b32 s88, v40, 49
-; MUBUF-NEXT:    v_readlane_b32 s87, v40, 48
-; MUBUF-NEXT:    v_readlane_b32 s86, v40, 47
-; MUBUF-NEXT:    v_readlane_b32 s85, v40, 46
-; MUBUF-NEXT:    v_readlane_b32 s84, v40, 45
-; MUBUF-NEXT:    v_readlane_b32 s83, v40, 44
-; MUBUF-NEXT:    v_readlane_b32 s82, v40, 43
-; MUBUF-NEXT:    v_readlane_b32 s81, v40, 42
-; MUBUF-NEXT:    v_readlane_b32 s80, v40, 41
-; MUBUF-NEXT:    v_readlane_b32 s79, v40, 40
-; MUBUF-NEXT:    v_readlane_b32 s78, v40, 39
-; MUBUF-NEXT:    v_readlane_b32 s77, v40, 38
-; MUBUF-NEXT:    v_readlane_b32 s76, v40, 37
-; MUBUF-NEXT:    v_readlane_b32 s75, v40, 36
-; MUBUF-NEXT:    v_readlane_b32 s74, v40, 35
-; MUBUF-NEXT:    v_readlane_b32 s73, v40, 34
-; MUBUF-NEXT:    v_readlane_b32 s72, v40, 33
-; MUBUF-NEXT:    v_readlane_b32 s71, v40, 32
-; MUBUF-NEXT:    v_readlane_b32 s70, v40, 31
-; MUBUF-NEXT:    v_readlane_b32 s69, v40, 30
-; MUBUF-NEXT:    v_readlane_b32 s68, v40, 29
-; MUBUF-NEXT:    v_readlane_b32 s67, v40, 28
-; MUBUF-NEXT:    v_readlane_b32 s66, v40, 27
-; MUBUF-NEXT:    v_readlane_b32 s65, v40, 26
-; MUBUF-NEXT:    v_readlane_b32 s64, v40, 25
-; MUBUF-NEXT:    v_readlane_b32 s63, v40, 24
-; MUBUF-NEXT:    v_readlane_b32 s62, v40, 23
-; MUBUF-NEXT:    v_readlane_b32 s61, v40, 22
-; MUBUF-NEXT:    v_readlane_b32 s60, v40, 21
-; MUBUF-NEXT:    v_readlane_b32 s59, v40, 20
-; MUBUF-NEXT:    v_readlane_b32 s58, v40, 19
-; MUBUF-NEXT:    v_readlane_b32 s57, v40, 18
-; MUBUF-NEXT:    v_readlane_b32 s56, v40, 17
-; MUBUF-NEXT:    v_readlane_b32 s55, v40, 16
-; MUBUF-NEXT:    v_readlane_b32 s54, v40, 15
-; MUBUF-NEXT:    v_readlane_b32 s53, v40, 14
-; MUBUF-NEXT:    v_readlane_b32 s52, v40, 13
-; MUBUF-NEXT:    v_readlane_b32 s51, v40, 12
-; MUBUF-NEXT:    v_readlane_b32 s50, v40, 11
-; MUBUF-NEXT:    v_readlane_b32 s49, v40, 10
-; MUBUF-NEXT:    v_readlane_b32 s48, v40, 9
-; MUBUF-NEXT:    v_readlane_b32 s47, v40, 8
-; MUBUF-NEXT:    v_readlane_b32 s46, v40, 7
-; MUBUF-NEXT:    v_readlane_b32 s45, v40, 6
-; MUBUF-NEXT:    v_readlane_b32 s44, v40, 5
-; MUBUF-NEXT:    v_readlane_b32 s43, v40, 4
-; MUBUF-NEXT:    v_readlane_b32 s42, v40, 3
-; MUBUF-NEXT:    v_readlane_b32 s41, v40, 2
-; MUBUF-NEXT:    v_readlane_b32 s40, v40, 1
+; MUBUF-NEXT:    v_readlane_b32 s102, v40, 31
+; MUBUF-NEXT:    v_readlane_b32 s101, v40, 30
+; MUBUF-NEXT:    v_readlane_b32 s100, v40, 29
+; MUBUF-NEXT:    v_readlane_b32 s99, v40, 28
+; MUBUF-NEXT:    v_readlane_b32 s98, v40, 27
+; MUBUF-NEXT:    v_readlane_b32 s97, v40, 26
+; MUBUF-NEXT:    v_readlane_b32 s96, v40, 25
+; MUBUF-NEXT:    v_readlane_b32 s87, v40, 24
+; MUBUF-NEXT:    v_readlane_b32 s86, v40, 23
+; MUBUF-NEXT:    v_readlane_b32 s85, v40, 22
+; MUBUF-NEXT:    v_readlane_b32 s84, v40, 21
+; MUBUF-NEXT:    v_readlane_b32 s83, v40, 20
+; MUBUF-NEXT:    v_readlane_b32 s82, v40, 19
+; MUBUF-NEXT:    v_readlane_b32 s81, v40, 18
+; MUBUF-NEXT:    v_readlane_b32 s80, v40, 17
+; MUBUF-NEXT:    v_readlane_b32 s71, v40, 16
+; MUBUF-NEXT:    v_readlane_b32 s70, v40, 15
+; MUBUF-NEXT:    v_readlane_b32 s69, v40, 14
+; MUBUF-NEXT:    v_readlane_b32 s68, v40, 13
+; MUBUF-NEXT:    v_readlane_b32 s67, v40, 12
+; MUBUF-NEXT:    v_readlane_b32 s66, v40, 11
+; MUBUF-NEXT:    v_readlane_b32 s65, v40, 10
+; MUBUF-NEXT:    v_readlane_b32 s64, v40, 9
+; MUBUF-NEXT:    v_readlane_b32 s55, v40, 8
+; MUBUF-NEXT:    v_readlane_b32 s54, v40, 7
+; MUBUF-NEXT:    v_readlane_b32 s53, v40, 6
+; MUBUF-NEXT:    v_readlane_b32 s52, v40, 5
+; MUBUF-NEXT:    v_readlane_b32 s51, v40, 4
+; MUBUF-NEXT:    v_readlane_b32 s50, v40, 3
+; MUBUF-NEXT:    v_readlane_b32 s49, v40, 2
+; MUBUF-NEXT:    v_readlane_b32 s48, v40, 1
 ; MUBUF-NEXT:    v_readlane_b32 s39, v40, 0
 ; MUBUF-NEXT:    s_mov_b32 s32, s33
-; MUBUF-NEXT:    v_readlane_b32 s4, v41, 0
+; MUBUF-NEXT:    v_readlane_b32 s4, v40, 32
 ; MUBUF-NEXT:    s_or_saveexec_b64 s[6:7], -1
 ; MUBUF-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
-; MUBUF-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
 ; MUBUF-NEXT:    s_mov_b64 exec, s[6:7]
 ; MUBUF-NEXT:    s_mov_b32 s33, s4
 ; MUBUF-NEXT:    s_waitcnt vmcnt(0)
@@ -2194,139 +1608,75 @@ define void @callee_need_to_spill_fp_to_reg() #1 {
 ; FLATSCR-NEXT:    scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
 ; FLATSCR-NEXT:    s_mov_b64 exec, s[2:3]
 ; FLATSCR-NEXT:    v_writelane_b32 v40, s39, 0
-; FLATSCR-NEXT:    v_writelane_b32 v40, s40, 1
-; FLATSCR-NEXT:    v_writelane_b32 v40, s41, 2
-; FLATSCR-NEXT:    v_writelane_b32 v40, s42, 3
-; FLATSCR-NEXT:    v_writelane_b32 v40, s43, 4
-; FLATSCR-NEXT:    v_writelane_b32 v40, s44, 5
-; FLATSCR-NEXT:    v_writelane_b32 v40, s45, 6
-; FLATSCR-NEXT:    v_writelane_b32 v40, s46, 7
-; FLATSCR-NEXT:    v_writelane_b32 v40, s47, 8
-; FLATSCR-NEXT:    v_writelane_b32 v40, s48, 9
-; FLATSCR-NEXT:    v_writelane_b32 v40, s49, 10
-; FLATSCR-NEXT:    v_writelane_b32 v40, s50, 11
-; FLATSCR-NEXT:    v_writelane_b32 v40, s51, 12
-; FLATSCR-NEXT:    v_writelane_b32 v40, s52, 13
-; FLATSCR-NEXT:    v_writelane_b32 v40, s53, 14
-; FLATSCR-NEXT:    v_writelane_b32 v40, s54, 15
-; FLATSCR-NEXT:    v_writelane_b32 v40, s55, 16
-; FLATSCR-NEXT:    v_writelane_b32 v40, s56, 17
-; FLATSCR-NEXT:    v_writelane_b32 v40, s57, 18
-; FLATSCR-NEXT:    v_writelane_b32 v40, s58, 19
-; FLATSCR-NEXT:    v_writelane_b32 v40, s59, 20
-; FLATSCR-NEXT:    v_writelane_b32 v40, s60, 21
-; FLATSCR-NEXT:    v_writelane_b32 v40, s61, 22
-; FLATSCR-NEXT:    v_writelane_b32 v40, s62, 23
-; FLATSCR-NEXT:    v_writelane_b32 v40, s63, 24
-; FLATSCR-NEXT:    v_writelane_b32 v40, s64, 25
-; FLATSCR-NEXT:    v_writelane_b32 v40, s65, 26
-; FLATSCR-NEXT:    v_writelane_b32 v40, s66, 27
-; FLATSCR-NEXT:    v_writelane_b32 v40, s67, 28
-; FLATSCR-NEXT:    v_writelane_b32 v40, s68, 29
-; FLATSCR-NEXT:    v_writelane_b32 v40, s69, 30
-; FLATSCR-NEXT:    v_writelane_b32 v40, s70, 31
-; FLATSCR-NEXT:    v_writelane_b32 v40, s71, 32
-; FLATSCR-NEXT:    v_writelane_b32 v40, s72, 33
-; FLATSCR-NEXT:    v_writelane_b32 v40, s73, 34
-; FLATSCR-NEXT:    v_writelane_b32 v40, s74, 35
-; FLATSCR-NEXT:    v_writelane_b32 v40, s75, 36
-; FLATSCR-NEXT:    v_writelane_b32 v40, s76, 37
-; FLATSCR-NEXT:    v_writelane_b32 v40, s77, 38
-; FLATSCR-NEXT:    v_writelane_b32 v40, s78, 39
-; FLATSCR-NEXT:    v_writelane_b32 v40, s79, 40
-; FLATSCR-NEXT:    v_writelane_b32 v40, s80, 41
-; FLATSCR-NEXT:    v_writelane_b32 v40, s81, 42
-; FLATSCR-NEXT:    v_writelane_b32 v40, s82, 43
-; FLATSCR-NEXT:    v_writelane_b32 v40, s83, 44
-; FLATSCR-NEXT:    v_writelane_b32 v40, s84, 45
-; FLATSCR-NEXT:    v_writelane_b32 v40, s85, 46
-; FLATSCR-NEXT:    v_writelane_b32 v40, s86, 47
-; FLATSCR-NEXT:    v_writelane_b32 v40, s87, 48
-; FLATSCR-NEXT:    v_writelane_b32 v40, s88, 49
-; FLATSCR-NEXT:    v_writelane_b32 v40, s89, 50
-; FLATSCR-NEXT:    v_writelane_b32 v40, s90, 51
-; FLATSCR-NEXT:    v_writelane_b32 v40, s91, 52
-; FLATSCR-NEXT:    v_writelane_b32 v40, s92, 53
-; FLATSCR-NEXT:    v_writelane_b32 v40, s93, 54
-; FLATSCR-NEXT:    v_writelane_b32 v40, s94, 55
-; FLATSCR-NEXT:    v_writelane_b32 v40, s95, 56
-; FLATSCR-NEXT:    v_writelane_b32 v40, s96, 57
-; FLATSCR-NEXT:    v_writelane_b32 v40, s97, 58
-; FLATSCR-NEXT:    v_writelane_b32 v40, s98, 59
-; FLATSCR-NEXT:    v_writelane_b32 v40, s99, 60
-; FLATSCR-NEXT:    v_writelane_b32 v40, s100, 61
-; FLATSCR-NEXT:    v_writelane_b32 v40, s101, 62
+; FLATSCR-NEXT:    v_writelane_b32 v40, s48, 1
+; FLATSCR-NEXT:    v_writelane_b32 v40, s49, 2
+; FLATSCR-NEXT:    v_writelane_b32 v40, s50, 3
+; FLATSCR-NEXT:    v_writelane_b32 v40, s51, 4
+; FLATSCR-NEXT:    v_writelane_b32 v40, s52, 5
+; FLATSCR-NEXT:    v_writelane_b32 v40, s53, 6
+; FLATSCR-NEXT:    v_writelane_b32 v40, s54, 7
+; FLATSCR-NEXT:    v_writelane_b32 v40, s55, 8
+; FLATSCR-NEXT:    v_writelane_b32 v40, s64, 9
+; FLATSCR-NEXT:    v_writelane_b32 v40, s65, 10
+; FLATSCR-NEXT:    v_writelane_b32 v40, s66, 11
+; FLATSCR-NEXT:    v_writelane_b32 v40, s67, 12
+; FLATSCR-NEXT:    v_writelane_b32 v40, s68, 13
+; FLATSCR-NEXT:    v_writelane_b32 v40, s69, 14
+; FLATSCR-NEXT:    v_writelane_b32 v40, s70, 15
+; FLATSCR-NEXT:    v_writelane_b32 v40, s71, 16
+; FLATSCR-NEXT:    v_writelane_b32 v40, s80, 17
+; FLATSCR-NEXT:    v_writelane_b32 v40, s81, 18
+; FLATSCR-NEXT:    v_writelane_b32 v40, s82, 19
+; FLATSCR-NEXT:    v_writelane_b32 v40, s83, 20
+; FLATSCR-NEXT:    v_writelane_b32 v40, s84, 21
+; FLATSCR-NEXT:    v_writelane_b32 v40, s85, 22
+; FLATSCR-NEXT:    v_writelane_b32 v40, s86, 23
+; FLATSCR-NEXT:    v_writelane_b32 v40, s87, 24
+; FLATSCR-NEXT:    v_writelane_b32 v40, s96, 25
+; FLATSCR-NEXT:    v_writelane_b32 v40, s97, 26
+; FLATSCR-NEXT:    v_writelane_b32 v40, s98, 27
+; FLATSCR-NEXT:    v_writelane_b32 v40, s99, 28
+; FLATSCR-NEXT:    v_writelane_b32 v40, s100, 29
+; FLATSCR-NEXT:    v_writelane_b32 v40, s101, 30
 ; FLATSCR-NEXT:    s_add_i32 s32, s32, 8
-; FLATSCR-NEXT:    v_writelane_b32 v40, s102, 63
+; FLATSCR-NEXT:    v_writelane_b32 v40, s102, 31
 ; FLATSCR-NEXT:    ;;#ASMSTART
 ; FLATSCR-NEXT:    ; clobber nonpreserved SGPRs and 64 CSRs
 ; FLATSCR-NEXT:    ;;#ASMEND
 ; FLATSCR-NEXT:    ;;#ASMSTART
 ; FLATSCR-NEXT:    ; clobber all VGPRs except CSR v40
 ; FLATSCR-NEXT:    ;;#ASMEND
-; FLATSCR-NEXT:    v_readlane_b32 s102, v40, 63
-; FLATSCR-NEXT:    v_readlane_b32 s101, v40, 62
-; FLATSCR-NEXT:    v_readlane_b32 s100, v40, 61
-; FLATSCR-NEXT:    v_readlane_b32 s99, v40, 60
-; FLATSCR-NEXT:    v_readlane_b32 s98, v40, 59
-; FLATSCR-NEXT:    v_readlane_b32 s97, v40, 58
-; FLATSCR-NEXT:    v_readlane_b32 s96, v40, 57
-; FLATSCR-NEXT:    v_readlane_b32 s95, v40, 56
-; FLATSCR-NEXT:    v_readlane_b32 s94, v40, 55
-; FLATSCR-NEXT:    v_readlane_b32 s93, v40, 54
-; FLATSCR-NEXT:    v_readlane_b32 s92, v40, 53
-; FLATSCR-NEXT:    v_readlane_b32 s91, v40, 52
-; FLATSCR-NEXT:    v_readlane_b32 s90, v40, 51
-; FLATSCR-NEXT:    v_readlane_b32 s89, v40, 50
-; FLATSCR-NEXT:    v_readlane_b32 s88, v40, 49
-; FLATSCR-NEXT:    v_readlane_b32 s87, v40, 48
-; FLATSCR-NEXT:    v_readlane_b32 s86, v40, 47
-; FLATSCR-NEXT:    v_readlane_b32 s85, v40, 46
-; FLATSCR-NEXT:    v_readlane_b32 s84, v40, 45
-; FLATSCR-NEXT:    v_readlane_b32 s83, v40, 44
-; FLATSCR-NEXT:    v_readlane_b32 s82, v40, 43
-; FLATSCR-NEXT:    v_readlane_b32 s81, v40, 42
-; FLATSCR-NEXT:    v_readlane_b32 s80, v40, 41
-; FLATSCR-NEXT:    v_readlane_b32 s79, v40, 40
-; FLATSCR-NEXT:    v_readlane_b32 s78, v40, 39
-; FLATSCR-NEXT:    v_readlane_b32 s77, v40, 38
-; FLATSCR-NEXT:    v_readlane_b32 s76, v40, 37
-; FLATSCR-NEXT:    v_readlane_b32 s75, v40, 36
-; FLATSCR-NEXT:    v_readlane_b32 s74, v40, 35
-; FLATSCR-NEXT:    v_readlane_b32 s73, v40, 34
-; FLATSCR-NEXT:    v_readlane_b32 s72, v40, 33
-; FLATSCR-NEXT:    v_readlane_b32 s71, v40, 32
-; FLATSCR-NEXT:    v_readlane_b32 s70, v40, 31
-; FLATSCR-NEXT:    v_readlane_b32 s69, v40, 30
-; FLATSCR-NEXT:    v_readlane_b32 s68, v40, 29
-; FLATSCR-NEXT:    v_readlane_b32 s67, v40, 28
-; FLATSCR-NEXT:    v_readlane_b32 s66, v40, 27
-; FLATSCR-NEXT:    v_readlane_b32 s65, v40, 26
-; FLATSCR-NEXT:    v_readlane_b32 s64, v40, 25
-; FLATSCR-NEXT:    v_readlane_b32 s63, v40, 24
-; FLATSCR-NEXT:    v_readlane_b32 s62, v40, 23
-; FLATSCR-NEXT:    v_readlane_b32 s61, v40, 22
-; FLATSCR-NEXT:    v_readlane_b32 s60, v40, 21
-; FLATSCR-NEXT:    v_readlane_b32 s59, v40, 20
-; FLATSCR-NEXT:    v_readlane_b32 s58, v40, 19
-; FLATSCR-NEXT:    v_readlane_b32 s57, v40, 18
-; FLATSCR-NEXT:    v_readlane_b32 s56, v40, 17
-; FLATSCR-NEXT:    v_readlane_b32 s55, v40, 16
-; FLATSCR-NEXT:    v_readlane_b32 s54, v40, 15
-; FLATSCR-NEXT:    v_readlane_b32 s53, v40, 14
-; FLATSCR-NEXT:    v_readlane_b32 s52, v40, 13
-; FLATSCR-NEXT:    v_readlane_b32 s51, v40, 12
-; FLATSCR-NEXT:    v_readlane_b32 s50, v40, 11
-; FLATSCR-NEXT:    v_readlane_b32 s49, v40, 10
-; FLATSCR-NEXT:    v_readlane_b32 s48, v40, 9
-; FLATSCR-NEXT:    v_readlane_b32 s47, v40, 8
-; FLATSCR-NEXT:    v_readlane_b32 s46, v40, 7
-; FLATSCR-NEXT:    v_readlane_b32 s45, v40, 6
-; FLATSCR-NEXT:    v_readlane_b32 s44, v40, 5
-; FLATSCR-NEXT:    v_readlane_b32 s43, v40, 4
-; FLATSCR-NEXT:    v_readlane_b32 s42, v40, 3
-; FLATSCR-NEXT:    v_readlane_b32 s41, v40, 2
-; FLATSCR-NEXT:    v_readlane_b32 s40, v40, 1
+; FLATSCR-NEXT:    v_readlane_b32 s102, v40, 31
+; FLATSCR-NEXT:    v_readlane_b32 s101, v40, 30
+; FLATSCR-NEXT:    v_readlane_b32 s100, v40, 29
+; FLATSCR-NEXT:    v_readlane_b32 s99, v40, 28
+; FLATSCR-NEXT:    v_readlane_b32 s98, v40, 27
+; FLATSCR-NEXT:    v_readlane_b32 s97, v40, 26
+; FLATSCR-NEXT:    v_readlane_b32 s96, v40, 25
+; FLATSCR-NEXT:    v_readlane_b32 s87, v40, 24
+; FLATSCR-NEXT:    v_readlane_b32 s86, v40, 23
+; FLATSCR-NEXT:    v_readlane_b32 s85, v40, 22
+; FLATSCR-NEXT:    v_readlane_b32 s84, v40, 21
+; FLATSCR-NEXT:    v_readlane_b32 s83, v40, 20
+; FLATSCR-NEXT:    v_readlane_b32 s82, v40, 19
+; FLATSCR-NEXT:    v_readlane_b32 s81, v40, 18
+; FLATSCR-NEXT:    v_readlane_b32 s80, v40, 17
+; FLATSCR-NEXT:    v_readlane_b32 s71, v40, 16
+; FLATSCR-NEXT:    v_readlane_b32 s70, v40, 15
+; FLATSCR-NEXT:    v_readlane_b32 s69, v40, 14
+; FLATSCR-NEXT:    v_readlane_b32 s68, v40, 13
+; FLATSCR-NEXT:    v_readlane_b32 s67, v40, 12
+; FLATSCR-NEXT:    v_readlane_b32 s66, v40, 11
+; FLATSCR-NEXT:    v_readlane_b32 s65, v40, 10
+; FLATSCR-NEXT:    v_readlane_b32 s64, v40, 9
+; FLATSCR-NEXT:    v_readlane_b32 s55, v40, 8
+; FLATSCR-NEXT:    v_readlane_b32 s54, v40, 7
+; FLATSCR-NEXT:    v_readlane_b32 s53, v40, 6
+; FLATSCR-NEXT:    v_readlane_b32 s52, v40, 5
+; FLATSCR-NEXT:    v_readlane_b32 s51, v40, 4
+; FLATSCR-NEXT:    v_readlane_b32 s50, v40, 3
+; FLATSCR-NEXT:    v_readlane_b32 s49, v40, 2
+; FLATSCR-NEXT:    v_readlane_b32 s48, v40, 1
 ; FLATSCR-NEXT:    v_readlane_b32 s39, v40, 0
 ; FLATSCR-NEXT:    s_mov_b32 s32, s33
 ; FLATSCR-NEXT:    s_or_saveexec_b64 s[2:3], -1
@@ -2367,75 +1717,42 @@ define void @spill_fp_to_memory_scratch_reg_needed_mubuf_offset(ptr addrspace(5)
 ; MUBUF-NEXT:    s_add_i32 s5, s33, 0x40100
 ; MUBUF-NEXT:    buffer_store_dword v39, off, s[0:3], s5 ; 4-byte Folded Spill
 ; MUBUF-NEXT:    s_mov_b64 exec, s[6:7]
+; MUBUF-NEXT:    v_writelane_b32 v39, s4, 32
 ; MUBUF-NEXT:    v_writelane_b32 v39, s39, 0
-; MUBUF-NEXT:    v_writelane_b32 v39, s40, 1
-; MUBUF-NEXT:    v_writelane_b32 v39, s41, 2
-; MUBUF-NEXT:    v_writelane_b32 v39, s42, 3
-; MUBUF-NEXT:    v_writelane_b32 v39, s43, 4
-; MUBUF-NEXT:    v_writelane_b32 v39, s44, 5
-; MUBUF-NEXT:    v_writelane_b32 v39, s45, 6
-; MUBUF-NEXT:    v_writelane_b32 v39, s46, 7
-; MUBUF-NEXT:    v_writelane_b32 v39, s47, 8
-; MUBUF-NEXT:    v_writelane_b32 v39, s48, 9
-; MUBUF-NEXT:    v_writelane_b32 v39, s49, 10
-; MUBUF-NEXT:    v_writelane_b32 v39, s50, 11
-; MUBUF-NEXT:    v_writelane_b32 v39, s51, 12
-; MUBUF-NEXT:    v_writelane_b32 v39, s52, 13
-; MUBUF-NEXT:    v_writelane_b32 v39, s53, 14
-; MUBUF-NEXT:    v_writelane_b32 v39, s54, 15
-; MUBUF-NEXT:    v_writelane_b32 v39, s55, 16
-; MUBUF-NEXT:    v_writelane_b32 v39, s56, 17
-; MUBUF-NEXT:    v_writelane_b32 v39, s57, 18
-; MUBUF-NEXT:    v_writelane_b32 v39, s58, 19
-; MUBUF-NEXT:    v_writelane_b32 v39, s59, 20
-; MUBUF-NEXT:    v_writelane_b32 v39, s60, 21
-; MUBUF-NEXT:    v_writelane_b32 v39, s61, 22
-; MUBUF-NEXT:    v_writelane_b32 v39, s62, 23
-; MUBUF-NEXT:    v_writelane_b32 v39, s63, 24
-; MUBUF-NEXT:    v_writelane_b32 v39, s64, 25
-; MUBUF-NEXT:    v_writelane_b32 v39, s65, 26
-; MUBUF-NEXT:    v_writelane_b32 v39, s66, 27
-; MUBUF-NEXT:    v_writelane_b32 v39, s67, 28
-; MUBUF-NEXT:    v_writelane_b32 v39, s68, 29
-; MUBUF-NEXT:    v_writelane_b32 v39, s69, 30
-; MUBUF-NEXT:    v_writelane_b32 v39, s70, 31
-; MUBUF-NEXT:    v_writelane_b32 v39, s71, 32
-; MUBUF-NEXT:    v_writelane_b32 v39, s72, 33
-; MUBUF-NEXT:    v_writelane_b32 v39, s73, 34
-; MUBUF-NEXT:    v_writelane_b32 v39, s74, 35
-; MUBUF-NEXT:    v_writelane_b32 v39, s75, 36
-; MUBUF-NEXT:    v_writelane_b32 v39, s76, 37
-; MUBUF-NEXT:    v_writelane_b32 v39, s77, 38
-; MUBUF-NEXT:    v_writelane_b32 v39, s78, 39
-; MUBUF-NEXT:    v_writelane_b32 v39, s79, 40
-; MUBUF-NEXT:    v_writelane_b32 v39, s80, 41
-; MUBUF-NEXT:    v_writelane_b32 v39, s81, 42
-; MUBUF-NEXT:    v_writelane_b32 v39, s82, 43
-; MUBUF-NEXT:    v_writelane_b32 v39, s83, 44
-; MUBUF-NEXT:    v_writelane_b32 v39, s84, 45
-; MUBUF-NEXT:    v_writelane_b32 v39, s85, 46
-; MUBUF-NEXT:    v_writelane_b32 v39, s86, 47
-; MUBUF-NEXT:    v_writelane_b32 v39, s87, 48
-; MUBUF-NEXT:    v_writelane_b32 v39, s88, 49
-; MUBUF-NEXT:    v_writelane_b32 v39, s89, 50
-; MUBUF-NEXT:    v_writelane_b32 v39, s90, 51
-; MUBUF-NEXT:    v_writelane_b32 v39, s91, 52
-; MUBUF-NEXT:    v_writelane_b32 v39, s92, 53
-; MUBUF-NEXT:    v_writelane_b32 v39, s93, 54
-; MUBUF-NEXT:    v_writelane_b32 v39, s94, 55
-; MUBUF-NEXT:    v_writelane_b32 v39, s95, 56
-; MUBUF-NEXT:    v_writelane_b32 v39, s96, 57
-; MUBUF-NEXT:    v_writelane_b32 v39, s97, 58
-; MUBUF-NEXT:    v_writelane_b32 v39, s98, 59
-; MUBUF-NEXT:    v_writelane_b32 v39, s99, 60
-; MUBUF-NEXT:    v_mov_b32_e32 v0, s4
-; MUBUF-NEXT:    s_add_i32 s5, s33, 0x40200
-; MUBUF-NEXT:    v_writelane_b32 v39, s100, 61
-; MUBUF-NEXT:    buffer_store_dword v0, off, s[0:3], s5 ; 4-byte Folded Spill
-; MUBUF-NEXT:    v_writelane_b32 v39, s101, 62
+; MUBUF-NEXT:    v_writelane_b32 v39, s48, 1
+; MUBUF-NEXT:    v_writelane_b32 v39, s49, 2
+; MUBUF-NEXT:    v_writelane_b32 v39, s50, 3
+; MUBUF-NEXT:    v_writelane_b32 v39, s51, 4
+; MUBUF-NEXT:    v_writelane_b32 v39, s52, 5
+; MUBUF-NEXT:    v_writelane_b32 v39, s53, 6
+; MUBUF-NEXT:    v_writelane_b32 v39, s54, 7
+; MUBUF-NEXT:    v_writelane_b32 v39, s55, 8
+; MUBUF-NEXT:    v_writelane_b32 v39, s64, 9
+; MUBUF-NEXT:    v_writelane_b32 v39, s65, 10
+; MUBUF-NEXT:    v_writelane_b32 v39, s66, 11
+; MUBUF-NEXT:    v_writelane_b32 v39, s67, 12
+; MUBUF-NEXT:    v_writelane_b32 v39, s68, 13
+; MUBUF-NEXT:    v_writelane_b32 v39, s69, 14
+; MUBUF-NEXT:    v_writelane_b32 v39, s70, 15
+; MUBUF-NEXT:    v_writelane_b32 v39, s71, 16
+; MUBUF-NEXT:    v_writelane_b32 v39, s80, 17
+; MUBUF-NEXT:    v_writelane_b32 v39, s81, 18
+; MUBUF-NEXT:    v_writelane_b32 v39, s82, 19
+; MUBUF-NEXT:    v_writelane_b32 v39, s83, 20
+; MUBUF-NEXT:    v_writelane_b32 v39, s84, 21
+; MUBUF-NEXT:    v_writelane_b32 v39, s85, 22
+; MUBUF-NEXT:    v_writelane_b32 v39, s86, 23
+; MUBUF-NEXT:    v_writelane_b32 v39, s87, 24
+; MUBUF-NEXT:    v_writelane_b32 v39, s96, 25
+; MUBUF-NEXT:    v_writelane_b32 v39, s97, 26
+; MUBUF-NEXT:    v_writelane_b32 v39, s98, 27
+; MUBUF-NEXT:    v_writelane_b32 v39, s99, 28
+; MUBUF-NEXT:    v_writelane_b32 v39, s100, 29
+; MUBUF-NEXT:    v_writelane_b32 v39, s101, 30
 ; MUBUF-NEXT:    v_mov_b32_e32 v0, 0
 ; MUBUF-NEXT:    v_mov_b32_e32 v1, 0x1000
-; MUBUF-NEXT:    v_writelane_b32 v39, s102, 63
+; MUBUF-NEXT:    s_add_i32 s32, s32, 0x40300
+; MUBUF-NEXT:    v_writelane_b32 v39, s102, 31
 ; MUBUF-NEXT:    buffer_store_dword v0, v1, s[0:3], s33 offen
 ; MUBUF-NEXT:    s_waitcnt vmcnt(0)
 ; MUBUF-NEXT:    ;;#ASMSTART
@@ -2444,76 +1761,40 @@ define void @spill_fp_to_memory_scratch_reg_needed_mubuf_offset(ptr addrspace(5)
 ; MUBUF-NEXT:    ;;#ASMSTART
 ; MUBUF-NEXT:    ; clobber all VGPRs except CSR v40
 ; MUBUF-NEXT:    ;;#ASMEND
-; MUBUF-NEXT:    s_add_i32 s5, s33, 0x40200
-; MUBUF-NEXT:    buffer_load_dword v0, off, s[0:3], s5 ; 4-byte Folded Reload
-; MUBUF-NEXT:    s_add_i32 s32, s32, 0x40400
-; MUBUF-NEXT:    v_readlane_b32 s102, v39, 63
-; MUBUF-NEXT:    v_readlane_b32 s101, v39, 62
-; MUBUF-NEXT:    v_readlane_b32 s100, v39, 61
-; MUBUF-NEXT:    v_readlane_b32 s99, v39, 60
-; MUBUF-NEXT:    v_readlane_b32 s98, v39, 59
-; MUBUF-NEXT:    v_readlane_b32 s97, v39, 58
-; MUBUF-NEXT:    v_readlane_b32 s96, v39, 57
-; MUBUF-NEXT:    v_readlane_b32 s95, v39, 56
-; MUBUF-NEXT:    v_readlane_b32 s94, v39, 55
-; MUBUF-NEXT:    v_readlane_b32 s93, v39, 54
-; MUBUF-NEXT:    v_readlane_b32 s92, v39, 53
-; MUBUF-NEXT:    v_readlane_b32 s91, v39, 52
-; MUBUF-NEXT:    v_readlane_b32 s90, v39, 51
-; MUBUF-NEXT:    v_readlane_b32 s89, v39, 50
-; MUBUF-NEXT:    v_readlane_b32 s88, v39, 49
-; MUBUF-NEXT:    v_readlane_b32 s87, v39, 48
-; MUBUF-NEXT:    v_readlane_b32 s86, v39, 47
-; MUBUF-NEXT:    v_readlane_b32 s85, v39, 46
-; MUBUF-NEXT:    v_readlane_b32 s84, v39, 45
-; MUBUF-NEXT:    v_readlane_b32 s83, v39, 44
-; MUBUF-NEXT:    v_readlane_b32 s82, v39, 43
-; MUBUF-NEXT:    v_readlane_b32 s81, v39, 42
-; MUBUF-NEXT:    v_readlane_b32 s80, v39, 41
-; MUBUF-NEXT:    v_readlane_b32 s79, v39, 40
-; MUBUF-NEXT:    v_readlane_b32 s78, v39, 39
-; MUBUF-NEXT:    v_readlane_b32 s77, v39, 38
-; MUBUF-NEXT:    v_readlane_b32 s76, v39, 37
-; MUBUF-NEXT:    v_readlane_b32 s75, v39, 36
-; MUBUF-NEXT:    v_readlane_b32 s74, v39, 35
-; MUBUF-NEXT:    v_readlane_b32 s73, v39, 34
-; MUBUF-NEXT:    v_readlane_b32 s72, v39, 33
-; MUBUF-NEXT:    v_readlane_b32 s71, v39, 32
-; MUBUF-NEXT:    v_readlane_b32 s70, v39, 31
-; MUBUF-NEXT:    v_readlane_b32 s69, v39, 30
-; MUBUF-NEXT:    v_readlane_b32 s68, v39, 29
-; MUBUF-NEXT:    v_readlane_b32 s67, v39, 28
-; MUBUF-NEXT:    v_readlane_b32 s66, v39, 27
-; MUBUF-NEXT:    v_readlane_b32 s65, v39, 26
-; MUBUF-NEXT:    v_readlane_b32 s64, v39, 25
-; MUBUF-NEXT:    v_readlane_b32 s63, v39, 24
-; MUBUF-NEXT:    v_readlane_b32 s62, v39, 23
-; MUBUF-NEXT:    v_readlane_b32 s61, v39, 22
-; MUBUF-NEXT:    v_readlane_b32 s60, v39, 21
-; MUBUF-NEXT:    v_readlane_b32 s59, v39, 20
-; MUBUF-NEXT:    v_readlane_b32 s58, v39, 19
-; MUBUF-NEXT:    v_readlane_b32 s57, v39, 18
-; MUBUF-NEXT:    v_readlane_b32 s56, v39, 17
-; MUBUF-NEXT:    v_readlane_b32 s55, v39, 16
-; MUBUF-NEXT:    v_readlane_b32 s54, v39, 15
-; MUBUF-NEXT:    v_readlane_b32 s53, v39, 14
-; MUBUF-NEXT:    v_readlane_b32 s52, v39, 13
-; MUBUF-NEXT:    v_readlane_b32 s51, v39, 12
-; MUBUF-NEXT:    v_readlane_b32 s50, v39, 11
-; MUBUF-NEXT:    v_readlane_b32 s49, v39, 10
-; MUBUF-NEXT:    v_readlane_b32 s48, v39, 9
-; MUBUF-NEXT:    v_readlane_b32 s47, v39, 8
-; MUBUF-NEXT:    v_readlane_b32 s46, v39, 7
-; MUBUF-NEXT:    v_readlane_b32 s45, v39, 6
-; MUBUF-NEXT:    v_readlane_b32 s44, v39, 5
-; MUBUF-NEXT:    v_readlane_b32 s43, v39, 4
-; MUBUF-NEXT:    v_readlane_b32 s42, v39, 3
-; MUBUF-NEXT:    v_readlane_b32 s41, v39, 2
-; MUBUF-NEXT:    v_readlane_b32 s40, v39, 1
+; MUBUF-NEXT:    v_readlane_b32 s102, v39, 31
+; MUBUF-NEXT:    v_readlane_b32 s101, v39, 30
+; MUBUF-NEXT:    v_readlane_b32 s100, v39, 29
+; MUBUF-NEXT:    v_readlane_b32 s99, v39, 28
+; MUBUF-NEXT:    v_readlane_b32 s98, v39, 27
+; MUBUF-NEXT:    v_readlane_b32 s97, v39, 26
+; MUBUF-NEXT:    v_readlane_b32 s96, v39, 25
+; MUBUF-NEXT:    v_readlane_b32 s87, v39, 24
+; MUBUF-NEXT:    v_readlane_b32 s86, v39, 23
+; MUBUF-NEXT:    v_readlane_b32 s85, v39, 22
+; MUBUF-NEXT:    v_readlane_b32 s84, v39, 21
+; MUBUF-NEXT:    v_readlane_b32 s83, v39, 20
+; MUBUF-NEXT:    v_readlane_b32 s82, v39, 19
+; MUBUF-NEXT:    v_readlane_b32 s81, v39, 18
+; MUBUF-NEXT:    v_readlane_b32 s80, v39, 17
+; MUBUF-NEXT:    v_readlane_b32 s71, v39, 16
+; MUBUF-NEXT:    v_readlane_b32 s70, v39, 15
+; MUBUF-NEXT:    v_readlane_b32 s69, v39, 14
+; MUBUF-NEXT:    v_readlane_b32 s68, v39, 13
+; MUBUF-NEXT:    v_readlane_b32 s67, v39, 12
+; MUBUF-NEXT:    v_readlane_b32 s66, v39, 11
+; MUBUF-NEXT:    v_readlane_b32 s65, v39, 10
+; MUBUF-NEXT:    v_readlane_b32 s64, v39, 9
+; MUBUF-NEXT:    v_readlane_b32 s55, v39, 8
+; MUBUF-NEXT:    v_readlane_b32 s54, v39, 7
+; MUBUF-NEXT:    v_readlane_b32 s53, v39, 6
+; MUBUF-NEXT:    v_readlane_b32 s52, v39, 5
+; MUBUF-NEXT:    v_readlane_b32 s51, v39, 4
+; MUBUF-NEXT:    v_readlane_b32 s50, v39, 3
+; MUBUF-NEXT:    v_readlane_b32 s49, v39, 2
+; MUBUF-NEXT:    v_readlane_b32 s48, v39, 1
 ; MUBUF-NEXT:    v_readlane_b32 s39, v39, 0
 ; MUBUF-NEXT:    s_mov_b32 s32, s33
-; MUBUF-NEXT:    s_waitcnt vmcnt(0)
-; MUBUF-NEXT:    v_readfirstlane_b32 s4, v0
+; MUBUF-NEXT:    v_readlane_b32 s4, v39, 32
 ; MUBUF-NEXT:    s_xor_saveexec_b64 s[6:7], -1
 ; MUBUF-NEXT:    s_add_i32 s5, s33, 0x40100
 ; MUBUF-NEXT:    buffer_load_dword v39, off, s[0:3], s5 ; 4-byte Folded Reload
@@ -2532,72 +1813,40 @@ define void @spill_fp_to_memory_scratch_reg_needed_mubuf_offset(ptr addrspace(5)
 ; FLATSCR-NEXT:    scratch_store_dword off, v39, s1 ; 4-byte Folded Spill
 ; FLATSCR-NEXT:    s_mov_b64 exec, s[2:3]
 ; FLATSCR-NEXT:    v_writelane_b32 v39, s39, 0
-; FLATSCR-NEXT:    v_writelane_b32 v39, s40, 1
-; FLATSCR-NEXT:    v_writelane_b32 v39, s41, 2
-; FLATSCR-NEXT:    v_writelane_b32 v39, s42, 3
-; FLATSCR-NEXT:    v_writelane_b32 v39, s43, 4
-; FLATSCR-NEXT:    v_writelane_b32 v39, s44, 5
-; FLATSCR-NEXT:    v_writelane_b32 v39, s45, 6
-; FLATSCR-NEXT:    v_writelane_b32 v39, s46, 7
-; FLATSCR-NEXT:    v_writelane_b32 v39, s47, 8
-; FLATSCR-NEXT:    v_writelane_b32 v39, s48, 9
-; FLATSCR-NEXT:    v_writelane_b32 v39, s49, 10
-; FLATSCR-NEXT:    v_writelane_b32 v39, s50, 11
-; FLATSCR-NEXT:    v_writelane_b32 v39, s51, 12
-; FLATSCR-NEXT:    v_writelane_b32 v39, s52, 13
-; FLATSCR-NEXT:    v_writelane_b32 v39, s53, 14
-; FLATSCR-NEXT:    v_writelane_b32 v39, s54, 15
-; FLATSCR-NEXT:    v_writelane_b32 v39, s55, 16
-; FLATSCR-NEXT:    v_writelane_b32 v39, s56, 17
-; FLATSCR-NEXT:    v_writelane_b32 v39, s57, 18
-; FLATSCR-NEXT:    v_writelane_b32 v39, s58, 19
-; FLATSCR-NEXT:    v_writelane_b32 v39, s59, 20
-; FLATSCR-NEXT:    v_writelane_b32 v39, s60, 21
-; FLATSCR-NEXT:    v_writelane_b32 v39, s61, 22
-; FLATSCR-NEXT:    v_writelane_b32 v39, s62, 23
-; FLATSCR-NEXT:    v_writelane_b32 v39, s63, 24
-; FLATSCR-NEXT:    v_writelane_b32 v39, s64, 25
-; FLATSCR-NEXT:    v_writelane_b32 v39, s65, 26
-; FLATSCR-NEXT:    v_writelane_b32 v39, s66, 27
-; FLATSCR-NEXT:    v_writelane_b32 v39, s67, 28
-; FLATSCR-NEXT:    v_writelane_b32 v39, s68, 29
-; FLATSCR-NEXT:    v_writelane_b32 v39, s69, 30
-; FLATSCR-NEXT:    v_writelane_b32 v39, s70, 31
-; FLATSCR-NEXT:    v_writelane_b32 v39, s71, 32
-; FLATSCR-NEXT:    v_writelane_b32 v39, s72, 33
-; FLATSCR-NEXT:    v_writelane_b32 v39, s73, 34
-; FLATSCR-NEXT:    v_writelane_b32 v39, s74, 35
-; FLATSCR-NEXT:    v_writelane_b32 v39, s75, 36
-; FLATSCR-NEXT:    v_writelane_b32 v39, s76, 37
-; FLATSCR-NEXT:    v_writelane_b32 v39, s77, 38
-; FLATSCR-NEXT:    v_writelane_b32 v39, s78, 39
-; FLATSCR-NEXT:    v_writelane_b32 v39, s79, 40
-; FLATSCR-NEXT:    v_writelane_b32 v39, s80, 41
-; FLATSCR-NEXT:    v_writelane_b32 v39, s81, 42
-; FLATSCR-NEXT:    v_writelane_b32 v39, s82, 43
-; FLATSCR-NEXT:    v_writelane_b32 v39, s83, 44
-; FLATSCR-NEXT:    v_writelane_b32 v39, s84, 45
-; FLATSCR-NEXT:    v_writelane_b32 v39, s85, 46
-; FLATSCR-NEXT:    v_writelane_b32 v39, s86, 47
-; FLATSCR-NEXT:    v_writelane_b32 v39, s87, 48
-; FLATSCR-NEXT:    v_writelane_b32 v39, s88, 49
-; FLATSCR-NEXT:    v_writelane_b32 v39, s89, 50
-; FLATSCR-NEXT:    v_writelane_b32 v39, s90, 51
-; FLATSCR-NEXT:    v_writelane_b32 v39, s91, 52
-; FLATSCR-NEXT:    v_writelane_b32 v39, s92, 53
-; FLATSCR-NEXT:    v_writelane_b32 v39, s93, 54
-; FLATSCR-NEXT:    v_writelane_b32 v39, s94, 55
-; FLATSCR-NEXT:    v_writelane_b32 v39, s95, 56
-; FLATSCR-NEXT:    v_writelane_b32 v39, s96, 57
-; FLATSCR-NEXT:    v_writelane_b32 v39, s97, 58
-; FLATSCR-NEXT:    v_writelane_b32 v39, s98, 59
-; FLATSCR-NEXT:    v_writelane_b32 v39, s99, 60
+; FLATSCR-NEXT:    v_writelane_b32 v39, s48, 1
+; FLATSCR-NEXT:    v_writelane_b32 v39, s49, 2
+; FLATSCR-NEXT:    v_writelane_b32 v39, s50, 3
+; FLATSCR-NEXT:    v_writelane_b32 v39, s51, 4
+; FLATSCR-NEXT:    v_writelane_b32 v39, s52, 5
+; FLATSCR-NEXT:    v_writelane_b32 v39, s53, 6
+; FLATSCR-NEXT:    v_writelane_b32 v39, s54, 7
+; FLATSCR-NEXT:    v_writelane_b32 v39, s55, 8
+; FLATSCR-NEXT:    v_writelane_b32 v39, s64, 9
+; FLATSCR-NEXT:    v_writelane_b32 v39, s65, 10
+; FLATSCR-NEXT:    v_writelane_b32 v39, s66, 11
+; FLATSCR-NEXT:    v_writelane_b32 v39, s67, 12
+; FLATSCR-NEXT:    v_writelane_b32 v39, s68, 13
+; FLATSCR-NEXT:    v_writelane_b32 v39, s69, 14
+; FLATSCR-NEXT:    v_writelane_b32 v39, s70, 15
+; FLATSCR-NEXT:    v_writelane_b32 v39, s71, 16
+; FLATSCR-NEXT:    v_writelane_b32 v39, s80, 17
+; FLATSCR-NEXT:    v_writelane_b32 v39, s81, 18
+; FLATSCR-NEXT:    v_writelane_b32 v39, s82, 19
+; FLATSCR-NEXT:    v_writelane_b32 v39, s83, 20
+; FLATSCR-NEXT:    v_writelane_b32 v39, s84, 21
+; FLATSCR-NEXT:    v_writelane_b32 v39, s85, 22
+; FLATSCR-NEXT:    v_writelane_b32 v39, s86, 23
+; FLATSCR-NEXT:    v_writelane_b32 v39, s87, 24
+; FLATSCR-NEXT:    v_writelane_b32 v39, s96, 25
+; FLATSCR-NEXT:    v_writelane_b32 v39, s97, 26
+; FLATSCR-NEXT:    v_writelane_b32 v39, s98, 27
+; FLATSCR-NEXT:    v_writelane_b32 v39, s99, 28
 ; FLATSCR-NEXT:    s_addk_i32 s32, 0x100c
-; FLATSCR-NEXT:    v_writelane_b32 v39, s100, 61
-; FLATSCR-NEXT:    v_writelane_b32 v39, s101, 62
+; FLATSCR-NEXT:    v_writelane_b32 v39, s100, 29
+; FLATSCR-NEXT:    v_writelane_b32 v39, s101, 30
 ; FLATSCR-NEXT:    v_mov_b32_e32 v0, 0
 ; FLATSCR-NEXT:    s_add_i32 s1, s33, 0x1000
-; FLATSCR-NEXT:    v_writelane_b32 v39, s102, 63
+; FLATSCR-NEXT:    v_writelane_b32 v39, s102, 31
 ; FLATSCR-NEXT:    scratch_store_dword off, v0, s1
 ; FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; FLATSCR-NEXT:    ;;#ASMSTART
@@ -2606,69 +1855,37 @@ define void @spill_fp_to_memory_scratch_reg_needed_mubuf_offset(ptr addrspace(5)
 ; FLATSCR-NEXT:    ;;#ASMSTART
 ; FLATSCR-NEXT:    ; clobber all VGPRs except CSR v40
 ; FLATSCR-NEXT:    ;;#ASMEND
-; FLATSCR-NEXT:    v_readlane_b32 s102, v39, 63
-; FLATSCR-NEXT:    v_readlane_b32 s101, v39, 62
-; FLATSCR-NEXT:    v_readlane_b32 s100, v39, 61
-; FLATSCR-NEXT:    v_readlane_b32 s99, v39, 60
-; FLATSCR-NEXT:    v_readlane_b32 s98, v39, 59
-; FLATSCR-NEXT:    v_readlane_b32 s97, v39, 58
-; FLATSCR-NEXT:    v_readlane_b32 s96, v39, 57
-; FLATSCR-NEXT:    v_readlane_b32 s95, v39, 56
-; FLATSCR-NEXT:    v_readlane_b32 s94, v39, 55
-; FLATSCR-NEXT:    v_readlane_b32 s93, v39, 54
-; FLATSCR-NEXT:    v_readlane_b32 s92, v39, 53
-; FLATSCR-NEXT:    v_readlane_b32 s91, v39, 52
-; FLATSCR-NEXT:    v_readlane_b32 s90, v39, 51
-; FLATSCR-NEXT:    v_readlane_b32 s89, v39, 50
-; FLATSCR-NEXT:    v_readlane_b32 s88, v39, 49
-; FLATSCR-NEXT:    v_readlane_b32 s87, v39, 48
-; FLATSCR-NEXT:    v_readlane_b32 s86, v39, 47
-; FLATSCR-NEXT:    v_readlane_b32 s85, v39, 46
-; FLATSCR-NEXT:    v_readlane_b32 s84, v39, 45
-; FLATSCR-NEXT:    v_readlane_b32 s83, v39, 44
-; FLATSCR-NEXT:    v_readlane_b32 s82, v39, 43
-; FLATSCR-NEXT:    v_readlane_b32 s81, v39, 42
-; FLATSCR-NEXT:    v_readlane_b32 s80, v39, 41
-; FLATSCR-NEXT:    v_readlane_b32 s79, v39, 40
-; FLATSCR-NEXT:    v_readlane_b32 s78, v39, 39
-; FLATSCR-NEXT:    v_readlane_b32 s77, v39, 38
-; FLATSCR-NEXT:    v_readlane_b32 s76, v39, 37
-; FLATSCR-NEXT:    v_readlane_b32 s75, v39, 36
-; FLATSCR-NEXT:    v_readlane_b32 s74, v39, 35
-; FLATSCR-NEXT:    v_readlane_b32 s73, v39, 34
-; FLATSCR-NEXT:    v_readlane_b32 s72, v39, 33
-; FLATSCR-NEXT:    v_readlane_b32 s71, v39, 32
-; FLATSCR-NEXT:    v_readlane_b32 s70, v39, 31
-; FLATSCR-NEXT:    v_readlane_b32 s69, v39, 30
-; FLATSCR-NEXT:    v_readlane_b32 s68, v39, 29
-; FLATSCR-NEXT:    v_readlane_b32 s67, v39, 28
-; FLATSCR-NEXT:    v_readlane_b32 s66, v39, 27
-; FLATSCR-NEXT:    v_readlane_b32 s65, v39, 26
-; FLATSCR-NEXT:    v_readlane_b32 s64, v39, 25
-; FLATSCR-NEXT:    v_readlane_b32 s63, v39, 24
-; FLATSCR-NEXT:    v_readlane_b32 s62, v39, 23
-; FLATSCR-NEXT:    v_readlane_b32 s61, v39, 22
-; FLATSCR-NEXT:    v_readlane_b32 s60, v39, 21
-; FLATSCR-NEXT:    v_readlane_b32 s59, v39, 20
-; FLATSCR-NEXT:    v_readlane_b32 s58, v39, 19
-; FLATSCR-NEXT:    v_readlane_b32 s57, v39, 18
-; FLATSCR-NEXT:    v_readlane_b32 s56, v39, 17
-; FLATSCR-NEXT:    v_readlane_b32 s55, v39, 16
-; FLATSCR-NEXT:    v_readlane_b32 s54, v39, 15
-; FLATSCR-NEXT:    v_readlane_b32 s53, v39, 14
-; FLATSCR-NEXT:    v_readlane_b32 s52, v39, 13
-; FLATSCR-NEXT:    v_readlane_b32 s51, v39, 12
-; FLATSCR-NEXT:    v_readlane_b32 s50, v39, 11
-; FLATSCR-NEXT:    v_readlane_b32 s49, v39, 10
-; FLATSCR-NEXT:    v_readlane_b32 s48, v39, 9
-; FLATSCR-NEXT:    v_readlane_b32 s47, v39, 8
-; FLATSCR-NEXT:    v_readlane_b32 s46, v39, 7
-; FLATSCR-NEXT:    v_readlane_b32 s45, v39, 6
-; FLATSCR-NEXT:    v_readlane_b32 s44, v39, 5
-; FLATSCR-NEXT:    v_readlane_b32 s43, v39, 4
-; FLATSCR-NEXT:    v_readlane_b32 s42, v39, 3
-; FLATSCR-NEXT:    v_readlane_b32 s41, v39, 2
-; FLATSCR-NEXT:    v_readlane_b32 s40, v39, 1
+; FLATSCR-NEXT:    v_readlane_b32 s102, v39, 31
+; FLATSCR-NEXT:    v_readlane_b32 s101, v39, 30
+; FLATSCR-NEXT:    v_readlane_b32 s100, v39, 29
+; FLATSCR-NEXT:    v_readlane_b32 s99, v39, 28
+; FLATSCR-NEXT:    v_readlane_b32 s98, v39, 27
+; FLATSCR-NEXT:    v_readlane_b32 s97, v39, 26
+; FLATSCR-NEXT:    v_readlane_b32 s96, v39, 25
+; FLATSCR-NEXT:    v_readlane_b32 s87, v39, 24
+; FLATSCR-NEXT:    v_readlane_b32 s86, v39, 23
+; FLATSCR-NEXT:    v_readlane_b32 s85, v39, 22
+; FLATSCR-NEXT:    v_readlane_b32 s84, v39, 21
+; FLATSCR-NEXT:    v_readlane_b32 s83, v39, 20
+; FLATSCR-NEXT:    v_readlane_b32 s82, v39, 19
+; FLATSCR-NEXT:    v_readlane_b32 s81, v39, 18
+; FLATSCR-NEXT:    v_readlane_b32 s80, v39, 17
+; FLATSCR-NEXT:    v_readlane_b32 s71, v39, 16
+; FLATSCR-NEXT:    v_readlane_b32 s70, v39, 15
+; FLATSCR-NEXT:    v_readlane_b32 s69, v39, 14
+; FLATSCR-NEXT:    v_readlane_b32 s68, v39, 13
+; FLATSCR-NEXT:    v_readlane_b32 s67, v39, 12
+; FLATSCR-NEXT:    v_readlane_b32 s66, v39, 11
+; FLATSCR-NEXT:    v_readlane_b32 s65, v39, 10
+; FLATSCR-NEXT:    v_readlane_b32 s64, v39, 9
+; FLATSCR-NEXT:    v_readlane_b32 s55, v39, 8
+; FLATSCR-NEXT:    v_readlane_b32 s54, v39, 7
+; FLATSCR-NEXT:    v_readlane_b32 s53, v39, 6
+; FLATSCR-NEXT:    v_readlane_b32 s52, v39, 5
+; FLATSCR-NEXT:    v_readlane_b32 s51, v39, 4
+; FLATSCR-NEXT:    v_readlane_b32 s50, v39, 3
+; FLATSCR-NEXT:    v_readlane_b32 s49, v39, 2
+; FLATSCR-NEXT:    v_readlane_b32 s48, v39, 1
 ; FLATSCR-NEXT:    v_readlane_b32 s39, v39, 0
 ; FLATSCR-NEXT:    s_mov_b32 s32, s33
 ; FLATSCR-NEXT:    s_xor_saveexec_b64 s[2:3], -1

diff  --git a/llvm/test/CodeGen/AMDGPU/csr-sgpr-spill-live-ins.mir b/llvm/test/CodeGen/AMDGPU/csr-sgpr-spill-live-ins.mir
index a14d515688a8b..6504f48333485 100644
--- a/llvm/test/CodeGen/AMDGPU/csr-sgpr-spill-live-ins.mir
+++ b/llvm/test/CodeGen/AMDGPU/csr-sgpr-spill-live-ins.mir
@@ -14,15 +14,7 @@ body: |
   ; CHECK-LABEL: name: def_csr_sgpr
   ; CHECK: bb.0:
   ; CHECK-NEXT:   successors: %bb.1(0x80000000)
-  ; CHECK-NEXT:   liveins: $sgpr42, $sgpr43, $sgpr46, $sgpr47
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   $sgpr4_sgpr5 = S_XOR_SAVEEXEC_B64 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec
-  ; CHECK-NEXT:   BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, implicit $exec :: (store (s32) into %stack.4, addrspace 5)
-  ; CHECK-NEXT:   $exec = S_MOV_B64 killed $sgpr4_sgpr5
-  ; CHECK-NEXT:   $vgpr0 = SI_SPILL_S32_TO_VGPR $sgpr42, 0, $vgpr0
-  ; CHECK-NEXT:   $vgpr0 = SI_SPILL_S32_TO_VGPR $sgpr43, 1, $vgpr0
-  ; CHECK-NEXT:   $vgpr0 = SI_SPILL_S32_TO_VGPR $sgpr46, 2, $vgpr0
-  ; CHECK-NEXT:   $vgpr0 = SI_SPILL_S32_TO_VGPR $sgpr47, 3, $vgpr0
   ; CHECK-NEXT:   S_NOP 0
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.1:

diff  --git a/llvm/test/CodeGen/AMDGPU/ds_read2.ll b/llvm/test/CodeGen/AMDGPU/ds_read2.ll
index 4d9c85ef99dcd..9b91a3dc9b6e4 100644
--- a/llvm/test/CodeGen/AMDGPU/ds_read2.ll
+++ b/llvm/test/CodeGen/AMDGPU/ds_read2.ll
@@ -1321,19 +1321,19 @@ bb:
 define amdgpu_kernel void @ds_read_call_read(ptr addrspace(1) %out, ptr addrspace(3) %arg) {
 ; CI-LABEL: ds_read_call_read:
 ; CI:       ; %bb.0:
-; CI-NEXT:    s_getpc_b64 s[40:41]
-; CI-NEXT:    s_mov_b32 s40, s0
-; CI-NEXT:    s_load_dwordx4 s[40:43], s[40:41], 0x0
+; CI-NEXT:    s_getpc_b64 s[48:49]
+; CI-NEXT:    s_mov_b32 s48, s0
+; CI-NEXT:    s_load_dwordx4 s[48:51], s[48:49], 0x0
 ; CI-NEXT:    s_mov_b32 s14, s10
 ; CI-NEXT:    v_lshlrev_b32_e32 v3, 2, v0
 ; CI-NEXT:    s_mov_b32 m0, -1
 ; CI-NEXT:    s_mov_b32 s12, s8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
-; CI-NEXT:    s_add_u32 s40, s40, s11
+; CI-NEXT:    s_add_u32 s48, s48, s11
 ; CI-NEXT:    s_mov_b64 s[10:11], s[6:7]
 ; CI-NEXT:    s_load_dwordx2 s[36:37], s[4:5], 0x0
 ; CI-NEXT:    s_load_dword s6, s[4:5], 0x2
-; CI-NEXT:    s_addc_u32 s41, s41, 0
+; CI-NEXT:    s_addc_u32 s49, s49, 0
 ; CI-NEXT:    s_add_u32 s8, s4, 12
 ; CI-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
 ; CI-NEXT:    s_mov_b32 s13, s9
@@ -1345,11 +1345,11 @@ define amdgpu_kernel void @ds_read_call_read(ptr addrspace(1) %out, ptr addrspac
 ; CI-NEXT:    v_or_b32_e32 v0, v0, v1
 ; CI-NEXT:    s_mov_b64 s[4:5], s[0:1]
 ; CI-NEXT:    s_mov_b64 s[6:7], s[2:3]
-; CI-NEXT:    s_mov_b64 s[0:1], s[40:41]
+; CI-NEXT:    s_mov_b64 s[0:1], s[48:49]
 ; CI-NEXT:    s_mov_b32 s17, void_func_void at abs32@hi
 ; CI-NEXT:    s_mov_b32 s16, void_func_void at abs32@lo
 ; CI-NEXT:    v_or_b32_e32 v31, v0, v2
-; CI-NEXT:    s_mov_b64 s[2:3], s[42:43]
+; CI-NEXT:    s_mov_b64 s[2:3], s[50:51]
 ; CI-NEXT:    s_mov_b32 s32, 0
 ; CI-NEXT:    s_mov_b32 s39, 0xf000
 ; CI-NEXT:    s_mov_b32 s38, -1

diff  --git a/llvm/test/CodeGen/AMDGPU/dwarf-multi-register-use-crash.ll b/llvm/test/CodeGen/AMDGPU/dwarf-multi-register-use-crash.ll
index 8b02bdbb70b7b..40cdfd76d6af6 100644
--- a/llvm/test/CodeGen/AMDGPU/dwarf-multi-register-use-crash.ll
+++ b/llvm/test/CodeGen/AMDGPU/dwarf-multi-register-use-crash.ll
@@ -30,59 +30,59 @@ define weak_odr void @test(i32 %0) !dbg !34 {
 ; CHECK-NEXT:    v_writelane_b32 v41, s37, 5
 ; CHECK-NEXT:    v_writelane_b32 v41, s38, 6
 ; CHECK-NEXT:    v_writelane_b32 v41, s39, 7
-; CHECK-NEXT:    v_writelane_b32 v41, s40, 8
-; CHECK-NEXT:    v_writelane_b32 v41, s41, 9
-; CHECK-NEXT:    v_writelane_b32 v41, s42, 10
-; CHECK-NEXT:    v_writelane_b32 v41, s43, 11
-; CHECK-NEXT:    v_writelane_b32 v41, s44, 12
+; CHECK-NEXT:    v_writelane_b32 v41, s48, 8
+; CHECK-NEXT:    v_writelane_b32 v41, s49, 9
+; CHECK-NEXT:    v_writelane_b32 v41, s50, 10
+; CHECK-NEXT:    v_writelane_b32 v41, s51, 11
+; CHECK-NEXT:    v_writelane_b32 v41, s52, 12
 ; CHECK-NEXT:    s_addk_i32 s32, 0x400
-; CHECK-NEXT:    v_writelane_b32 v41, s45, 13
-; CHECK-NEXT:    v_writelane_b32 v41, s46, 14
-; CHECK-NEXT:    s_mov_b64 s[40:41], s[4:5]
+; CHECK-NEXT:    v_writelane_b32 v41, s53, 13
+; CHECK-NEXT:    v_writelane_b32 v41, s54, 14
+; CHECK-NEXT:    s_mov_b64 s[48:49], s[4:5]
 ; CHECK-NEXT:    ;DEBUG_VALUE: dummy:dummy <- undef
 ; CHECK-NEXT:  .Ltmp0:
 ; CHECK-NEXT:    .loc 1 49 9 prologue_end ; dummy:49:9
 ; CHECK-NEXT:    s_getpc_b64 s[4:5]
 ; CHECK-NEXT:    s_add_u32 s4, s4, __kmpc_alloc_shared at gotpcrel32@lo+4
 ; CHECK-NEXT:    s_addc_u32 s5, s5, __kmpc_alloc_shared at gotpcrel32@hi+12
-; CHECK-NEXT:    v_writelane_b32 v41, s47, 15
-; CHECK-NEXT:    s_load_dwordx2 s[46:47], s[4:5], 0x0
-; CHECK-NEXT:    s_mov_b64 s[4:5], s[40:41]
+; CHECK-NEXT:    v_writelane_b32 v41, s55, 15
+; CHECK-NEXT:    s_load_dwordx2 s[54:55], s[4:5], 0x0
+; CHECK-NEXT:    s_mov_b64 s[4:5], s[48:49]
 ; CHECK-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
 ; CHECK-NEXT:    v_mov_b32_e32 v40, v31
-; CHECK-NEXT:    s_mov_b32 s42, s15
-; CHECK-NEXT:    s_mov_b32 s43, s14
-; CHECK-NEXT:    s_mov_b32 s44, s13
-; CHECK-NEXT:    s_mov_b32 s45, s12
+; CHECK-NEXT:    s_mov_b32 s50, s15
+; CHECK-NEXT:    s_mov_b32 s51, s14
+; CHECK-NEXT:    s_mov_b32 s52, s13
+; CHECK-NEXT:    s_mov_b32 s53, s12
 ; CHECK-NEXT:    s_mov_b64 s[34:35], s[10:11]
 ; CHECK-NEXT:    s_mov_b64 s[36:37], s[8:9]
 ; CHECK-NEXT:    s_mov_b64 s[38:39], s[6:7]
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
-; CHECK-NEXT:    s_swappc_b64 s[30:31], s[46:47]
-; CHECK-NEXT:    s_mov_b64 s[4:5], s[40:41]
+; CHECK-NEXT:    s_swappc_b64 s[30:31], s[54:55]
+; CHECK-NEXT:    s_mov_b64 s[4:5], s[48:49]
 ; CHECK-NEXT:    s_mov_b64 s[6:7], s[38:39]
 ; CHECK-NEXT:    s_mov_b64 s[8:9], s[36:37]
 ; CHECK-NEXT:    s_mov_b64 s[10:11], s[34:35]
-; CHECK-NEXT:    s_mov_b32 s12, s45
-; CHECK-NEXT:    s_mov_b32 s13, s44
-; CHECK-NEXT:    s_mov_b32 s14, s43
-; CHECK-NEXT:    s_mov_b32 s15, s42
+; CHECK-NEXT:    s_mov_b32 s12, s53
+; CHECK-NEXT:    s_mov_b32 s13, s52
+; CHECK-NEXT:    s_mov_b32 s14, s51
+; CHECK-NEXT:    s_mov_b32 s15, s50
 ; CHECK-NEXT:    v_mov_b32_e32 v31, v40
-; CHECK-NEXT:    s_swappc_b64 s[30:31], s[46:47]
+; CHECK-NEXT:    s_swappc_b64 s[30:31], s[54:55]
 ; CHECK-NEXT:  .Ltmp1:
 ; CHECK-NEXT:    ;DEBUG_VALUE: dummy:dummy <- [$vgpr0_vgpr1+0]
 ; CHECK-NEXT:    .loc 1 0 9 is_stmt 0 ; dummy:0:9
 ; CHECK-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; CHECK-NEXT:    v_mov_b32_e32 v2, 0
 ; CHECK-NEXT:    flat_store_dword v[0:1], v2
-; CHECK-NEXT:    v_readlane_b32 s47, v41, 15
-; CHECK-NEXT:    v_readlane_b32 s46, v41, 14
-; CHECK-NEXT:    v_readlane_b32 s45, v41, 13
-; CHECK-NEXT:    v_readlane_b32 s44, v41, 12
-; CHECK-NEXT:    v_readlane_b32 s43, v41, 11
-; CHECK-NEXT:    v_readlane_b32 s42, v41, 10
-; CHECK-NEXT:    v_readlane_b32 s41, v41, 9
-; CHECK-NEXT:    v_readlane_b32 s40, v41, 8
+; CHECK-NEXT:    v_readlane_b32 s55, v41, 15
+; CHECK-NEXT:    v_readlane_b32 s54, v41, 14
+; CHECK-NEXT:    v_readlane_b32 s53, v41, 13
+; CHECK-NEXT:    v_readlane_b32 s52, v41, 12
+; CHECK-NEXT:    v_readlane_b32 s51, v41, 11
+; CHECK-NEXT:    v_readlane_b32 s50, v41, 10
+; CHECK-NEXT:    v_readlane_b32 s49, v41, 9
+; CHECK-NEXT:    v_readlane_b32 s48, v41, 8
 ; CHECK-NEXT:    v_readlane_b32 s39, v41, 7
 ; CHECK-NEXT:    v_readlane_b32 s38, v41, 6
 ; CHECK-NEXT:    v_readlane_b32 s37, v41, 5

diff  --git a/llvm/test/CodeGen/AMDGPU/eliminate-frame-index-s-mov-b32.mir b/llvm/test/CodeGen/AMDGPU/eliminate-frame-index-s-mov-b32.mir
index 0714def30053d..7f370b2cca658 100644
--- a/llvm/test/CodeGen/AMDGPU/eliminate-frame-index-s-mov-b32.mir
+++ b/llvm/test/CodeGen/AMDGPU/eliminate-frame-index-s-mov-b32.mir
@@ -142,9 +142,8 @@ body:             |
     ; GFX1100-NEXT: renamable $sgpr20 = S_MOV_B32 killed $sgpr22
     ; GFX1100-NEXT: undef renamable $sgpr22 = COPY killed undef renamable $sgpr22, implicit-def $sgpr22_sgpr23
     ; GFX1100-NEXT: undef renamable $sgpr26 = COPY killed undef renamable $sgpr26, implicit-def $sgpr26_sgpr27
-    ; GFX1100-NEXT: $sgpr32 = S_ADD_I32 $sgpr32, 8, implicit-def $scc
-    ; GFX1100-NEXT: renamable $sgpr31 = S_MOV_B32 $sgpr32
-    ; GFX1100-NEXT: $sgpr32 = S_ADD_I32 $sgpr32, -8, implicit-def $scc
+    ; GFX1100-NEXT: $sgpr40 = S_ADD_I32 $sgpr32, 8, implicit-def $scc
+    ; GFX1100-NEXT: renamable $sgpr31 = S_MOV_B32 killed $sgpr40
     ; GFX1100-NEXT: renamable $vgpr3 = COPY killed renamable $sgpr30, implicit $exec
     ; GFX1100-NEXT: renamable $vgpr0_vgpr1 = COPY renamable $sgpr28_sgpr29, implicit $exec
     ; GFX1100-NEXT: renamable $vgpr0_vgpr1 = COPY killed renamable $vcc, implicit $exec
@@ -168,9 +167,8 @@ body:             |
     ; GFX1200-NEXT: renamable $sgpr20 = S_MOV_B32 killed $sgpr22
     ; GFX1200-NEXT: undef renamable $sgpr22 = COPY killed undef renamable $sgpr22, implicit-def $sgpr22_sgpr23
     ; GFX1200-NEXT: undef renamable $sgpr26 = COPY killed undef renamable $sgpr26, implicit-def $sgpr26_sgpr27
-    ; GFX1200-NEXT: $sgpr32 = S_ADD_I32 $sgpr32, 8, implicit-def $scc
-    ; GFX1200-NEXT: renamable $sgpr31 = S_MOV_B32 $sgpr32
-    ; GFX1200-NEXT: $sgpr32 = S_ADD_I32 $sgpr32, -8, implicit-def $scc
+    ; GFX1200-NEXT: $sgpr40 = S_ADD_I32 $sgpr32, 8, implicit-def $scc
+    ; GFX1200-NEXT: renamable $sgpr31 = S_MOV_B32 killed $sgpr40
     ; GFX1200-NEXT: renamable $vgpr3 = COPY killed renamable $sgpr30, implicit $exec
     ; GFX1200-NEXT: renamable $vgpr0_vgpr1 = COPY renamable $sgpr28_sgpr29, implicit $exec
     ; GFX1200-NEXT: renamable $vgpr0_vgpr1 = COPY killed renamable $vcc, implicit $exec
@@ -706,8 +704,9 @@ body:   |
     ; GFX8-NEXT: S_NOP 0, implicit-def $sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63
     ; GFX8-NEXT: V_CMP_EQ_U32_e32 0, killed $vgpr0, implicit-def $vcc, implicit $exec
     ; GFX8-NEXT: S_CMP_EQ_I32 $sgpr4, $sgpr5, implicit-def $scc
-    ; GFX8-NEXT: $vgpr0 = V_MAD_U32_U24_e64 24, 64, $sgpr32, 0, implicit $exec
-    ; GFX8-NEXT: $vgpr0 = V_LSHRREV_B32_e64 6, $vgpr0, implicit $exec
+    ; GFX8-NEXT: $vgpr0 = V_LSHRREV_B32_e64 6, $sgpr32, implicit $exec
+    ; GFX8-NEXT: $sgpr4 = S_MOV_B32 24
+    ; GFX8-NEXT: $vgpr0, dead $sgpr72_sgpr73 = V_ADD_CO_U32_e64 killed $sgpr4, killed $vgpr0, 0, implicit $exec
     ; GFX8-NEXT: $sgpr4 = V_READFIRSTLANE_B32 $vgpr0, implicit $exec
     ; GFX8-NEXT: S_NOP 0, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7
     ; GFX8-NEXT: S_NOP 0, implicit $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15
@@ -810,10 +809,10 @@ body:   |
     ; GFX1100-NEXT: S_NOP 0, implicit-def $sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63
     ; GFX1100-NEXT: V_CMP_EQ_U32_e32 0, killed $vgpr0, implicit-def $vcc_lo, implicit $exec
     ; GFX1100-NEXT: S_CMP_EQ_I32 $sgpr4, $sgpr5, implicit-def $scc
-    ; GFX1100-NEXT: $vcc_hi = S_ADDC_U32 $sgpr32, 24, implicit-def $scc, implicit $scc
-    ; GFX1100-NEXT: S_BITCMP1_B32 $vcc_hi, 0, implicit-def $scc
-    ; GFX1100-NEXT: $vcc_hi = S_BITSET0_B32 0, $vcc_hi
-    ; GFX1100-NEXT: renamable $sgpr4 = S_MOV_B32 killed $vcc_hi
+    ; GFX1100-NEXT: $sgpr72 = S_ADDC_U32 $sgpr32, 24, implicit-def $scc, implicit $scc
+    ; GFX1100-NEXT: S_BITCMP1_B32 $sgpr72, 0, implicit-def $scc
+    ; GFX1100-NEXT: $sgpr72 = S_BITSET0_B32 0, $sgpr72
+    ; GFX1100-NEXT: renamable $sgpr4 = S_MOV_B32 killed $sgpr72
     ; GFX1100-NEXT: S_NOP 0, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7
     ; GFX1100-NEXT: S_NOP 0, implicit $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15
     ; GFX1100-NEXT: S_NOP 0, implicit $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23
@@ -837,10 +836,10 @@ body:   |
     ; GFX1200-NEXT: S_NOP 0, implicit-def $sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63
     ; GFX1200-NEXT: V_CMP_EQ_U32_e32 0, killed $vgpr0, implicit-def $vcc_lo, implicit $exec
     ; GFX1200-NEXT: S_CMP_EQ_I32 $sgpr4, $sgpr5, implicit-def $scc
-    ; GFX1200-NEXT: $vcc_hi = S_ADDC_U32 $sgpr32, 24, implicit-def $scc, implicit $scc
-    ; GFX1200-NEXT: S_BITCMP1_B32 $vcc_hi, 0, implicit-def $scc
-    ; GFX1200-NEXT: $vcc_hi = S_BITSET0_B32 0, $vcc_hi
-    ; GFX1200-NEXT: renamable $sgpr4 = S_MOV_B32 killed $vcc_hi
+    ; GFX1200-NEXT: $sgpr72 = S_ADDC_U32 $sgpr32, 24, implicit-def $scc, implicit $scc
+    ; GFX1200-NEXT: S_BITCMP1_B32 $sgpr72, 0, implicit-def $scc
+    ; GFX1200-NEXT: $sgpr72 = S_BITSET0_B32 0, $sgpr72
+    ; GFX1200-NEXT: renamable $sgpr4 = S_MOV_B32 killed $sgpr72
     ; GFX1200-NEXT: S_NOP 0, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7
     ; GFX1200-NEXT: S_NOP 0, implicit $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15
     ; GFX1200-NEXT: S_NOP 0, implicit $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23
@@ -900,9 +899,9 @@ body:             |
     ; GFX8-NEXT: S_NOP 0, implicit-def $sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63
     ; GFX8-NEXT: V_CMP_EQ_U32_e32 0, killed $vgpr0, implicit-def $vcc, implicit $exec
     ; GFX8-NEXT: S_CMP_EQ_I32 $sgpr4, $sgpr5, implicit-def $scc
-    ; GFX8-NEXT: $vgpr0 = V_MOV_B32_e32 68, implicit $exec
-    ; GFX8-NEXT: $vgpr0 = V_MAD_U32_U24_e64 killed $vgpr0, 64, $sgpr32, 0, implicit $exec
-    ; GFX8-NEXT: $vgpr0 = V_LSHRREV_B32_e64 6, $vgpr0, implicit $exec
+    ; GFX8-NEXT: $vgpr0 = V_LSHRREV_B32_e64 6, $sgpr32, implicit $exec
+    ; GFX8-NEXT: $sgpr4 = S_MOV_B32 68
+    ; GFX8-NEXT: $vgpr0, dead $sgpr72_sgpr73 = V_ADD_CO_U32_e64 killed $sgpr4, killed $vgpr0, 0, implicit $exec
     ; GFX8-NEXT: $sgpr4 = V_READFIRSTLANE_B32 $vgpr0, implicit $exec
     ; GFX8-NEXT: S_NOP 0, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7
     ; GFX8-NEXT: S_NOP 0, implicit $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15
@@ -1005,10 +1004,10 @@ body:             |
     ; GFX1100-NEXT: S_NOP 0, implicit-def $sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63
     ; GFX1100-NEXT: V_CMP_EQ_U32_e32 0, killed $vgpr0, implicit-def $vcc_lo, implicit $exec
     ; GFX1100-NEXT: S_CMP_EQ_I32 $sgpr4, $sgpr5, implicit-def $scc
-    ; GFX1100-NEXT: $vcc_hi = S_ADDC_U32 $sgpr32, 68, implicit-def $scc, implicit $scc
-    ; GFX1100-NEXT: S_BITCMP1_B32 $vcc_hi, 0, implicit-def $scc
-    ; GFX1100-NEXT: $vcc_hi = S_BITSET0_B32 0, $vcc_hi
-    ; GFX1100-NEXT: renamable $sgpr4 = S_MOV_B32 killed $vcc_hi
+    ; GFX1100-NEXT: $sgpr72 = S_ADDC_U32 $sgpr32, 68, implicit-def $scc, implicit $scc
+    ; GFX1100-NEXT: S_BITCMP1_B32 $sgpr72, 0, implicit-def $scc
+    ; GFX1100-NEXT: $sgpr72 = S_BITSET0_B32 0, $sgpr72
+    ; GFX1100-NEXT: renamable $sgpr4 = S_MOV_B32 killed $sgpr72
     ; GFX1100-NEXT: S_NOP 0, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7
     ; GFX1100-NEXT: S_NOP 0, implicit $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15
     ; GFX1100-NEXT: S_NOP 0, implicit $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23
@@ -1032,10 +1031,10 @@ body:             |
     ; GFX1200-NEXT: S_NOP 0, implicit-def $sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63
     ; GFX1200-NEXT: V_CMP_EQ_U32_e32 0, killed $vgpr0, implicit-def $vcc_lo, implicit $exec
     ; GFX1200-NEXT: S_CMP_EQ_I32 $sgpr4, $sgpr5, implicit-def $scc
-    ; GFX1200-NEXT: $vcc_hi = S_ADDC_U32 $sgpr32, 68, implicit-def $scc, implicit $scc
-    ; GFX1200-NEXT: S_BITCMP1_B32 $vcc_hi, 0, implicit-def $scc
-    ; GFX1200-NEXT: $vcc_hi = S_BITSET0_B32 0, $vcc_hi
-    ; GFX1200-NEXT: renamable $sgpr4 = S_MOV_B32 killed $vcc_hi
+    ; GFX1200-NEXT: $sgpr72 = S_ADDC_U32 $sgpr32, 68, implicit-def $scc, implicit $scc
+    ; GFX1200-NEXT: S_BITCMP1_B32 $sgpr72, 0, implicit-def $scc
+    ; GFX1200-NEXT: $sgpr72 = S_BITSET0_B32 0, $sgpr72
+    ; GFX1200-NEXT: renamable $sgpr4 = S_MOV_B32 killed $sgpr72
     ; GFX1200-NEXT: S_NOP 0, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7
     ; GFX1200-NEXT: S_NOP 0, implicit $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15
     ; GFX1200-NEXT: S_NOP 0, implicit $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23

diff  --git a/llvm/test/CodeGen/AMDGPU/function-args-inreg.ll b/llvm/test/CodeGen/AMDGPU/function-args-inreg.ll
index 32f255df82499..dc20ae3765069 100644
--- a/llvm/test/CodeGen/AMDGPU/function-args-inreg.ll
+++ b/llvm/test/CodeGen/AMDGPU/function-args-inreg.ll
@@ -2060,9 +2060,9 @@ define void @void_func_a13i32_inreg([13  x i32] inreg %arg0, ptr addrspace(1) %p
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_mov_b32 s29, s33
 ; GFX9-NEXT:    s_mov_b32 s33, s32
-; GFX9-NEXT:    s_or_saveexec_b64 vcc, -1
+; GFX9-NEXT:    s_or_saveexec_b64 s[40:41], -1
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
-; GFX9-NEXT:    s_mov_b64 exec, vcc
+; GFX9-NEXT:    s_mov_b64 exec, s[40:41]
 ; GFX9-NEXT:    v_mov_b32_e32 v2, s28
 ; GFX9-NEXT:    global_store_dword v[0:1], v2, off offset:48
 ; GFX9-NEXT:    v_mov_b32_e32 v5, s27

diff  --git a/llvm/test/CodeGen/AMDGPU/function-resource-usage.ll b/llvm/test/CodeGen/AMDGPU/function-resource-usage.ll
index 22257d3eba7d6..512d58d3f996d 100644
--- a/llvm/test/CodeGen/AMDGPU/function-resource-usage.ll
+++ b/llvm/test/CodeGen/AMDGPU/function-resource-usage.ll
@@ -202,18 +202,18 @@ define void @indirect_use_50_vgpr() #0 {
 }
 
 ; GCN-LABEL: {{^}}use_80_sgpr:
-; GCN:	.set use_80_sgpr.num_vgpr, 1
+; GCN:	.set use_80_sgpr.num_vgpr, 0
 ; GCN:	.set use_80_sgpr.num_agpr, 0
 ; GCN:	.set use_80_sgpr.numbered_sgpr, 80
-; GCN:	.set use_80_sgpr.private_seg_size, 8
+; GCN:	.set use_80_sgpr.private_seg_size, 0
 ; GCN:	.set use_80_sgpr.uses_vcc, 0
 ; GCN:	.set use_80_sgpr.uses_flat_scratch, 0
 ; GCN:	.set use_80_sgpr.has_dyn_sized_stack, 0
 ; GCN:	.set use_80_sgpr.has_recursion, 0
 ; GCN:	.set use_80_sgpr.has_indirect_call, 0
 ; GCN: TotalNumSgprs: 84
-; GCN: NumVgprs: 1
-; GCN: ScratchSize: 8
+; GCN: NumVgprs: 0
+; GCN: ScratchSize: 0
 define void @use_80_sgpr() #1 {
   call void asm sideeffect "", "~{s79}"() #0
   ret void
@@ -231,7 +231,7 @@ define void @use_80_sgpr() #1 {
 ; GCN:	.set indirect_use_80_sgpr.has_indirect_call, or(0, use_80_sgpr.has_indirect_call)
 ; GCN: TotalNumSgprs: 84
 ; GCN: NumVgprs: 41
-; GCN: ScratchSize: 24
+; GCN: ScratchSize: 16
 define void @indirect_use_80_sgpr() #1 {
   call void @use_80_sgpr()
   ret void
@@ -249,7 +249,7 @@ define void @indirect_use_80_sgpr() #1 {
 ; GCN:	.set indirect_2_level_use_80_sgpr.has_indirect_call, or(0, indirect_use_80_sgpr.has_indirect_call)
 ; GCN: TotalNumSgprs: 86
 ; GCN: NumVgprs: 41
-; GCN: ScratchSize: 24
+; GCN: ScratchSize: 16
 define amdgpu_kernel void @indirect_2_level_use_80_sgpr() #0 {
   call void @indirect_use_80_sgpr()
   ret void
@@ -336,14 +336,14 @@ define amdgpu_kernel void @indirect_2_level_use_stack() #0 {
 ; GCN-LABEL: {{^}}multi_call_use_use_stack:
 ; GCN:	.set multi_call_use_use_stack.num_vgpr, max(41, use_stack0.num_vgpr, use_stack1.num_vgpr)
 ; GCN:	.set multi_call_use_use_stack.num_agpr, max(0, use_stack0.num_agpr, use_stack1.num_agpr)
-; GCN:	.set multi_call_use_use_stack.numbered_sgpr, max(44, use_stack0.numbered_sgpr, use_stack1.numbered_sgpr)
+; GCN:	.set multi_call_use_use_stack.numbered_sgpr, max(52, use_stack0.numbered_sgpr, use_stack1.numbered_sgpr)
 ; GCN:	.set multi_call_use_use_stack.private_seg_size, 0+(max(use_stack0.private_seg_size, use_stack1.private_seg_size))
 ; GCN:	.set multi_call_use_use_stack.uses_vcc, or(1, use_stack0.uses_vcc, use_stack1.uses_vcc)
 ; GCN:	.set multi_call_use_use_stack.uses_flat_scratch, or(1, use_stack0.uses_flat_scratch, use_stack1.uses_flat_scratch)
 ; GCN:	.set multi_call_use_use_stack.has_dyn_sized_stack, or(0, use_stack0.has_dyn_sized_stack, use_stack1.has_dyn_sized_stack)
 ; GCN:	.set multi_call_use_use_stack.has_recursion, or(0, use_stack0.has_recursion, use_stack1.has_recursion)
 ; GCN:	.set multi_call_use_use_stack.has_indirect_call, or(0, use_stack0.has_indirect_call, use_stack1.has_indirect_call)
-; GCN: TotalNumSgprs: 50
+; GCN: TotalNumSgprs: 58
 ; GCN: NumVgprs: 41
 ; GCN: ScratchSize: 2052
 define amdgpu_kernel void @multi_call_use_use_stack() #0 {
@@ -357,7 +357,7 @@ declare void @external() #0
 ; GCN-LABEL: {{^}}multi_call_with_external:
 ; GCN:	.set multi_call_with_external.num_vgpr, max(41, amdgpu.max_num_vgpr)
 ; GCN:	.set multi_call_with_external.num_agpr, max(0, amdgpu.max_num_agpr)
-; GCN:	.set multi_call_with_external.numbered_sgpr, max(44, amdgpu.max_num_sgpr)
+; GCN:	.set multi_call_with_external.numbered_sgpr, max(52, amdgpu.max_num_sgpr)
 ; GCN:	.set multi_call_with_external.private_seg_size, 0+(max(use_stack0.private_seg_size, use_stack1.private_seg_size))
 ; GCN:	.set multi_call_with_external.uses_vcc, 1
 ; GCN:	.set multi_call_with_external.uses_flat_scratch, 1
@@ -377,7 +377,7 @@ define amdgpu_kernel void @multi_call_with_external() #0 {
 ; GCN-LABEL: {{^}}multi_call_with_external_and_duplicates:
 ; GCN:	.set multi_call_with_external_and_duplicates.num_vgpr, max(41, amdgpu.max_num_vgpr)
 ; GCN:	.set multi_call_with_external_and_duplicates.num_agpr, max(0, amdgpu.max_num_agpr)
-; GCN:	.set multi_call_with_external_and_duplicates.numbered_sgpr, max(46, amdgpu.max_num_sgpr)
+; GCN:	.set multi_call_with_external_and_duplicates.numbered_sgpr, max(54, amdgpu.max_num_sgpr)
 ; GCN:	.set multi_call_with_external_and_duplicates.private_seg_size, 0+(max(use_stack0.private_seg_size, use_stack1.private_seg_size))
 ; GCN:	.set multi_call_with_external_and_duplicates.uses_vcc, 1
 ; GCN:	.set multi_call_with_external_and_duplicates.uses_flat_scratch, 1
@@ -594,7 +594,7 @@ define amdgpu_kernel void @usage_multi_stage_recurse_noattrs(i32 %n) #0 {
 ; GCN-LABEL: {{^}}multi_call_with_multi_stage_recurse:
 ; GCN:  .set multi_call_with_multi_stage_recurse.num_vgpr, max(41, use_stack0.num_vgpr, use_stack1.num_vgpr, multi_stage_recurse1.num_vgpr)
 ; GCN:  .set multi_call_with_multi_stage_recurse.num_agpr, max(0, use_stack0.num_agpr, use_stack1.num_agpr, multi_stage_recurse1.num_agpr)
-; GCN:  .set multi_call_with_multi_stage_recurse.numbered_sgpr, max(45, use_stack0.numbered_sgpr, use_stack1.numbered_sgpr, multi_stage_recurse1.numbered_sgpr)
+; GCN:  .set multi_call_with_multi_stage_recurse.numbered_sgpr, max(53, use_stack0.numbered_sgpr, use_stack1.numbered_sgpr, multi_stage_recurse1.numbered_sgpr)
 ; GCN:  .set multi_call_with_multi_stage_recurse.private_seg_size, 0+(max(use_stack0.private_seg_size, use_stack1.private_seg_size, multi_stage_recurse1.private_seg_size))
 ; GCN:  .set multi_call_with_multi_stage_recurse.uses_vcc, or(1, use_stack0.uses_vcc, use_stack1.uses_vcc, multi_stage_recurse1.uses_vcc)
 ; GCN:  .set multi_call_with_multi_stage_recurse.uses_flat_scratch, or(1, use_stack0.uses_flat_scratch, use_stack1.uses_flat_scratch, multi_stage_recurse1.uses_flat_scratch)

diff  --git a/llvm/test/CodeGen/AMDGPU/gfx-call-non-gfx-func.ll b/llvm/test/CodeGen/AMDGPU/gfx-call-non-gfx-func.ll
index 1ad365df2e8a8..8ac187eacf1fe 100644
--- a/llvm/test/CodeGen/AMDGPU/gfx-call-non-gfx-func.ll
+++ b/llvm/test/CodeGen/AMDGPU/gfx-call-non-gfx-func.ll
@@ -40,12 +40,44 @@ define amdgpu_gfx void @gfx_func() {
 ; SDAG-NEXT:    v_writelane_b32 v40, s28, 24
 ; SDAG-NEXT:    v_writelane_b32 v40, s29, 25
 ; SDAG-NEXT:    v_writelane_b32 v40, s30, 26
+; SDAG-NEXT:    v_writelane_b32 v40, s31, 27
+; SDAG-NEXT:    v_writelane_b32 v40, s72, 28
+; SDAG-NEXT:    v_writelane_b32 v40, s73, 29
+; SDAG-NEXT:    v_writelane_b32 v40, s74, 30
+; SDAG-NEXT:    v_writelane_b32 v40, s75, 31
+; SDAG-NEXT:    v_writelane_b32 v40, s76, 32
+; SDAG-NEXT:    v_writelane_b32 v40, s77, 33
+; SDAG-NEXT:    v_writelane_b32 v40, s78, 34
+; SDAG-NEXT:    v_writelane_b32 v40, s79, 35
+; SDAG-NEXT:    v_writelane_b32 v40, s88, 36
+; SDAG-NEXT:    v_writelane_b32 v40, s89, 37
+; SDAG-NEXT:    v_writelane_b32 v40, s90, 38
+; SDAG-NEXT:    v_writelane_b32 v40, s91, 39
+; SDAG-NEXT:    v_writelane_b32 v40, s92, 40
+; SDAG-NEXT:    v_writelane_b32 v40, s93, 41
+; SDAG-NEXT:    v_writelane_b32 v40, s94, 42
 ; SDAG-NEXT:    s_mov_b32 s35, extern_c_func at abs32@hi
 ; SDAG-NEXT:    s_mov_b32 s34, extern_c_func at abs32@lo
 ; SDAG-NEXT:    s_mov_b64 s[8:9], 0
 ; SDAG-NEXT:    s_addk_i32 s32, 0x400
-; SDAG-NEXT:    v_writelane_b32 v40, s31, 27
+; SDAG-NEXT:    v_writelane_b32 v40, s95, 43
 ; SDAG-NEXT:    s_swappc_b64 s[30:31], s[34:35]
+; SDAG-NEXT:    v_readlane_b32 s95, v40, 43
+; SDAG-NEXT:    v_readlane_b32 s94, v40, 42
+; SDAG-NEXT:    v_readlane_b32 s93, v40, 41
+; SDAG-NEXT:    v_readlane_b32 s92, v40, 40
+; SDAG-NEXT:    v_readlane_b32 s91, v40, 39
+; SDAG-NEXT:    v_readlane_b32 s90, v40, 38
+; SDAG-NEXT:    v_readlane_b32 s89, v40, 37
+; SDAG-NEXT:    v_readlane_b32 s88, v40, 36
+; SDAG-NEXT:    v_readlane_b32 s79, v40, 35
+; SDAG-NEXT:    v_readlane_b32 s78, v40, 34
+; SDAG-NEXT:    v_readlane_b32 s77, v40, 33
+; SDAG-NEXT:    v_readlane_b32 s76, v40, 32
+; SDAG-NEXT:    v_readlane_b32 s75, v40, 31
+; SDAG-NEXT:    v_readlane_b32 s74, v40, 30
+; SDAG-NEXT:    v_readlane_b32 s73, v40, 29
+; SDAG-NEXT:    v_readlane_b32 s72, v40, 28
 ; SDAG-NEXT:    v_readlane_b32 s31, v40, 27
 ; SDAG-NEXT:    v_readlane_b32 s30, v40, 26
 ; SDAG-NEXT:    v_readlane_b32 s29, v40, 25
@@ -117,12 +149,44 @@ define amdgpu_gfx void @gfx_func() {
 ; GISEL-NEXT:    v_writelane_b32 v40, s28, 24
 ; GISEL-NEXT:    v_writelane_b32 v40, s29, 25
 ; GISEL-NEXT:    v_writelane_b32 v40, s30, 26
+; GISEL-NEXT:    v_writelane_b32 v40, s31, 27
+; GISEL-NEXT:    v_writelane_b32 v40, s72, 28
+; GISEL-NEXT:    v_writelane_b32 v40, s73, 29
+; GISEL-NEXT:    v_writelane_b32 v40, s74, 30
+; GISEL-NEXT:    v_writelane_b32 v40, s75, 31
+; GISEL-NEXT:    v_writelane_b32 v40, s76, 32
+; GISEL-NEXT:    v_writelane_b32 v40, s77, 33
+; GISEL-NEXT:    v_writelane_b32 v40, s78, 34
+; GISEL-NEXT:    v_writelane_b32 v40, s79, 35
+; GISEL-NEXT:    v_writelane_b32 v40, s88, 36
+; GISEL-NEXT:    v_writelane_b32 v40, s89, 37
+; GISEL-NEXT:    v_writelane_b32 v40, s90, 38
+; GISEL-NEXT:    v_writelane_b32 v40, s91, 39
+; GISEL-NEXT:    v_writelane_b32 v40, s92, 40
+; GISEL-NEXT:    v_writelane_b32 v40, s93, 41
+; GISEL-NEXT:    v_writelane_b32 v40, s94, 42
 ; GISEL-NEXT:    s_mov_b32 s34, extern_c_func at abs32@lo
 ; GISEL-NEXT:    s_mov_b32 s35, extern_c_func at abs32@hi
 ; GISEL-NEXT:    s_mov_b64 s[8:9], 0
 ; GISEL-NEXT:    s_addk_i32 s32, 0x400
-; GISEL-NEXT:    v_writelane_b32 v40, s31, 27
+; GISEL-NEXT:    v_writelane_b32 v40, s95, 43
 ; GISEL-NEXT:    s_swappc_b64 s[30:31], s[34:35]
+; GISEL-NEXT:    v_readlane_b32 s95, v40, 43
+; GISEL-NEXT:    v_readlane_b32 s94, v40, 42
+; GISEL-NEXT:    v_readlane_b32 s93, v40, 41
+; GISEL-NEXT:    v_readlane_b32 s92, v40, 40
+; GISEL-NEXT:    v_readlane_b32 s91, v40, 39
+; GISEL-NEXT:    v_readlane_b32 s90, v40, 38
+; GISEL-NEXT:    v_readlane_b32 s89, v40, 37
+; GISEL-NEXT:    v_readlane_b32 s88, v40, 36
+; GISEL-NEXT:    v_readlane_b32 s79, v40, 35
+; GISEL-NEXT:    v_readlane_b32 s78, v40, 34
+; GISEL-NEXT:    v_readlane_b32 s77, v40, 33
+; GISEL-NEXT:    v_readlane_b32 s76, v40, 32
+; GISEL-NEXT:    v_readlane_b32 s75, v40, 31
+; GISEL-NEXT:    v_readlane_b32 s74, v40, 30
+; GISEL-NEXT:    v_readlane_b32 s73, v40, 29
+; GISEL-NEXT:    v_readlane_b32 s72, v40, 28
 ; GISEL-NEXT:    v_readlane_b32 s31, v40, 27
 ; GISEL-NEXT:    v_readlane_b32 s30, v40, 26
 ; GISEL-NEXT:    v_readlane_b32 s29, v40, 25

diff  --git a/llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll b/llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll
index 2e3ca34af4c74..2322b29abaa10 100644
--- a/llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll
+++ b/llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll
@@ -9093,62 +9093,30 @@ define void @tail_call_byval_align16(<32 x i32> %val, double %tmp) #0 {
 ; GFX9-NEXT:    v_writelane_b32 v40, s37, 5
 ; GFX9-NEXT:    v_writelane_b32 v40, s38, 6
 ; GFX9-NEXT:    v_writelane_b32 v40, s39, 7
-; GFX9-NEXT:    v_writelane_b32 v40, s40, 8
-; GFX9-NEXT:    v_writelane_b32 v40, s41, 9
-; GFX9-NEXT:    v_writelane_b32 v40, s42, 10
-; GFX9-NEXT:    v_writelane_b32 v40, s43, 11
-; GFX9-NEXT:    v_writelane_b32 v40, s44, 12
-; GFX9-NEXT:    v_writelane_b32 v40, s45, 13
-; GFX9-NEXT:    v_writelane_b32 v40, s46, 14
-; GFX9-NEXT:    v_writelane_b32 v40, s47, 15
-; GFX9-NEXT:    v_writelane_b32 v40, s48, 16
-; GFX9-NEXT:    v_writelane_b32 v40, s49, 17
-; GFX9-NEXT:    v_writelane_b32 v40, s50, 18
-; GFX9-NEXT:    v_writelane_b32 v40, s51, 19
-; GFX9-NEXT:    v_writelane_b32 v40, s52, 20
-; GFX9-NEXT:    v_writelane_b32 v40, s53, 21
-; GFX9-NEXT:    v_writelane_b32 v40, s54, 22
-; GFX9-NEXT:    v_writelane_b32 v40, s55, 23
-; GFX9-NEXT:    v_writelane_b32 v40, s56, 24
-; GFX9-NEXT:    v_writelane_b32 v40, s57, 25
-; GFX9-NEXT:    v_writelane_b32 v40, s58, 26
-; GFX9-NEXT:    v_writelane_b32 v40, s59, 27
-; GFX9-NEXT:    v_writelane_b32 v40, s60, 28
-; GFX9-NEXT:    v_writelane_b32 v40, s61, 29
+; GFX9-NEXT:    v_writelane_b32 v40, s48, 8
+; GFX9-NEXT:    v_writelane_b32 v40, s49, 9
+; GFX9-NEXT:    v_writelane_b32 v40, s50, 10
+; GFX9-NEXT:    v_writelane_b32 v40, s51, 11
+; GFX9-NEXT:    v_writelane_b32 v40, s52, 12
+; GFX9-NEXT:    v_writelane_b32 v40, s53, 13
 ; GFX9-NEXT:    s_addk_i32 s32, 0x800
-; GFX9-NEXT:    v_writelane_b32 v40, s62, 30
+; GFX9-NEXT:    v_writelane_b32 v40, s54, 14
 ; GFX9-NEXT:    s_mov_b32 s5, byval_align16_f64_arg at abs32@hi
 ; GFX9-NEXT:    s_mov_b32 s4, byval_align16_f64_arg at abs32@lo
-; GFX9-NEXT:    v_writelane_b32 v40, s63, 31
+; GFX9-NEXT:    v_writelane_b32 v40, s55, 15
 ; GFX9-NEXT:    s_waitcnt vmcnt(2)
 ; GFX9-NEXT:    buffer_store_dword v32, off, s[0:3], s32
 ; GFX9-NEXT:    s_waitcnt vmcnt(2)
 ; GFX9-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:4
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[4:5]
-; GFX9-NEXT:    v_readlane_b32 s63, v40, 31
-; GFX9-NEXT:    v_readlane_b32 s62, v40, 30
-; GFX9-NEXT:    v_readlane_b32 s61, v40, 29
-; GFX9-NEXT:    v_readlane_b32 s60, v40, 28
-; GFX9-NEXT:    v_readlane_b32 s59, v40, 27
-; GFX9-NEXT:    v_readlane_b32 s58, v40, 26
-; GFX9-NEXT:    v_readlane_b32 s57, v40, 25
-; GFX9-NEXT:    v_readlane_b32 s56, v40, 24
-; GFX9-NEXT:    v_readlane_b32 s55, v40, 23
-; GFX9-NEXT:    v_readlane_b32 s54, v40, 22
-; GFX9-NEXT:    v_readlane_b32 s53, v40, 21
-; GFX9-NEXT:    v_readlane_b32 s52, v40, 20
-; GFX9-NEXT:    v_readlane_b32 s51, v40, 19
-; GFX9-NEXT:    v_readlane_b32 s50, v40, 18
-; GFX9-NEXT:    v_readlane_b32 s49, v40, 17
-; GFX9-NEXT:    v_readlane_b32 s48, v40, 16
-; GFX9-NEXT:    v_readlane_b32 s47, v40, 15
-; GFX9-NEXT:    v_readlane_b32 s46, v40, 14
-; GFX9-NEXT:    v_readlane_b32 s45, v40, 13
-; GFX9-NEXT:    v_readlane_b32 s44, v40, 12
-; GFX9-NEXT:    v_readlane_b32 s43, v40, 11
-; GFX9-NEXT:    v_readlane_b32 s42, v40, 10
-; GFX9-NEXT:    v_readlane_b32 s41, v40, 9
-; GFX9-NEXT:    v_readlane_b32 s40, v40, 8
+; GFX9-NEXT:    v_readlane_b32 s55, v40, 15
+; GFX9-NEXT:    v_readlane_b32 s54, v40, 14
+; GFX9-NEXT:    v_readlane_b32 s53, v40, 13
+; GFX9-NEXT:    v_readlane_b32 s52, v40, 12
+; GFX9-NEXT:    v_readlane_b32 s51, v40, 11
+; GFX9-NEXT:    v_readlane_b32 s50, v40, 10
+; GFX9-NEXT:    v_readlane_b32 s49, v40, 9
+; GFX9-NEXT:    v_readlane_b32 s48, v40, 8
 ; GFX9-NEXT:    v_readlane_b32 s39, v40, 7
 ; GFX9-NEXT:    v_readlane_b32 s38, v40, 6
 ; GFX9-NEXT:    v_readlane_b32 s37, v40, 5
@@ -9193,55 +9161,23 @@ define void @tail_call_byval_align16(<32 x i32> %val, double %tmp) #0 {
 ; GFX10-NEXT:    v_writelane_b32 v40, s37, 5
 ; GFX10-NEXT:    v_writelane_b32 v40, s38, 6
 ; GFX10-NEXT:    v_writelane_b32 v40, s39, 7
-; GFX10-NEXT:    v_writelane_b32 v40, s40, 8
-; GFX10-NEXT:    v_writelane_b32 v40, s41, 9
-; GFX10-NEXT:    v_writelane_b32 v40, s42, 10
-; GFX10-NEXT:    v_writelane_b32 v40, s43, 11
-; GFX10-NEXT:    v_writelane_b32 v40, s44, 12
-; GFX10-NEXT:    v_writelane_b32 v40, s45, 13
-; GFX10-NEXT:    v_writelane_b32 v40, s46, 14
-; GFX10-NEXT:    v_writelane_b32 v40, s47, 15
-; GFX10-NEXT:    v_writelane_b32 v40, s48, 16
-; GFX10-NEXT:    v_writelane_b32 v40, s49, 17
-; GFX10-NEXT:    v_writelane_b32 v40, s50, 18
-; GFX10-NEXT:    v_writelane_b32 v40, s51, 19
-; GFX10-NEXT:    v_writelane_b32 v40, s52, 20
-; GFX10-NEXT:    v_writelane_b32 v40, s53, 21
-; GFX10-NEXT:    v_writelane_b32 v40, s54, 22
-; GFX10-NEXT:    v_writelane_b32 v40, s55, 23
-; GFX10-NEXT:    v_writelane_b32 v40, s56, 24
-; GFX10-NEXT:    v_writelane_b32 v40, s57, 25
-; GFX10-NEXT:    v_writelane_b32 v40, s58, 26
-; GFX10-NEXT:    v_writelane_b32 v40, s59, 27
-; GFX10-NEXT:    v_writelane_b32 v40, s60, 28
-; GFX10-NEXT:    v_writelane_b32 v40, s61, 29
-; GFX10-NEXT:    v_writelane_b32 v40, s62, 30
-; GFX10-NEXT:    v_writelane_b32 v40, s63, 31
+; GFX10-NEXT:    v_writelane_b32 v40, s48, 8
+; GFX10-NEXT:    v_writelane_b32 v40, s49, 9
+; GFX10-NEXT:    v_writelane_b32 v40, s50, 10
+; GFX10-NEXT:    v_writelane_b32 v40, s51, 11
+; GFX10-NEXT:    v_writelane_b32 v40, s52, 12
+; GFX10-NEXT:    v_writelane_b32 v40, s53, 13
+; GFX10-NEXT:    v_writelane_b32 v40, s54, 14
+; GFX10-NEXT:    v_writelane_b32 v40, s55, 15
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[4:5]
-; GFX10-NEXT:    v_readlane_b32 s63, v40, 31
-; GFX10-NEXT:    v_readlane_b32 s62, v40, 30
-; GFX10-NEXT:    v_readlane_b32 s61, v40, 29
-; GFX10-NEXT:    v_readlane_b32 s60, v40, 28
-; GFX10-NEXT:    v_readlane_b32 s59, v40, 27
-; GFX10-NEXT:    v_readlane_b32 s58, v40, 26
-; GFX10-NEXT:    v_readlane_b32 s57, v40, 25
-; GFX10-NEXT:    v_readlane_b32 s56, v40, 24
-; GFX10-NEXT:    v_readlane_b32 s55, v40, 23
-; GFX10-NEXT:    v_readlane_b32 s54, v40, 22
-; GFX10-NEXT:    v_readlane_b32 s53, v40, 21
-; GFX10-NEXT:    v_readlane_b32 s52, v40, 20
-; GFX10-NEXT:    v_readlane_b32 s51, v40, 19
-; GFX10-NEXT:    v_readlane_b32 s50, v40, 18
-; GFX10-NEXT:    v_readlane_b32 s49, v40, 17
-; GFX10-NEXT:    v_readlane_b32 s48, v40, 16
-; GFX10-NEXT:    v_readlane_b32 s47, v40, 15
-; GFX10-NEXT:    v_readlane_b32 s46, v40, 14
-; GFX10-NEXT:    v_readlane_b32 s45, v40, 13
-; GFX10-NEXT:    v_readlane_b32 s44, v40, 12
-; GFX10-NEXT:    v_readlane_b32 s43, v40, 11
-; GFX10-NEXT:    v_readlane_b32 s42, v40, 10
-; GFX10-NEXT:    v_readlane_b32 s41, v40, 9
-; GFX10-NEXT:    v_readlane_b32 s40, v40, 8
+; GFX10-NEXT:    v_readlane_b32 s55, v40, 15
+; GFX10-NEXT:    v_readlane_b32 s54, v40, 14
+; GFX10-NEXT:    v_readlane_b32 s53, v40, 13
+; GFX10-NEXT:    v_readlane_b32 s52, v40, 12
+; GFX10-NEXT:    v_readlane_b32 s51, v40, 11
+; GFX10-NEXT:    v_readlane_b32 s50, v40, 10
+; GFX10-NEXT:    v_readlane_b32 s49, v40, 9
+; GFX10-NEXT:    v_readlane_b32 s48, v40, 8
 ; GFX10-NEXT:    v_readlane_b32 s39, v40, 7
 ; GFX10-NEXT:    v_readlane_b32 s38, v40, 6
 ; GFX10-NEXT:    v_readlane_b32 s37, v40, 5
@@ -9281,57 +9217,25 @@ define void @tail_call_byval_align16(<32 x i32> %val, double %tmp) #0 {
 ; GFX11-NEXT:    v_writelane_b32 v40, s37, 5
 ; GFX11-NEXT:    v_writelane_b32 v40, s38, 6
 ; GFX11-NEXT:    v_writelane_b32 v40, s39, 7
-; GFX11-NEXT:    v_writelane_b32 v40, s40, 8
-; GFX11-NEXT:    v_writelane_b32 v40, s41, 9
-; GFX11-NEXT:    v_writelane_b32 v40, s42, 10
-; GFX11-NEXT:    v_writelane_b32 v40, s43, 11
-; GFX11-NEXT:    v_writelane_b32 v40, s44, 12
-; GFX11-NEXT:    v_writelane_b32 v40, s45, 13
-; GFX11-NEXT:    v_writelane_b32 v40, s46, 14
-; GFX11-NEXT:    v_writelane_b32 v40, s47, 15
-; GFX11-NEXT:    v_writelane_b32 v40, s48, 16
-; GFX11-NEXT:    v_writelane_b32 v40, s49, 17
-; GFX11-NEXT:    v_writelane_b32 v40, s50, 18
-; GFX11-NEXT:    v_writelane_b32 v40, s51, 19
-; GFX11-NEXT:    v_writelane_b32 v40, s52, 20
-; GFX11-NEXT:    v_writelane_b32 v40, s53, 21
-; GFX11-NEXT:    v_writelane_b32 v40, s54, 22
-; GFX11-NEXT:    v_writelane_b32 v40, s55, 23
-; GFX11-NEXT:    v_writelane_b32 v40, s56, 24
-; GFX11-NEXT:    v_writelane_b32 v40, s57, 25
-; GFX11-NEXT:    v_writelane_b32 v40, s58, 26
-; GFX11-NEXT:    v_writelane_b32 v40, s59, 27
-; GFX11-NEXT:    v_writelane_b32 v40, s60, 28
-; GFX11-NEXT:    v_writelane_b32 v40, s61, 29
-; GFX11-NEXT:    v_writelane_b32 v40, s62, 30
-; GFX11-NEXT:    v_writelane_b32 v40, s63, 31
+; GFX11-NEXT:    v_writelane_b32 v40, s48, 8
+; GFX11-NEXT:    v_writelane_b32 v40, s49, 9
+; GFX11-NEXT:    v_writelane_b32 v40, s50, 10
+; GFX11-NEXT:    v_writelane_b32 v40, s51, 11
+; GFX11-NEXT:    v_writelane_b32 v40, s52, 12
+; GFX11-NEXT:    v_writelane_b32 v40, s53, 13
+; GFX11-NEXT:    v_writelane_b32 v40, s54, 14
+; GFX11-NEXT:    v_writelane_b32 v40, s55, 15
 ; GFX11-NEXT:    s_waitcnt vmcnt(1)
 ; GFX11-NEXT:    scratch_store_b64 off, v[32:33], s32
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX11-NEXT:    v_readlane_b32 s63, v40, 31
-; GFX11-NEXT:    v_readlane_b32 s62, v40, 30
-; GFX11-NEXT:    v_readlane_b32 s61, v40, 29
-; GFX11-NEXT:    v_readlane_b32 s60, v40, 28
-; GFX11-NEXT:    v_readlane_b32 s59, v40, 27
-; GFX11-NEXT:    v_readlane_b32 s58, v40, 26
-; GFX11-NEXT:    v_readlane_b32 s57, v40, 25
-; GFX11-NEXT:    v_readlane_b32 s56, v40, 24
-; GFX11-NEXT:    v_readlane_b32 s55, v40, 23
-; GFX11-NEXT:    v_readlane_b32 s54, v40, 22
-; GFX11-NEXT:    v_readlane_b32 s53, v40, 21
-; GFX11-NEXT:    v_readlane_b32 s52, v40, 20
-; GFX11-NEXT:    v_readlane_b32 s51, v40, 19
-; GFX11-NEXT:    v_readlane_b32 s50, v40, 18
-; GFX11-NEXT:    v_readlane_b32 s49, v40, 17
-; GFX11-NEXT:    v_readlane_b32 s48, v40, 16
-; GFX11-NEXT:    v_readlane_b32 s47, v40, 15
-; GFX11-NEXT:    v_readlane_b32 s46, v40, 14
-; GFX11-NEXT:    v_readlane_b32 s45, v40, 13
-; GFX11-NEXT:    v_readlane_b32 s44, v40, 12
-; GFX11-NEXT:    v_readlane_b32 s43, v40, 11
-; GFX11-NEXT:    v_readlane_b32 s42, v40, 10
-; GFX11-NEXT:    v_readlane_b32 s41, v40, 9
-; GFX11-NEXT:    v_readlane_b32 s40, v40, 8
+; GFX11-NEXT:    v_readlane_b32 s55, v40, 15
+; GFX11-NEXT:    v_readlane_b32 s54, v40, 14
+; GFX11-NEXT:    v_readlane_b32 s53, v40, 13
+; GFX11-NEXT:    v_readlane_b32 s52, v40, 12
+; GFX11-NEXT:    v_readlane_b32 s51, v40, 11
+; GFX11-NEXT:    v_readlane_b32 s50, v40, 10
+; GFX11-NEXT:    v_readlane_b32 s49, v40, 9
+; GFX11-NEXT:    v_readlane_b32 s48, v40, 8
 ; GFX11-NEXT:    v_readlane_b32 s39, v40, 7
 ; GFX11-NEXT:    v_readlane_b32 s38, v40, 6
 ; GFX11-NEXT:    v_readlane_b32 s37, v40, 5
@@ -9371,57 +9275,25 @@ define void @tail_call_byval_align16(<32 x i32> %val, double %tmp) #0 {
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s37, 5
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s38, 6
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s39, 7
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s40, 8
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s41, 9
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s42, 10
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s43, 11
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s44, 12
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s45, 13
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s46, 14
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s47, 15
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s48, 16
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s49, 17
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s50, 18
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s51, 19
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s52, 20
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s53, 21
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s54, 22
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s55, 23
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s56, 24
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s57, 25
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s58, 26
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s59, 27
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s60, 28
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s61, 29
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s62, 30
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s63, 31
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s48, 8
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s49, 9
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s50, 10
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s51, 11
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s52, 12
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s53, 13
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s54, 14
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s55, 15
 ; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(1)
 ; GFX10-SCRATCH-NEXT:    scratch_store_dwordx2 off, v[32:33], s32
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s63, v40, 31
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s62, v40, 30
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s61, v40, 29
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s60, v40, 28
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s59, v40, 27
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s58, v40, 26
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s57, v40, 25
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s56, v40, 24
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s55, v40, 23
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s54, v40, 22
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s53, v40, 21
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s52, v40, 20
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s51, v40, 19
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s50, v40, 18
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s49, v40, 17
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s48, v40, 16
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s47, v40, 15
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s46, v40, 14
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s45, v40, 13
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s44, v40, 12
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s43, v40, 11
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s42, v40, 10
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s41, v40, 9
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s40, v40, 8
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s55, v40, 15
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s54, v40, 14
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s53, v40, 13
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s52, v40, 12
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s51, v40, 11
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s50, v40, 10
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s49, v40, 9
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s48, v40, 8
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s39, v40, 7
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s38, v40, 6
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s37, v40, 5

diff  --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll
index 15be44a335a1d..4f5c46d5f424f 100644
--- a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll
+++ b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll
@@ -735,12 +735,12 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
 ; GFX7LESS-DPP-LABEL: global_atomic_fadd_uni_address_div_value_agent_scope_align4_unsafe:
 ; GFX7LESS-DPP:       ; %bb.0:
 ; GFX7LESS-DPP-NEXT:    s_mov_b32 s32, 0
-; GFX7LESS-DPP-NEXT:    s_mov_b32 s40, SCRATCH_RSRC_DWORD0
-; GFX7LESS-DPP-NEXT:    s_mov_b32 s41, SCRATCH_RSRC_DWORD1
-; GFX7LESS-DPP-NEXT:    s_mov_b32 s42, -1
-; GFX7LESS-DPP-NEXT:    s_mov_b32 s43, 0xe8f000
-; GFX7LESS-DPP-NEXT:    s_add_u32 s40, s40, s11
-; GFX7LESS-DPP-NEXT:    s_addc_u32 s41, s41, 0
+; GFX7LESS-DPP-NEXT:    s_mov_b32 s48, SCRATCH_RSRC_DWORD0
+; GFX7LESS-DPP-NEXT:    s_mov_b32 s49, SCRATCH_RSRC_DWORD1
+; GFX7LESS-DPP-NEXT:    s_mov_b32 s50, -1
+; GFX7LESS-DPP-NEXT:    s_mov_b32 s51, 0xe8f000
+; GFX7LESS-DPP-NEXT:    s_add_u32 s48, s48, s11
+; GFX7LESS-DPP-NEXT:    s_addc_u32 s49, s49, 0
 ; GFX7LESS-DPP-NEXT:    s_mov_b32 s14, s10
 ; GFX7LESS-DPP-NEXT:    s_mov_b32 s13, s9
 ; GFX7LESS-DPP-NEXT:    s_mov_b32 s12, s8
@@ -760,8 +760,8 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
 ; GFX7LESS-DPP-NEXT:    v_or_b32_e32 v31, v0, v2
 ; GFX7LESS-DPP-NEXT:    s_mov_b64 s[4:5], s[0:1]
 ; GFX7LESS-DPP-NEXT:    s_mov_b64 s[6:7], s[2:3]
-; GFX7LESS-DPP-NEXT:    s_mov_b64 s[0:1], s[40:41]
-; GFX7LESS-DPP-NEXT:    s_mov_b64 s[2:3], s[42:43]
+; GFX7LESS-DPP-NEXT:    s_mov_b64 s[0:1], s[48:49]
+; GFX7LESS-DPP-NEXT:    s_mov_b64 s[2:3], s[50:51]
 ; GFX7LESS-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7LESS-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GFX7LESS-DPP-NEXT:    buffer_load_dword v2, off, s[36:39], 0
@@ -1955,12 +1955,12 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_one_as_scope
 ; GFX7LESS-DPP-LABEL: global_atomic_fadd_uni_address_div_value_one_as_scope_unsafe_structfp:
 ; GFX7LESS-DPP:       ; %bb.0:
 ; GFX7LESS-DPP-NEXT:    s_mov_b32 s32, 0
-; GFX7LESS-DPP-NEXT:    s_mov_b32 s40, SCRATCH_RSRC_DWORD0
-; GFX7LESS-DPP-NEXT:    s_mov_b32 s41, SCRATCH_RSRC_DWORD1
-; GFX7LESS-DPP-NEXT:    s_mov_b32 s42, -1
-; GFX7LESS-DPP-NEXT:    s_mov_b32 s43, 0xe8f000
-; GFX7LESS-DPP-NEXT:    s_add_u32 s40, s40, s11
-; GFX7LESS-DPP-NEXT:    s_addc_u32 s41, s41, 0
+; GFX7LESS-DPP-NEXT:    s_mov_b32 s48, SCRATCH_RSRC_DWORD0
+; GFX7LESS-DPP-NEXT:    s_mov_b32 s49, SCRATCH_RSRC_DWORD1
+; GFX7LESS-DPP-NEXT:    s_mov_b32 s50, -1
+; GFX7LESS-DPP-NEXT:    s_mov_b32 s51, 0xe8f000
+; GFX7LESS-DPP-NEXT:    s_add_u32 s48, s48, s11
+; GFX7LESS-DPP-NEXT:    s_addc_u32 s49, s49, 0
 ; GFX7LESS-DPP-NEXT:    s_mov_b32 s14, s10
 ; GFX7LESS-DPP-NEXT:    s_mov_b32 s13, s9
 ; GFX7LESS-DPP-NEXT:    s_mov_b32 s12, s8
@@ -1980,8 +1980,8 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_one_as_scope
 ; GFX7LESS-DPP-NEXT:    v_or_b32_e32 v31, v0, v2
 ; GFX7LESS-DPP-NEXT:    s_mov_b64 s[4:5], s[0:1]
 ; GFX7LESS-DPP-NEXT:    s_mov_b64 s[6:7], s[2:3]
-; GFX7LESS-DPP-NEXT:    s_mov_b64 s[0:1], s[40:41]
-; GFX7LESS-DPP-NEXT:    s_mov_b64 s[2:3], s[42:43]
+; GFX7LESS-DPP-NEXT:    s_mov_b64 s[0:1], s[48:49]
+; GFX7LESS-DPP-NEXT:    s_mov_b64 s[2:3], s[50:51]
 ; GFX7LESS-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7LESS-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GFX7LESS-DPP-NEXT:    buffer_load_dword v2, off, s[36:39], 0
@@ -3235,12 +3235,12 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
 ; GFX7LESS-DPP-LABEL: global_atomic_fadd_uni_address_div_value_agent_scope_unsafe:
 ; GFX7LESS-DPP:       ; %bb.0:
 ; GFX7LESS-DPP-NEXT:    s_mov_b32 s32, 0
-; GFX7LESS-DPP-NEXT:    s_mov_b32 s40, SCRATCH_RSRC_DWORD0
-; GFX7LESS-DPP-NEXT:    s_mov_b32 s41, SCRATCH_RSRC_DWORD1
-; GFX7LESS-DPP-NEXT:    s_mov_b32 s42, -1
-; GFX7LESS-DPP-NEXT:    s_mov_b32 s43, 0xe8f000
-; GFX7LESS-DPP-NEXT:    s_add_u32 s40, s40, s11
-; GFX7LESS-DPP-NEXT:    s_addc_u32 s41, s41, 0
+; GFX7LESS-DPP-NEXT:    s_mov_b32 s48, SCRATCH_RSRC_DWORD0
+; GFX7LESS-DPP-NEXT:    s_mov_b32 s49, SCRATCH_RSRC_DWORD1
+; GFX7LESS-DPP-NEXT:    s_mov_b32 s50, -1
+; GFX7LESS-DPP-NEXT:    s_mov_b32 s51, 0xe8f000
+; GFX7LESS-DPP-NEXT:    s_add_u32 s48, s48, s11
+; GFX7LESS-DPP-NEXT:    s_addc_u32 s49, s49, 0
 ; GFX7LESS-DPP-NEXT:    s_mov_b32 s14, s10
 ; GFX7LESS-DPP-NEXT:    s_mov_b32 s13, s9
 ; GFX7LESS-DPP-NEXT:    s_mov_b32 s12, s8
@@ -3260,8 +3260,8 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
 ; GFX7LESS-DPP-NEXT:    v_or_b32_e32 v31, v0, v2
 ; GFX7LESS-DPP-NEXT:    s_mov_b64 s[4:5], s[0:1]
 ; GFX7LESS-DPP-NEXT:    s_mov_b64 s[6:7], s[2:3]
-; GFX7LESS-DPP-NEXT:    s_mov_b64 s[0:1], s[40:41]
-; GFX7LESS-DPP-NEXT:    s_mov_b64 s[2:3], s[42:43]
+; GFX7LESS-DPP-NEXT:    s_mov_b64 s[0:1], s[48:49]
+; GFX7LESS-DPP-NEXT:    s_mov_b64 s[2:3], s[50:51]
 ; GFX7LESS-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7LESS-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GFX7LESS-DPP-NEXT:    buffer_load_dword v2, off, s[36:39], 0
@@ -4011,12 +4011,12 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
 ; GFX7LESS-DPP-LABEL: global_atomic_fadd_uni_address_div_value_agent_scope_unsafe_structfp:
 ; GFX7LESS-DPP:       ; %bb.0:
 ; GFX7LESS-DPP-NEXT:    s_mov_b32 s32, 0
-; GFX7LESS-DPP-NEXT:    s_mov_b32 s40, SCRATCH_RSRC_DWORD0
-; GFX7LESS-DPP-NEXT:    s_mov_b32 s41, SCRATCH_RSRC_DWORD1
-; GFX7LESS-DPP-NEXT:    s_mov_b32 s42, -1
-; GFX7LESS-DPP-NEXT:    s_mov_b32 s43, 0xe8f000
-; GFX7LESS-DPP-NEXT:    s_add_u32 s40, s40, s11
-; GFX7LESS-DPP-NEXT:    s_addc_u32 s41, s41, 0
+; GFX7LESS-DPP-NEXT:    s_mov_b32 s48, SCRATCH_RSRC_DWORD0
+; GFX7LESS-DPP-NEXT:    s_mov_b32 s49, SCRATCH_RSRC_DWORD1
+; GFX7LESS-DPP-NEXT:    s_mov_b32 s50, -1
+; GFX7LESS-DPP-NEXT:    s_mov_b32 s51, 0xe8f000
+; GFX7LESS-DPP-NEXT:    s_add_u32 s48, s48, s11
+; GFX7LESS-DPP-NEXT:    s_addc_u32 s49, s49, 0
 ; GFX7LESS-DPP-NEXT:    s_mov_b32 s14, s10
 ; GFX7LESS-DPP-NEXT:    s_mov_b32 s13, s9
 ; GFX7LESS-DPP-NEXT:    s_mov_b32 s12, s8
@@ -4036,8 +4036,8 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
 ; GFX7LESS-DPP-NEXT:    v_or_b32_e32 v31, v0, v2
 ; GFX7LESS-DPP-NEXT:    s_mov_b64 s[4:5], s[0:1]
 ; GFX7LESS-DPP-NEXT:    s_mov_b64 s[6:7], s[2:3]
-; GFX7LESS-DPP-NEXT:    s_mov_b64 s[0:1], s[40:41]
-; GFX7LESS-DPP-NEXT:    s_mov_b64 s[2:3], s[42:43]
+; GFX7LESS-DPP-NEXT:    s_mov_b64 s[0:1], s[48:49]
+; GFX7LESS-DPP-NEXT:    s_mov_b64 s[2:3], s[50:51]
 ; GFX7LESS-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7LESS-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GFX7LESS-DPP-NEXT:    buffer_load_dword v2, off, s[36:39], 0
@@ -5316,12 +5316,12 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_default_scop
 ; GFX7LESS-DPP-LABEL: global_atomic_fadd_uni_address_div_value_default_scope_strictfp:
 ; GFX7LESS-DPP:       ; %bb.0:
 ; GFX7LESS-DPP-NEXT:    s_mov_b32 s32, 0
-; GFX7LESS-DPP-NEXT:    s_mov_b32 s40, SCRATCH_RSRC_DWORD0
-; GFX7LESS-DPP-NEXT:    s_mov_b32 s41, SCRATCH_RSRC_DWORD1
-; GFX7LESS-DPP-NEXT:    s_mov_b32 s42, -1
-; GFX7LESS-DPP-NEXT:    s_mov_b32 s43, 0xe8f000
-; GFX7LESS-DPP-NEXT:    s_add_u32 s40, s40, s11
-; GFX7LESS-DPP-NEXT:    s_addc_u32 s41, s41, 0
+; GFX7LESS-DPP-NEXT:    s_mov_b32 s48, SCRATCH_RSRC_DWORD0
+; GFX7LESS-DPP-NEXT:    s_mov_b32 s49, SCRATCH_RSRC_DWORD1
+; GFX7LESS-DPP-NEXT:    s_mov_b32 s50, -1
+; GFX7LESS-DPP-NEXT:    s_mov_b32 s51, 0xe8f000
+; GFX7LESS-DPP-NEXT:    s_add_u32 s48, s48, s11
+; GFX7LESS-DPP-NEXT:    s_addc_u32 s49, s49, 0
 ; GFX7LESS-DPP-NEXT:    s_mov_b32 s14, s10
 ; GFX7LESS-DPP-NEXT:    s_mov_b32 s13, s9
 ; GFX7LESS-DPP-NEXT:    s_mov_b32 s12, s8
@@ -5341,8 +5341,8 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_default_scop
 ; GFX7LESS-DPP-NEXT:    v_or_b32_e32 v31, v0, v2
 ; GFX7LESS-DPP-NEXT:    s_mov_b64 s[4:5], s[0:1]
 ; GFX7LESS-DPP-NEXT:    s_mov_b64 s[6:7], s[2:3]
-; GFX7LESS-DPP-NEXT:    s_mov_b64 s[0:1], s[40:41]
-; GFX7LESS-DPP-NEXT:    s_mov_b64 s[2:3], s[42:43]
+; GFX7LESS-DPP-NEXT:    s_mov_b64 s[0:1], s[48:49]
+; GFX7LESS-DPP-NEXT:    s_mov_b64 s[2:3], s[50:51]
 ; GFX7LESS-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7LESS-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GFX7LESS-DPP-NEXT:    buffer_load_dword v2, off, s[36:39], 0
@@ -5747,14 +5747,14 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent
 ; GFX7LESS-LABEL: global_atomic_fadd_double_uni_address_uni_value_agent_scope_unsafe:
 ; GFX7LESS:       ; %bb.0:
 ; GFX7LESS-NEXT:    s_movk_i32 s32, 0x800
-; GFX7LESS-NEXT:    s_mov_b32 s48, SCRATCH_RSRC_DWORD0
-; GFX7LESS-NEXT:    s_mov_b32 s49, SCRATCH_RSRC_DWORD1
-; GFX7LESS-NEXT:    s_mov_b32 s50, -1
-; GFX7LESS-NEXT:    s_mov_b32 s51, 0xe8f000
-; GFX7LESS-NEXT:    s_add_u32 s48, s48, s11
-; GFX7LESS-NEXT:    s_addc_u32 s49, s49, 0
+; GFX7LESS-NEXT:    s_mov_b32 s64, SCRATCH_RSRC_DWORD0
+; GFX7LESS-NEXT:    s_mov_b32 s65, SCRATCH_RSRC_DWORD1
+; GFX7LESS-NEXT:    s_mov_b32 s66, -1
+; GFX7LESS-NEXT:    s_mov_b32 s67, 0xe8f000
+; GFX7LESS-NEXT:    s_add_u32 s64, s64, s11
+; GFX7LESS-NEXT:    s_addc_u32 s65, s65, 0
 ; GFX7LESS-NEXT:    s_mov_b64 s[38:39], s[2:3]
-; GFX7LESS-NEXT:    s_mov_b64 s[40:41], s[0:1]
+; GFX7LESS-NEXT:    s_mov_b64 s[48:49], s[0:1]
 ; GFX7LESS-NEXT:    s_mov_b64 s[0:1], exec
 ; GFX7LESS-NEXT:    v_mbcnt_lo_u32_b32_e64 v3, s0, 0
 ; GFX7LESS-NEXT:    v_mbcnt_hi_u32_b32_e32 v3, s1, v3
@@ -5763,16 +5763,16 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent
 ; GFX7LESS-NEXT:    s_cbranch_execz .LBB9_3
 ; GFX7LESS-NEXT:  ; %bb.1:
 ; GFX7LESS-NEXT:    s_mov_b32 s33, s10
-; GFX7LESS-NEXT:    s_mov_b32 s42, s9
-; GFX7LESS-NEXT:    s_mov_b32 s43, s8
+; GFX7LESS-NEXT:    s_mov_b32 s50, s9
+; GFX7LESS-NEXT:    s_mov_b32 s51, s8
 ; GFX7LESS-NEXT:    s_mov_b64 s[34:35], s[6:7]
 ; GFX7LESS-NEXT:    s_mov_b64 s[36:37], s[4:5]
-; GFX7LESS-NEXT:    s_load_dwordx2 s[44:45], s[4:5], 0x9
+; GFX7LESS-NEXT:    s_load_dwordx2 s[52:53], s[4:5], 0x9
 ; GFX7LESS-NEXT:    s_bcnt1_i32_b64 s2, s[0:1]
-; GFX7LESS-NEXT:    s_mov_b64 s[46:47], 0
+; GFX7LESS-NEXT:    s_mov_b64 s[54:55], 0
 ; GFX7LESS-NEXT:    v_lshlrev_b32_e32 v3, 20, v2
 ; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7LESS-NEXT:    s_load_dwordx2 s[0:1], s[44:45], 0x0
+; GFX7LESS-NEXT:    s_load_dwordx2 s[0:1], s[52:53], 0x0
 ; GFX7LESS-NEXT:    v_lshlrev_b32_e32 v4, 10, v1
 ; GFX7LESS-NEXT:    v_cvt_f64_u32_e32 v[1:2], s2
 ; GFX7LESS-NEXT:    v_or_b32_e32 v4, v0, v4
@@ -5785,11 +5785,11 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent
 ; GFX7LESS-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX7LESS-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7LESS-NEXT:    v_add_f64 v[2:3], v[0:1], v[41:42]
-; GFX7LESS-NEXT:    buffer_store_dword v1, off, s[48:51], 0 offset:4
-; GFX7LESS-NEXT:    buffer_store_dword v0, off, s[48:51], 0
+; GFX7LESS-NEXT:    buffer_store_dword v1, off, s[64:67], 0 offset:4
+; GFX7LESS-NEXT:    buffer_store_dword v0, off, s[64:67], 0
 ; GFX7LESS-NEXT:    s_add_u32 s8, s36, 44
-; GFX7LESS-NEXT:    buffer_store_dword v3, off, s[48:51], 0 offset:12
-; GFX7LESS-NEXT:    buffer_store_dword v2, off, s[48:51], 0 offset:8
+; GFX7LESS-NEXT:    buffer_store_dword v3, off, s[64:67], 0 offset:12
+; GFX7LESS-NEXT:    buffer_store_dword v2, off, s[64:67], 0 offset:8
 ; GFX7LESS-NEXT:    s_addc_u32 s9, s37, 0
 ; GFX7LESS-NEXT:    s_getpc_b64 s[0:1]
 ; GFX7LESS-NEXT:    s_add_u32 s0, s0, __atomic_compare_exchange at gotpcrel32@lo+4
@@ -5802,42 +5802,42 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent
 ; GFX7LESS-NEXT:    v_mov_b32_e32 v5, 8
 ; GFX7LESS-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX7LESS-NEXT:    v_mov_b32_e32 v7, 0
-; GFX7LESS-NEXT:    s_mov_b64 s[4:5], s[40:41]
+; GFX7LESS-NEXT:    s_mov_b64 s[4:5], s[48:49]
 ; GFX7LESS-NEXT:    s_mov_b64 s[6:7], s[38:39]
 ; GFX7LESS-NEXT:    s_mov_b64 s[10:11], s[34:35]
-; GFX7LESS-NEXT:    s_mov_b32 s12, s43
-; GFX7LESS-NEXT:    s_mov_b32 s13, s42
+; GFX7LESS-NEXT:    s_mov_b32 s12, s51
+; GFX7LESS-NEXT:    s_mov_b32 s13, s50
 ; GFX7LESS-NEXT:    s_mov_b32 s14, s33
 ; GFX7LESS-NEXT:    v_mov_b32_e32 v31, v40
-; GFX7LESS-NEXT:    s_mov_b64 s[0:1], s[48:49]
-; GFX7LESS-NEXT:    s_mov_b64 s[2:3], s[50:51]
+; GFX7LESS-NEXT:    s_mov_b64 s[0:1], s[64:65]
+; GFX7LESS-NEXT:    s_mov_b64 s[2:3], s[66:67]
 ; GFX7LESS-NEXT:    s_waitcnt expcnt(0)
-; GFX7LESS-NEXT:    v_mov_b32_e32 v2, s44
-; GFX7LESS-NEXT:    v_mov_b32_e32 v3, s45
+; GFX7LESS-NEXT:    v_mov_b32_e32 v2, s52
+; GFX7LESS-NEXT:    v_mov_b32_e32 v3, s53
 ; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7LESS-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GFX7LESS-NEXT:    v_and_b32_e32 v2, 1, v0
-; GFX7LESS-NEXT:    buffer_load_dword v0, off, s[48:51], 0
-; GFX7LESS-NEXT:    buffer_load_dword v1, off, s[48:51], 0 offset:4
+; GFX7LESS-NEXT:    buffer_load_dword v0, off, s[64:67], 0
+; GFX7LESS-NEXT:    buffer_load_dword v1, off, s[64:67], 0 offset:4
 ; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v2
-; GFX7LESS-NEXT:    s_or_b64 s[46:47], vcc, s[46:47]
-; GFX7LESS-NEXT:    s_andn2_b64 exec, exec, s[46:47]
+; GFX7LESS-NEXT:    s_or_b64 s[54:55], vcc, s[54:55]
+; GFX7LESS-NEXT:    s_andn2_b64 exec, exec, s[54:55]
 ; GFX7LESS-NEXT:    s_cbranch_execnz .LBB9_2
 ; GFX7LESS-NEXT:  .LBB9_3:
 ; GFX7LESS-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: global_atomic_fadd_double_uni_address_uni_value_agent_scope_unsafe:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_mov_b32 s48, SCRATCH_RSRC_DWORD0
-; GFX9-NEXT:    s_mov_b32 s49, SCRATCH_RSRC_DWORD1
-; GFX9-NEXT:    s_mov_b32 s50, -1
-; GFX9-NEXT:    s_mov_b64 s[40:41], s[0:1]
+; GFX9-NEXT:    s_mov_b32 s64, SCRATCH_RSRC_DWORD0
+; GFX9-NEXT:    s_mov_b32 s65, SCRATCH_RSRC_DWORD1
+; GFX9-NEXT:    s_mov_b32 s66, -1
+; GFX9-NEXT:    s_mov_b64 s[48:49], s[0:1]
 ; GFX9-NEXT:    s_mov_b64 s[0:1], exec
-; GFX9-NEXT:    s_mov_b32 s51, 0xe00000
+; GFX9-NEXT:    s_mov_b32 s67, 0xe00000
 ; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v3, s0, 0
-; GFX9-NEXT:    s_add_u32 s48, s48, s11
+; GFX9-NEXT:    s_add_u32 s64, s64, s11
 ; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v3, s1, v3
-; GFX9-NEXT:    s_addc_u32 s49, s49, 0
+; GFX9-NEXT:    s_addc_u32 s65, s65, 0
 ; GFX9-NEXT:    s_mov_b64 s[38:39], s[2:3]
 ; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
 ; GFX9-NEXT:    s_movk_i32 s32, 0x800
@@ -5845,21 +5845,21 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent
 ; GFX9-NEXT:    s_cbranch_execz .LBB9_3
 ; GFX9-NEXT:  ; %bb.1:
 ; GFX9-NEXT:    s_bcnt1_i32_b64 s0, s[0:1]
-; GFX9-NEXT:    s_load_dwordx2 s[44:45], s[4:5], 0x24
+; GFX9-NEXT:    s_load_dwordx2 s[52:53], s[4:5], 0x24
 ; GFX9-NEXT:    v_cvt_f64_u32_e32 v[3:4], s0
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
 ; GFX9-NEXT:    s_mov_b32 s33, s10
 ; GFX9-NEXT:    v_mul_f64 v[41:42], v[3:4], 4.0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[44:45], 0x0
-; GFX9-NEXT:    s_mov_b32 s42, s9
-; GFX9-NEXT:    s_mov_b32 s43, s8
+; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[52:53], 0x0
+; GFX9-NEXT:    s_mov_b32 s50, s9
+; GFX9-NEXT:    s_mov_b32 s51, s8
 ; GFX9-NEXT:    s_mov_b64 s[34:35], s[6:7]
 ; GFX9-NEXT:    s_mov_b64 s[36:37], s[4:5]
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v4, s1
-; GFX9-NEXT:    s_mov_b64 s[46:47], 0
+; GFX9-NEXT:    s_mov_b64 s[54:55], 0
 ; GFX9-NEXT:    v_mov_b32_e32 v3, s0
 ; GFX9-NEXT:    v_or3_b32 v40, v0, v1, v2
 ; GFX9-NEXT:  .LBB9_2: ; %atomicrmw.start
@@ -5872,68 +5872,68 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent
 ; GFX9-NEXT:    s_add_u32 s0, s0, __atomic_compare_exchange at gotpcrel32@lo+4
 ; GFX9-NEXT:    s_addc_u32 s1, s1, __atomic_compare_exchange at gotpcrel32@hi+12
 ; GFX9-NEXT:    s_load_dwordx2 s[16:17], s[0:1], 0x0
-; GFX9-NEXT:    s_mov_b64 s[0:1], s[48:49]
-; GFX9-NEXT:    buffer_store_dword v4, off, s[48:51], 0 offset:4
-; GFX9-NEXT:    buffer_store_dword v3, off, s[48:51], 0
-; GFX9-NEXT:    s_mov_b64 s[4:5], s[40:41]
-; GFX9-NEXT:    buffer_store_dword v1, off, s[48:51], 0 offset:12
-; GFX9-NEXT:    buffer_store_dword v0, off, s[48:51], 0 offset:8
+; GFX9-NEXT:    s_mov_b64 s[0:1], s[64:65]
+; GFX9-NEXT:    buffer_store_dword v4, off, s[64:67], 0 offset:4
+; GFX9-NEXT:    buffer_store_dword v3, off, s[64:67], 0
+; GFX9-NEXT:    s_mov_b64 s[4:5], s[48:49]
+; GFX9-NEXT:    buffer_store_dword v1, off, s[64:67], 0 offset:12
+; GFX9-NEXT:    buffer_store_dword v0, off, s[64:67], 0 offset:8
 ; GFX9-NEXT:    s_mov_b64 s[6:7], s[38:39]
 ; GFX9-NEXT:    s_mov_b64 s[10:11], s[34:35]
-; GFX9-NEXT:    s_mov_b32 s12, s43
-; GFX9-NEXT:    s_mov_b32 s13, s42
+; GFX9-NEXT:    s_mov_b32 s12, s51
+; GFX9-NEXT:    s_mov_b32 s13, s50
 ; GFX9-NEXT:    s_mov_b32 s14, s33
 ; GFX9-NEXT:    v_mov_b32_e32 v31, v40
-; GFX9-NEXT:    s_mov_b64 s[2:3], s[50:51]
+; GFX9-NEXT:    s_mov_b64 s[2:3], s[66:67]
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 8
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0
-; GFX9-NEXT:    v_mov_b32_e32 v2, s44
-; GFX9-NEXT:    v_mov_b32_e32 v3, s45
+; GFX9-NEXT:    v_mov_b32_e32 v2, s52
+; GFX9-NEXT:    v_mov_b32_e32 v3, s53
 ; GFX9-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v5, 8
 ; GFX9-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v7, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[16:17]
-; GFX9-NEXT:    buffer_load_dword v3, off, s[48:51], 0
-; GFX9-NEXT:    buffer_load_dword v4, off, s[48:51], 0 offset:4
+; GFX9-NEXT:    buffer_load_dword v3, off, s[64:67], 0
+; GFX9-NEXT:    buffer_load_dword v4, off, s[64:67], 0 offset:4
 ; GFX9-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX9-NEXT:    s_or_b64 s[46:47], vcc, s[46:47]
-; GFX9-NEXT:    s_andn2_b64 exec, exec, s[46:47]
+; GFX9-NEXT:    s_or_b64 s[54:55], vcc, s[54:55]
+; GFX9-NEXT:    s_andn2_b64 exec, exec, s[54:55]
 ; GFX9-NEXT:    s_cbranch_execnz .LBB9_2
 ; GFX9-NEXT:  .LBB9_3:
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX1064-LABEL: global_atomic_fadd_double_uni_address_uni_value_agent_scope_unsafe:
 ; GFX1064:       ; %bb.0:
-; GFX1064-NEXT:    s_mov_b32 s48, SCRATCH_RSRC_DWORD0
-; GFX1064-NEXT:    s_mov_b32 s49, SCRATCH_RSRC_DWORD1
-; GFX1064-NEXT:    s_mov_b32 s50, -1
-; GFX1064-NEXT:    s_mov_b32 s51, 0x31e16000
-; GFX1064-NEXT:    s_add_u32 s48, s48, s11
+; GFX1064-NEXT:    s_mov_b32 s64, SCRATCH_RSRC_DWORD0
+; GFX1064-NEXT:    s_mov_b32 s65, SCRATCH_RSRC_DWORD1
+; GFX1064-NEXT:    s_mov_b32 s66, -1
+; GFX1064-NEXT:    s_mov_b32 s67, 0x31e16000
+; GFX1064-NEXT:    s_add_u32 s64, s64, s11
 ; GFX1064-NEXT:    s_mov_b32 s33, s10
 ; GFX1064-NEXT:    s_mov_b64 s[10:11], exec
-; GFX1064-NEXT:    s_addc_u32 s49, s49, 0
+; GFX1064-NEXT:    s_addc_u32 s65, s65, 0
 ; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v3, s10, 0
-; GFX1064-NEXT:    s_mov_b64 s[40:41], s[0:1]
+; GFX1064-NEXT:    s_mov_b64 s[48:49], s[0:1]
 ; GFX1064-NEXT:    s_movk_i32 s32, 0x800
 ; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v3, s11, v3
 ; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
 ; GFX1064-NEXT:    s_and_saveexec_b64 s[0:1], vcc
 ; GFX1064-NEXT:    s_cbranch_execz .LBB9_3
 ; GFX1064-NEXT:  ; %bb.1:
-; GFX1064-NEXT:    s_load_dwordx2 s[44:45], s[4:5], 0x24
+; GFX1064-NEXT:    s_load_dwordx2 s[52:53], s[4:5], 0x24
 ; GFX1064-NEXT:    s_bcnt1_i32_b64 s0, s[10:11]
-; GFX1064-NEXT:    s_mov_b32 s42, s9
+; GFX1064-NEXT:    s_mov_b32 s50, s9
 ; GFX1064-NEXT:    v_cvt_f64_u32_e32 v[3:4], s0
-; GFX1064-NEXT:    s_mov_b32 s43, s8
+; GFX1064-NEXT:    s_mov_b32 s51, s8
 ; GFX1064-NEXT:    s_mov_b64 s[34:35], s[6:7]
 ; GFX1064-NEXT:    s_mov_b64 s[36:37], s[4:5]
 ; GFX1064-NEXT:    s_mov_b64 s[38:39], s[2:3]
-; GFX1064-NEXT:    s_mov_b64 s[46:47], 0
+; GFX1064-NEXT:    s_mov_b64 s[54:55], 0
 ; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[44:45], 0x0
+; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[52:53], 0x0
 ; GFX1064-NEXT:    v_mul_f64 v[41:42], v[3:4], 4.0
 ; GFX1064-NEXT:    v_lshlrev_b32_e32 v3, 20, v2
 ; GFX1064-NEXT:    v_lshlrev_b32_e32 v4, 10, v1
@@ -5950,69 +5950,69 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent
 ; GFX1064-NEXT:    s_getpc_b64 s[0:1]
 ; GFX1064-NEXT:    s_add_u32 s0, s0, __atomic_compare_exchange at gotpcrel32@lo+4
 ; GFX1064-NEXT:    s_addc_u32 s1, s1, __atomic_compare_exchange at gotpcrel32@hi+12
-; GFX1064-NEXT:    buffer_store_dword v2, off, s[48:51], 0 offset:4
-; GFX1064-NEXT:    buffer_store_dword v1, off, s[48:51], 0
+; GFX1064-NEXT:    buffer_store_dword v2, off, s[64:67], 0 offset:4
+; GFX1064-NEXT:    buffer_store_dword v1, off, s[64:67], 0
 ; GFX1064-NEXT:    s_load_dwordx2 s[16:17], s[0:1], 0x0
 ; GFX1064-NEXT:    v_mov_b32_e32 v31, v40
 ; GFX1064-NEXT:    v_mov_b32_e32 v0, 8
 ; GFX1064-NEXT:    v_mov_b32_e32 v1, 0
-; GFX1064-NEXT:    v_mov_b32_e32 v2, s44
+; GFX1064-NEXT:    v_mov_b32_e32 v2, s52
 ; GFX1064-NEXT:    v_mov_b32_e32 v5, 8
 ; GFX1064-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX1064-NEXT:    v_mov_b32_e32 v7, 0
-; GFX1064-NEXT:    s_mov_b64 s[0:1], s[48:49]
-; GFX1064-NEXT:    s_mov_b64 s[4:5], s[40:41]
+; GFX1064-NEXT:    s_mov_b64 s[0:1], s[64:65]
+; GFX1064-NEXT:    s_mov_b64 s[4:5], s[48:49]
 ; GFX1064-NEXT:    s_mov_b64 s[6:7], s[38:39]
 ; GFX1064-NEXT:    s_mov_b64 s[10:11], s[34:35]
-; GFX1064-NEXT:    s_mov_b32 s12, s43
-; GFX1064-NEXT:    s_mov_b32 s13, s42
+; GFX1064-NEXT:    s_mov_b32 s12, s51
+; GFX1064-NEXT:    s_mov_b32 s13, s50
 ; GFX1064-NEXT:    s_mov_b32 s14, s33
-; GFX1064-NEXT:    s_mov_b64 s[2:3], s[50:51]
-; GFX1064-NEXT:    buffer_store_dword v4, off, s[48:51], 0 offset:12
-; GFX1064-NEXT:    buffer_store_dword v3, off, s[48:51], 0 offset:8
-; GFX1064-NEXT:    v_mov_b32_e32 v3, s45
+; GFX1064-NEXT:    s_mov_b64 s[2:3], s[66:67]
+; GFX1064-NEXT:    buffer_store_dword v4, off, s[64:67], 0 offset:12
+; GFX1064-NEXT:    buffer_store_dword v3, off, s[64:67], 0 offset:8
+; GFX1064-NEXT:    v_mov_b32_e32 v3, s53
 ; GFX1064-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1064-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GFX1064-NEXT:    s_clause 0x1
-; GFX1064-NEXT:    buffer_load_dword v1, off, s[48:51], 0
-; GFX1064-NEXT:    buffer_load_dword v2, off, s[48:51], 0 offset:4
+; GFX1064-NEXT:    buffer_load_dword v1, off, s[64:67], 0
+; GFX1064-NEXT:    buffer_load_dword v2, off, s[64:67], 0 offset:4
 ; GFX1064-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX1064-NEXT:    s_or_b64 s[46:47], vcc, s[46:47]
-; GFX1064-NEXT:    s_andn2_b64 exec, exec, s[46:47]
+; GFX1064-NEXT:    s_or_b64 s[54:55], vcc, s[54:55]
+; GFX1064-NEXT:    s_andn2_b64 exec, exec, s[54:55]
 ; GFX1064-NEXT:    s_cbranch_execnz .LBB9_2
 ; GFX1064-NEXT:  .LBB9_3:
 ; GFX1064-NEXT:    s_endpgm
 ;
 ; GFX1032-LABEL: global_atomic_fadd_double_uni_address_uni_value_agent_scope_unsafe:
 ; GFX1032:       ; %bb.0:
-; GFX1032-NEXT:    s_mov_b32 s42, s9
+; GFX1032-NEXT:    s_mov_b32 s50, s9
 ; GFX1032-NEXT:    s_mov_b32 s9, exec_lo
-; GFX1032-NEXT:    s_mov_b32 s48, SCRATCH_RSRC_DWORD0
+; GFX1032-NEXT:    s_mov_b32 s64, SCRATCH_RSRC_DWORD0
 ; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v3, s9, 0
-; GFX1032-NEXT:    s_mov_b32 s49, SCRATCH_RSRC_DWORD1
-; GFX1032-NEXT:    s_mov_b32 s50, -1
-; GFX1032-NEXT:    s_mov_b32 s51, 0x31c16000
-; GFX1032-NEXT:    s_add_u32 s48, s48, s11
+; GFX1032-NEXT:    s_mov_b32 s65, SCRATCH_RSRC_DWORD1
+; GFX1032-NEXT:    s_mov_b32 s66, -1
+; GFX1032-NEXT:    s_mov_b32 s67, 0x31c16000
+; GFX1032-NEXT:    s_add_u32 s64, s64, s11
 ; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v3
-; GFX1032-NEXT:    s_addc_u32 s49, s49, 0
-; GFX1032-NEXT:    s_mov_b64 s[40:41], s[0:1]
-; GFX1032-NEXT:    s_mov_b32 s46, 0
+; GFX1032-NEXT:    s_addc_u32 s65, s65, 0
+; GFX1032-NEXT:    s_mov_b64 s[48:49], s[0:1]
+; GFX1032-NEXT:    s_mov_b32 s54, 0
 ; GFX1032-NEXT:    s_movk_i32 s32, 0x400
 ; GFX1032-NEXT:    s_and_saveexec_b32 s0, vcc_lo
 ; GFX1032-NEXT:    s_cbranch_execz .LBB9_3
 ; GFX1032-NEXT:  ; %bb.1:
-; GFX1032-NEXT:    s_load_dwordx2 s[44:45], s[4:5], 0x24
+; GFX1032-NEXT:    s_load_dwordx2 s[52:53], s[4:5], 0x24
 ; GFX1032-NEXT:    s_bcnt1_i32_b32 s0, s9
 ; GFX1032-NEXT:    s_mov_b32 s33, s10
 ; GFX1032-NEXT:    v_cvt_f64_u32_e32 v[3:4], s0
-; GFX1032-NEXT:    s_mov_b32 s43, s8
+; GFX1032-NEXT:    s_mov_b32 s51, s8
 ; GFX1032-NEXT:    s_mov_b64 s[34:35], s[6:7]
 ; GFX1032-NEXT:    s_mov_b64 s[36:37], s[4:5]
 ; GFX1032-NEXT:    s_mov_b64 s[38:39], s[2:3]
 ; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[44:45], 0x0
+; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[52:53], 0x0
 ; GFX1032-NEXT:    v_mul_f64 v[41:42], v[3:4], 4.0
 ; GFX1032-NEXT:    v_lshlrev_b32_e32 v3, 20, v2
 ; GFX1032-NEXT:    v_lshlrev_b32_e32 v4, 10, v1
@@ -6029,37 +6029,37 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent
 ; GFX1032-NEXT:    s_getpc_b64 s[0:1]
 ; GFX1032-NEXT:    s_add_u32 s0, s0, __atomic_compare_exchange at gotpcrel32@lo+4
 ; GFX1032-NEXT:    s_addc_u32 s1, s1, __atomic_compare_exchange at gotpcrel32@hi+12
-; GFX1032-NEXT:    buffer_store_dword v2, off, s[48:51], 0 offset:4
-; GFX1032-NEXT:    buffer_store_dword v1, off, s[48:51], 0
+; GFX1032-NEXT:    buffer_store_dword v2, off, s[64:67], 0 offset:4
+; GFX1032-NEXT:    buffer_store_dword v1, off, s[64:67], 0
 ; GFX1032-NEXT:    s_load_dwordx2 s[16:17], s[0:1], 0x0
 ; GFX1032-NEXT:    v_mov_b32_e32 v31, v40
 ; GFX1032-NEXT:    v_mov_b32_e32 v0, 8
 ; GFX1032-NEXT:    v_mov_b32_e32 v1, 0
-; GFX1032-NEXT:    v_mov_b32_e32 v2, s44
+; GFX1032-NEXT:    v_mov_b32_e32 v2, s52
 ; GFX1032-NEXT:    v_mov_b32_e32 v5, 8
 ; GFX1032-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX1032-NEXT:    v_mov_b32_e32 v7, 0
-; GFX1032-NEXT:    s_mov_b64 s[0:1], s[48:49]
-; GFX1032-NEXT:    s_mov_b64 s[4:5], s[40:41]
+; GFX1032-NEXT:    s_mov_b64 s[0:1], s[64:65]
+; GFX1032-NEXT:    s_mov_b64 s[4:5], s[48:49]
 ; GFX1032-NEXT:    s_mov_b64 s[6:7], s[38:39]
 ; GFX1032-NEXT:    s_mov_b64 s[10:11], s[34:35]
-; GFX1032-NEXT:    s_mov_b32 s12, s43
-; GFX1032-NEXT:    s_mov_b32 s13, s42
+; GFX1032-NEXT:    s_mov_b32 s12, s51
+; GFX1032-NEXT:    s_mov_b32 s13, s50
 ; GFX1032-NEXT:    s_mov_b32 s14, s33
-; GFX1032-NEXT:    s_mov_b64 s[2:3], s[50:51]
-; GFX1032-NEXT:    buffer_store_dword v4, off, s[48:51], 0 offset:12
-; GFX1032-NEXT:    buffer_store_dword v3, off, s[48:51], 0 offset:8
-; GFX1032-NEXT:    v_mov_b32_e32 v3, s45
+; GFX1032-NEXT:    s_mov_b64 s[2:3], s[66:67]
+; GFX1032-NEXT:    buffer_store_dword v4, off, s[64:67], 0 offset:12
+; GFX1032-NEXT:    buffer_store_dword v3, off, s[64:67], 0 offset:8
+; GFX1032-NEXT:    v_mov_b32_e32 v3, s53
 ; GFX1032-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1032-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GFX1032-NEXT:    s_clause 0x1
-; GFX1032-NEXT:    buffer_load_dword v1, off, s[48:51], 0
-; GFX1032-NEXT:    buffer_load_dword v2, off, s[48:51], 0 offset:4
+; GFX1032-NEXT:    buffer_load_dword v1, off, s[64:67], 0
+; GFX1032-NEXT:    buffer_load_dword v2, off, s[64:67], 0 offset:4
 ; GFX1032-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX1032-NEXT:    s_or_b32 s46, vcc_lo, s46
-; GFX1032-NEXT:    s_andn2_b32 exec_lo, exec_lo, s46
+; GFX1032-NEXT:    s_or_b32 s54, vcc_lo, s54
+; GFX1032-NEXT:    s_andn2_b32 exec_lo, exec_lo, s54
 ; GFX1032-NEXT:    s_cbranch_execnz .LBB9_2
 ; GFX1032-NEXT:  .LBB9_3:
 ; GFX1032-NEXT:    s_endpgm
@@ -6070,7 +6070,7 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent
 ; GFX1164-NEXT:    s_mov_b64 s[10:11], exec
 ; GFX1164-NEXT:    v_mov_b32_e32 v40, v0
 ; GFX1164-NEXT:    v_mbcnt_lo_u32_b32 v0, s10, 0
-; GFX1164-NEXT:    s_mov_b64 s[40:41], s[0:1]
+; GFX1164-NEXT:    s_mov_b64 s[48:49], s[0:1]
 ; GFX1164-NEXT:    s_mov_b32 s32, 32
 ; GFX1164-NEXT:    s_mov_b64 s[0:1], exec
 ; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
@@ -6079,16 +6079,16 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent
 ; GFX1164-NEXT:    s_cbranch_execz .LBB9_3
 ; GFX1164-NEXT:  ; %bb.1:
 ; GFX1164-NEXT:    s_bcnt1_i32_b64 s0, s[10:11]
-; GFX1164-NEXT:    s_load_b64 s[44:45], s[4:5], 0x24
+; GFX1164-NEXT:    s_load_b64 s[52:53], s[4:5], 0x24
 ; GFX1164-NEXT:    v_cvt_f64_u32_e32 v[0:1], s0
-; GFX1164-NEXT:    s_mov_b32 s42, s9
-; GFX1164-NEXT:    s_mov_b32 s43, s8
+; GFX1164-NEXT:    s_mov_b32 s50, s9
+; GFX1164-NEXT:    s_mov_b32 s51, s8
 ; GFX1164-NEXT:    s_mov_b64 s[34:35], s[6:7]
 ; GFX1164-NEXT:    s_mov_b64 s[36:37], s[4:5]
 ; GFX1164-NEXT:    s_mov_b64 s[38:39], s[2:3]
-; GFX1164-NEXT:    s_mov_b64 s[46:47], 0
+; GFX1164-NEXT:    s_mov_b64 s[54:55], 0
 ; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT:    s_load_b64 s[0:1], s[44:45], 0x0
+; GFX1164-NEXT:    s_load_b64 s[0:1], s[52:53], 0x0
 ; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1164-NEXT:    v_mul_f64 v[41:42], v[0:1], 4.0
 ; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
@@ -6111,18 +6111,18 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent
 ; GFX1164-NEXT:    v_mov_b32_e32 v31, v40
 ; GFX1164-NEXT:    v_mov_b32_e32 v0, 8
 ; GFX1164-NEXT:    v_mov_b32_e32 v1, 0
-; GFX1164-NEXT:    v_mov_b32_e32 v2, s44
+; GFX1164-NEXT:    v_mov_b32_e32 v2, s52
 ; GFX1164-NEXT:    v_mov_b32_e32 v5, 8
 ; GFX1164-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX1164-NEXT:    v_mov_b32_e32 v7, 0
-; GFX1164-NEXT:    s_mov_b64 s[4:5], s[40:41]
+; GFX1164-NEXT:    s_mov_b64 s[4:5], s[48:49]
 ; GFX1164-NEXT:    s_mov_b64 s[6:7], s[38:39]
 ; GFX1164-NEXT:    s_mov_b64 s[10:11], s[34:35]
-; GFX1164-NEXT:    s_mov_b32 s12, s43
-; GFX1164-NEXT:    s_mov_b32 s13, s42
+; GFX1164-NEXT:    s_mov_b32 s12, s51
+; GFX1164-NEXT:    s_mov_b32 s13, s50
 ; GFX1164-NEXT:    s_mov_b32 s14, s33
 ; GFX1164-NEXT:    scratch_store_b64 off, v[3:4], off offset:8
-; GFX1164-NEXT:    v_mov_b32_e32 v3, s45
+; GFX1164-NEXT:    v_mov_b32_e32 v3, s53
 ; GFX1164-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1164-NEXT:    s_swappc_b64 s[30:31], s[0:1]
@@ -6130,8 +6130,8 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent
 ; GFX1164-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
 ; GFX1164-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX1164-NEXT:    s_or_b64 s[46:47], vcc, s[46:47]
-; GFX1164-NEXT:    s_and_not1_b64 exec, exec, s[46:47]
+; GFX1164-NEXT:    s_or_b64 s[54:55], vcc, s[54:55]
+; GFX1164-NEXT:    s_and_not1_b64 exec, exec, s[54:55]
 ; GFX1164-NEXT:    s_cbranch_execnz .LBB9_2
 ; GFX1164-NEXT:  .LBB9_3:
 ; GFX1164-NEXT:    s_set_inst_prefetch_distance 0x2
@@ -6142,24 +6142,24 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent
 ; GFX1132-NEXT:    s_mov_b32 s8, exec_lo
 ; GFX1132-NEXT:    v_mov_b32_e32 v40, v0
 ; GFX1132-NEXT:    v_mbcnt_lo_u32_b32 v0, s8, 0
-; GFX1132-NEXT:    s_mov_b64 s[40:41], s[0:1]
-; GFX1132-NEXT:    s_mov_b32 s46, 0
+; GFX1132-NEXT:    s_mov_b64 s[48:49], s[0:1]
+; GFX1132-NEXT:    s_mov_b32 s54, 0
 ; GFX1132-NEXT:    s_mov_b32 s32, 32
 ; GFX1132-NEXT:    s_mov_b32 s0, exec_lo
 ; GFX1132-NEXT:    v_cmpx_eq_u32_e32 0, v0
 ; GFX1132-NEXT:    s_cbranch_execz .LBB9_3
 ; GFX1132-NEXT:  ; %bb.1:
 ; GFX1132-NEXT:    s_bcnt1_i32_b32 s0, s8
-; GFX1132-NEXT:    s_load_b64 s[44:45], s[4:5], 0x24
+; GFX1132-NEXT:    s_load_b64 s[52:53], s[4:5], 0x24
 ; GFX1132-NEXT:    v_cvt_f64_u32_e32 v[0:1], s0
 ; GFX1132-NEXT:    s_mov_b32 s33, s15
-; GFX1132-NEXT:    s_mov_b32 s42, s14
-; GFX1132-NEXT:    s_mov_b32 s43, s13
+; GFX1132-NEXT:    s_mov_b32 s50, s14
+; GFX1132-NEXT:    s_mov_b32 s51, s13
 ; GFX1132-NEXT:    s_mov_b64 s[34:35], s[6:7]
 ; GFX1132-NEXT:    s_mov_b64 s[36:37], s[4:5]
 ; GFX1132-NEXT:    s_mov_b64 s[38:39], s[2:3]
 ; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT:    s_load_b64 s[0:1], s[44:45], 0x0
+; GFX1132-NEXT:    s_load_b64 s[0:1], s[52:53], 0x0
 ; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1132-NEXT:    v_mul_f64 v[41:42], v[0:1], 4.0
 ; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
@@ -6179,25 +6179,25 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent
 ; GFX1132-NEXT:    scratch_store_b64 off, v[1:2], off
 ; GFX1132-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
 ; GFX1132-NEXT:    v_dual_mov_b32 v31, v40 :: v_dual_mov_b32 v0, 8
-; GFX1132-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s44
+; GFX1132-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s52
 ; GFX1132-NEXT:    v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0
 ; GFX1132-NEXT:    v_mov_b32_e32 v7, 0
-; GFX1132-NEXT:    s_mov_b64 s[4:5], s[40:41]
+; GFX1132-NEXT:    s_mov_b64 s[4:5], s[48:49]
 ; GFX1132-NEXT:    s_mov_b64 s[6:7], s[38:39]
 ; GFX1132-NEXT:    s_mov_b64 s[10:11], s[34:35]
-; GFX1132-NEXT:    s_mov_b32 s12, s43
-; GFX1132-NEXT:    s_mov_b32 s13, s42
+; GFX1132-NEXT:    s_mov_b32 s12, s51
+; GFX1132-NEXT:    s_mov_b32 s13, s50
 ; GFX1132-NEXT:    s_mov_b32 s14, s33
 ; GFX1132-NEXT:    scratch_store_b64 off, v[3:4], off offset:8
-; GFX1132-NEXT:    v_dual_mov_b32 v3, s45 :: v_dual_mov_b32 v4, 0
+; GFX1132-NEXT:    v_dual_mov_b32 v3, s53 :: v_dual_mov_b32 v4, 0
 ; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX1132-NEXT:    scratch_load_b64 v[1:2], off, off
 ; GFX1132-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
 ; GFX1132-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX1132-NEXT:    s_or_b32 s46, vcc_lo, s46
-; GFX1132-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s46
+; GFX1132-NEXT:    s_or_b32 s54, vcc_lo, s54
+; GFX1132-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s54
 ; GFX1132-NEXT:    s_cbranch_execnz .LBB9_2
 ; GFX1132-NEXT:  .LBB9_3:
 ; GFX1132-NEXT:    s_set_inst_prefetch_distance 0x2
@@ -6206,14 +6206,14 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent
 ; GFX7LESS-DPP-LABEL: global_atomic_fadd_double_uni_address_uni_value_agent_scope_unsafe:
 ; GFX7LESS-DPP:       ; %bb.0:
 ; GFX7LESS-DPP-NEXT:    s_movk_i32 s32, 0x800
-; GFX7LESS-DPP-NEXT:    s_mov_b32 s48, SCRATCH_RSRC_DWORD0
-; GFX7LESS-DPP-NEXT:    s_mov_b32 s49, SCRATCH_RSRC_DWORD1
-; GFX7LESS-DPP-NEXT:    s_mov_b32 s50, -1
-; GFX7LESS-DPP-NEXT:    s_mov_b32 s51, 0xe8f000
-; GFX7LESS-DPP-NEXT:    s_add_u32 s48, s48, s11
-; GFX7LESS-DPP-NEXT:    s_addc_u32 s49, s49, 0
+; GFX7LESS-DPP-NEXT:    s_mov_b32 s64, SCRATCH_RSRC_DWORD0
+; GFX7LESS-DPP-NEXT:    s_mov_b32 s65, SCRATCH_RSRC_DWORD1
+; GFX7LESS-DPP-NEXT:    s_mov_b32 s66, -1
+; GFX7LESS-DPP-NEXT:    s_mov_b32 s67, 0xe8f000
+; GFX7LESS-DPP-NEXT:    s_add_u32 s64, s64, s11
+; GFX7LESS-DPP-NEXT:    s_addc_u32 s65, s65, 0
 ; GFX7LESS-DPP-NEXT:    s_mov_b64 s[38:39], s[2:3]
-; GFX7LESS-DPP-NEXT:    s_mov_b64 s[40:41], s[0:1]
+; GFX7LESS-DPP-NEXT:    s_mov_b64 s[48:49], s[0:1]
 ; GFX7LESS-DPP-NEXT:    s_mov_b64 s[0:1], exec
 ; GFX7LESS-DPP-NEXT:    v_mbcnt_lo_u32_b32_e64 v3, s0, 0
 ; GFX7LESS-DPP-NEXT:    v_mbcnt_hi_u32_b32_e32 v3, s1, v3
@@ -6222,16 +6222,16 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent
 ; GFX7LESS-DPP-NEXT:    s_cbranch_execz .LBB9_3
 ; GFX7LESS-DPP-NEXT:  ; %bb.1:
 ; GFX7LESS-DPP-NEXT:    s_mov_b32 s33, s10
-; GFX7LESS-DPP-NEXT:    s_mov_b32 s42, s9
-; GFX7LESS-DPP-NEXT:    s_mov_b32 s43, s8
+; GFX7LESS-DPP-NEXT:    s_mov_b32 s50, s9
+; GFX7LESS-DPP-NEXT:    s_mov_b32 s51, s8
 ; GFX7LESS-DPP-NEXT:    s_mov_b64 s[34:35], s[6:7]
 ; GFX7LESS-DPP-NEXT:    s_mov_b64 s[36:37], s[4:5]
-; GFX7LESS-DPP-NEXT:    s_load_dwordx2 s[44:45], s[4:5], 0x9
+; GFX7LESS-DPP-NEXT:    s_load_dwordx2 s[52:53], s[4:5], 0x9
 ; GFX7LESS-DPP-NEXT:    s_bcnt1_i32_b64 s2, s[0:1]
-; GFX7LESS-DPP-NEXT:    s_mov_b64 s[46:47], 0
+; GFX7LESS-DPP-NEXT:    s_mov_b64 s[54:55], 0
 ; GFX7LESS-DPP-NEXT:    v_lshlrev_b32_e32 v3, 20, v2
 ; GFX7LESS-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7LESS-DPP-NEXT:    s_load_dwordx2 s[0:1], s[44:45], 0x0
+; GFX7LESS-DPP-NEXT:    s_load_dwordx2 s[0:1], s[52:53], 0x0
 ; GFX7LESS-DPP-NEXT:    v_lshlrev_b32_e32 v4, 10, v1
 ; GFX7LESS-DPP-NEXT:    v_cvt_f64_u32_e32 v[1:2], s2
 ; GFX7LESS-DPP-NEXT:    v_or_b32_e32 v4, v0, v4
@@ -6244,11 +6244,11 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent
 ; GFX7LESS-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX7LESS-DPP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7LESS-DPP-NEXT:    v_add_f64 v[2:3], v[0:1], v[41:42]
-; GFX7LESS-DPP-NEXT:    buffer_store_dword v1, off, s[48:51], 0 offset:4
-; GFX7LESS-DPP-NEXT:    buffer_store_dword v0, off, s[48:51], 0
+; GFX7LESS-DPP-NEXT:    buffer_store_dword v1, off, s[64:67], 0 offset:4
+; GFX7LESS-DPP-NEXT:    buffer_store_dword v0, off, s[64:67], 0
 ; GFX7LESS-DPP-NEXT:    s_add_u32 s8, s36, 44
-; GFX7LESS-DPP-NEXT:    buffer_store_dword v3, off, s[48:51], 0 offset:12
-; GFX7LESS-DPP-NEXT:    buffer_store_dword v2, off, s[48:51], 0 offset:8
+; GFX7LESS-DPP-NEXT:    buffer_store_dword v3, off, s[64:67], 0 offset:12
+; GFX7LESS-DPP-NEXT:    buffer_store_dword v2, off, s[64:67], 0 offset:8
 ; GFX7LESS-DPP-NEXT:    s_addc_u32 s9, s37, 0
 ; GFX7LESS-DPP-NEXT:    s_getpc_b64 s[0:1]
 ; GFX7LESS-DPP-NEXT:    s_add_u32 s0, s0, __atomic_compare_exchange at gotpcrel32@lo+4
@@ -6261,42 +6261,42 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent
 ; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v5, 8
 ; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v7, 0
-; GFX7LESS-DPP-NEXT:    s_mov_b64 s[4:5], s[40:41]
+; GFX7LESS-DPP-NEXT:    s_mov_b64 s[4:5], s[48:49]
 ; GFX7LESS-DPP-NEXT:    s_mov_b64 s[6:7], s[38:39]
 ; GFX7LESS-DPP-NEXT:    s_mov_b64 s[10:11], s[34:35]
-; GFX7LESS-DPP-NEXT:    s_mov_b32 s12, s43
-; GFX7LESS-DPP-NEXT:    s_mov_b32 s13, s42
+; GFX7LESS-DPP-NEXT:    s_mov_b32 s12, s51
+; GFX7LESS-DPP-NEXT:    s_mov_b32 s13, s50
 ; GFX7LESS-DPP-NEXT:    s_mov_b32 s14, s33
 ; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v31, v40
-; GFX7LESS-DPP-NEXT:    s_mov_b64 s[0:1], s[48:49]
-; GFX7LESS-DPP-NEXT:    s_mov_b64 s[2:3], s[50:51]
+; GFX7LESS-DPP-NEXT:    s_mov_b64 s[0:1], s[64:65]
+; GFX7LESS-DPP-NEXT:    s_mov_b64 s[2:3], s[66:67]
 ; GFX7LESS-DPP-NEXT:    s_waitcnt expcnt(0)
-; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v2, s44
-; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v3, s45
+; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v2, s52
+; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v3, s53
 ; GFX7LESS-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7LESS-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GFX7LESS-DPP-NEXT:    v_and_b32_e32 v2, 1, v0
-; GFX7LESS-DPP-NEXT:    buffer_load_dword v0, off, s[48:51], 0
-; GFX7LESS-DPP-NEXT:    buffer_load_dword v1, off, s[48:51], 0 offset:4
+; GFX7LESS-DPP-NEXT:    buffer_load_dword v0, off, s[64:67], 0
+; GFX7LESS-DPP-NEXT:    buffer_load_dword v1, off, s[64:67], 0 offset:4
 ; GFX7LESS-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v2
-; GFX7LESS-DPP-NEXT:    s_or_b64 s[46:47], vcc, s[46:47]
-; GFX7LESS-DPP-NEXT:    s_andn2_b64 exec, exec, s[46:47]
+; GFX7LESS-DPP-NEXT:    s_or_b64 s[54:55], vcc, s[54:55]
+; GFX7LESS-DPP-NEXT:    s_andn2_b64 exec, exec, s[54:55]
 ; GFX7LESS-DPP-NEXT:    s_cbranch_execnz .LBB9_2
 ; GFX7LESS-DPP-NEXT:  .LBB9_3:
 ; GFX7LESS-DPP-NEXT:    s_endpgm
 ;
 ; GFX9-DPP-LABEL: global_atomic_fadd_double_uni_address_uni_value_agent_scope_unsafe:
 ; GFX9-DPP:       ; %bb.0:
-; GFX9-DPP-NEXT:    s_mov_b32 s48, SCRATCH_RSRC_DWORD0
-; GFX9-DPP-NEXT:    s_mov_b32 s49, SCRATCH_RSRC_DWORD1
-; GFX9-DPP-NEXT:    s_mov_b32 s50, -1
-; GFX9-DPP-NEXT:    s_mov_b64 s[40:41], s[0:1]
+; GFX9-DPP-NEXT:    s_mov_b32 s64, SCRATCH_RSRC_DWORD0
+; GFX9-DPP-NEXT:    s_mov_b32 s65, SCRATCH_RSRC_DWORD1
+; GFX9-DPP-NEXT:    s_mov_b32 s66, -1
+; GFX9-DPP-NEXT:    s_mov_b64 s[48:49], s[0:1]
 ; GFX9-DPP-NEXT:    s_mov_b64 s[0:1], exec
-; GFX9-DPP-NEXT:    s_mov_b32 s51, 0xe00000
+; GFX9-DPP-NEXT:    s_mov_b32 s67, 0xe00000
 ; GFX9-DPP-NEXT:    v_mbcnt_lo_u32_b32 v3, s0, 0
-; GFX9-DPP-NEXT:    s_add_u32 s48, s48, s11
+; GFX9-DPP-NEXT:    s_add_u32 s64, s64, s11
 ; GFX9-DPP-NEXT:    v_mbcnt_hi_u32_b32 v3, s1, v3
-; GFX9-DPP-NEXT:    s_addc_u32 s49, s49, 0
+; GFX9-DPP-NEXT:    s_addc_u32 s65, s65, 0
 ; GFX9-DPP-NEXT:    s_mov_b64 s[38:39], s[2:3]
 ; GFX9-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
 ; GFX9-DPP-NEXT:    s_movk_i32 s32, 0x800
@@ -6304,21 +6304,21 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent
 ; GFX9-DPP-NEXT:    s_cbranch_execz .LBB9_3
 ; GFX9-DPP-NEXT:  ; %bb.1:
 ; GFX9-DPP-NEXT:    s_bcnt1_i32_b64 s0, s[0:1]
-; GFX9-DPP-NEXT:    s_load_dwordx2 s[44:45], s[4:5], 0x24
+; GFX9-DPP-NEXT:    s_load_dwordx2 s[52:53], s[4:5], 0x24
 ; GFX9-DPP-NEXT:    v_cvt_f64_u32_e32 v[3:4], s0
 ; GFX9-DPP-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
 ; GFX9-DPP-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
 ; GFX9-DPP-NEXT:    s_mov_b32 s33, s10
 ; GFX9-DPP-NEXT:    v_mul_f64 v[41:42], v[3:4], 4.0
 ; GFX9-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-DPP-NEXT:    s_load_dwordx2 s[0:1], s[44:45], 0x0
-; GFX9-DPP-NEXT:    s_mov_b32 s42, s9
-; GFX9-DPP-NEXT:    s_mov_b32 s43, s8
+; GFX9-DPP-NEXT:    s_load_dwordx2 s[0:1], s[52:53], 0x0
+; GFX9-DPP-NEXT:    s_mov_b32 s50, s9
+; GFX9-DPP-NEXT:    s_mov_b32 s51, s8
 ; GFX9-DPP-NEXT:    s_mov_b64 s[34:35], s[6:7]
 ; GFX9-DPP-NEXT:    s_mov_b64 s[36:37], s[4:5]
 ; GFX9-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-DPP-NEXT:    v_mov_b32_e32 v4, s1
-; GFX9-DPP-NEXT:    s_mov_b64 s[46:47], 0
+; GFX9-DPP-NEXT:    s_mov_b64 s[54:55], 0
 ; GFX9-DPP-NEXT:    v_mov_b32_e32 v3, s0
 ; GFX9-DPP-NEXT:    v_or3_b32 v40, v0, v1, v2
 ; GFX9-DPP-NEXT:  .LBB9_2: ; %atomicrmw.start
@@ -6331,68 +6331,68 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent
 ; GFX9-DPP-NEXT:    s_add_u32 s0, s0, __atomic_compare_exchange at gotpcrel32@lo+4
 ; GFX9-DPP-NEXT:    s_addc_u32 s1, s1, __atomic_compare_exchange at gotpcrel32@hi+12
 ; GFX9-DPP-NEXT:    s_load_dwordx2 s[16:17], s[0:1], 0x0
-; GFX9-DPP-NEXT:    s_mov_b64 s[0:1], s[48:49]
-; GFX9-DPP-NEXT:    buffer_store_dword v4, off, s[48:51], 0 offset:4
-; GFX9-DPP-NEXT:    buffer_store_dword v3, off, s[48:51], 0
-; GFX9-DPP-NEXT:    s_mov_b64 s[4:5], s[40:41]
-; GFX9-DPP-NEXT:    buffer_store_dword v1, off, s[48:51], 0 offset:12
-; GFX9-DPP-NEXT:    buffer_store_dword v0, off, s[48:51], 0 offset:8
+; GFX9-DPP-NEXT:    s_mov_b64 s[0:1], s[64:65]
+; GFX9-DPP-NEXT:    buffer_store_dword v4, off, s[64:67], 0 offset:4
+; GFX9-DPP-NEXT:    buffer_store_dword v3, off, s[64:67], 0
+; GFX9-DPP-NEXT:    s_mov_b64 s[4:5], s[48:49]
+; GFX9-DPP-NEXT:    buffer_store_dword v1, off, s[64:67], 0 offset:12
+; GFX9-DPP-NEXT:    buffer_store_dword v0, off, s[64:67], 0 offset:8
 ; GFX9-DPP-NEXT:    s_mov_b64 s[6:7], s[38:39]
 ; GFX9-DPP-NEXT:    s_mov_b64 s[10:11], s[34:35]
-; GFX9-DPP-NEXT:    s_mov_b32 s12, s43
-; GFX9-DPP-NEXT:    s_mov_b32 s13, s42
+; GFX9-DPP-NEXT:    s_mov_b32 s12, s51
+; GFX9-DPP-NEXT:    s_mov_b32 s13, s50
 ; GFX9-DPP-NEXT:    s_mov_b32 s14, s33
 ; GFX9-DPP-NEXT:    v_mov_b32_e32 v31, v40
-; GFX9-DPP-NEXT:    s_mov_b64 s[2:3], s[50:51]
+; GFX9-DPP-NEXT:    s_mov_b64 s[2:3], s[66:67]
 ; GFX9-DPP-NEXT:    v_mov_b32_e32 v0, 8
 ; GFX9-DPP-NEXT:    v_mov_b32_e32 v1, 0
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v2, s44
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v3, s45
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v2, s52
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v3, s53
 ; GFX9-DPP-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX9-DPP-NEXT:    v_mov_b32_e32 v5, 8
 ; GFX9-DPP-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX9-DPP-NEXT:    v_mov_b32_e32 v7, 0
 ; GFX9-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
-; GFX9-DPP-NEXT:    buffer_load_dword v3, off, s[48:51], 0
-; GFX9-DPP-NEXT:    buffer_load_dword v4, off, s[48:51], 0 offset:4
+; GFX9-DPP-NEXT:    buffer_load_dword v3, off, s[64:67], 0
+; GFX9-DPP-NEXT:    buffer_load_dword v4, off, s[64:67], 0 offset:4
 ; GFX9-DPP-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX9-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX9-DPP-NEXT:    s_or_b64 s[46:47], vcc, s[46:47]
-; GFX9-DPP-NEXT:    s_andn2_b64 exec, exec, s[46:47]
+; GFX9-DPP-NEXT:    s_or_b64 s[54:55], vcc, s[54:55]
+; GFX9-DPP-NEXT:    s_andn2_b64 exec, exec, s[54:55]
 ; GFX9-DPP-NEXT:    s_cbranch_execnz .LBB9_2
 ; GFX9-DPP-NEXT:  .LBB9_3:
 ; GFX9-DPP-NEXT:    s_endpgm
 ;
 ; GFX1064-DPP-LABEL: global_atomic_fadd_double_uni_address_uni_value_agent_scope_unsafe:
 ; GFX1064-DPP:       ; %bb.0:
-; GFX1064-DPP-NEXT:    s_mov_b32 s48, SCRATCH_RSRC_DWORD0
-; GFX1064-DPP-NEXT:    s_mov_b32 s49, SCRATCH_RSRC_DWORD1
-; GFX1064-DPP-NEXT:    s_mov_b32 s50, -1
-; GFX1064-DPP-NEXT:    s_mov_b32 s51, 0x31e16000
-; GFX1064-DPP-NEXT:    s_add_u32 s48, s48, s11
+; GFX1064-DPP-NEXT:    s_mov_b32 s64, SCRATCH_RSRC_DWORD0
+; GFX1064-DPP-NEXT:    s_mov_b32 s65, SCRATCH_RSRC_DWORD1
+; GFX1064-DPP-NEXT:    s_mov_b32 s66, -1
+; GFX1064-DPP-NEXT:    s_mov_b32 s67, 0x31e16000
+; GFX1064-DPP-NEXT:    s_add_u32 s64, s64, s11
 ; GFX1064-DPP-NEXT:    s_mov_b32 s33, s10
 ; GFX1064-DPP-NEXT:    s_mov_b64 s[10:11], exec
-; GFX1064-DPP-NEXT:    s_addc_u32 s49, s49, 0
+; GFX1064-DPP-NEXT:    s_addc_u32 s65, s65, 0
 ; GFX1064-DPP-NEXT:    v_mbcnt_lo_u32_b32 v3, s10, 0
-; GFX1064-DPP-NEXT:    s_mov_b64 s[40:41], s[0:1]
+; GFX1064-DPP-NEXT:    s_mov_b64 s[48:49], s[0:1]
 ; GFX1064-DPP-NEXT:    s_movk_i32 s32, 0x800
 ; GFX1064-DPP-NEXT:    v_mbcnt_hi_u32_b32 v3, s11, v3
 ; GFX1064-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
 ; GFX1064-DPP-NEXT:    s_and_saveexec_b64 s[0:1], vcc
 ; GFX1064-DPP-NEXT:    s_cbranch_execz .LBB9_3
 ; GFX1064-DPP-NEXT:  ; %bb.1:
-; GFX1064-DPP-NEXT:    s_load_dwordx2 s[44:45], s[4:5], 0x24
+; GFX1064-DPP-NEXT:    s_load_dwordx2 s[52:53], s[4:5], 0x24
 ; GFX1064-DPP-NEXT:    s_bcnt1_i32_b64 s0, s[10:11]
-; GFX1064-DPP-NEXT:    s_mov_b32 s42, s9
+; GFX1064-DPP-NEXT:    s_mov_b32 s50, s9
 ; GFX1064-DPP-NEXT:    v_cvt_f64_u32_e32 v[3:4], s0
-; GFX1064-DPP-NEXT:    s_mov_b32 s43, s8
+; GFX1064-DPP-NEXT:    s_mov_b32 s51, s8
 ; GFX1064-DPP-NEXT:    s_mov_b64 s[34:35], s[6:7]
 ; GFX1064-DPP-NEXT:    s_mov_b64 s[36:37], s[4:5]
 ; GFX1064-DPP-NEXT:    s_mov_b64 s[38:39], s[2:3]
-; GFX1064-DPP-NEXT:    s_mov_b64 s[46:47], 0
+; GFX1064-DPP-NEXT:    s_mov_b64 s[54:55], 0
 ; GFX1064-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1064-DPP-NEXT:    s_load_dwordx2 s[0:1], s[44:45], 0x0
+; GFX1064-DPP-NEXT:    s_load_dwordx2 s[0:1], s[52:53], 0x0
 ; GFX1064-DPP-NEXT:    v_mul_f64 v[41:42], v[3:4], 4.0
 ; GFX1064-DPP-NEXT:    v_lshlrev_b32_e32 v3, 20, v2
 ; GFX1064-DPP-NEXT:    v_lshlrev_b32_e32 v4, 10, v1
@@ -6409,69 +6409,69 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent
 ; GFX1064-DPP-NEXT:    s_getpc_b64 s[0:1]
 ; GFX1064-DPP-NEXT:    s_add_u32 s0, s0, __atomic_compare_exchange at gotpcrel32@lo+4
 ; GFX1064-DPP-NEXT:    s_addc_u32 s1, s1, __atomic_compare_exchange at gotpcrel32@hi+12
-; GFX1064-DPP-NEXT:    buffer_store_dword v2, off, s[48:51], 0 offset:4
-; GFX1064-DPP-NEXT:    buffer_store_dword v1, off, s[48:51], 0
+; GFX1064-DPP-NEXT:    buffer_store_dword v2, off, s[64:67], 0 offset:4
+; GFX1064-DPP-NEXT:    buffer_store_dword v1, off, s[64:67], 0
 ; GFX1064-DPP-NEXT:    s_load_dwordx2 s[16:17], s[0:1], 0x0
 ; GFX1064-DPP-NEXT:    v_mov_b32_e32 v31, v40
 ; GFX1064-DPP-NEXT:    v_mov_b32_e32 v0, 8
 ; GFX1064-DPP-NEXT:    v_mov_b32_e32 v1, 0
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v2, s44
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v2, s52
 ; GFX1064-DPP-NEXT:    v_mov_b32_e32 v5, 8
 ; GFX1064-DPP-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX1064-DPP-NEXT:    v_mov_b32_e32 v7, 0
-; GFX1064-DPP-NEXT:    s_mov_b64 s[0:1], s[48:49]
-; GFX1064-DPP-NEXT:    s_mov_b64 s[4:5], s[40:41]
+; GFX1064-DPP-NEXT:    s_mov_b64 s[0:1], s[64:65]
+; GFX1064-DPP-NEXT:    s_mov_b64 s[4:5], s[48:49]
 ; GFX1064-DPP-NEXT:    s_mov_b64 s[6:7], s[38:39]
 ; GFX1064-DPP-NEXT:    s_mov_b64 s[10:11], s[34:35]
-; GFX1064-DPP-NEXT:    s_mov_b32 s12, s43
-; GFX1064-DPP-NEXT:    s_mov_b32 s13, s42
+; GFX1064-DPP-NEXT:    s_mov_b32 s12, s51
+; GFX1064-DPP-NEXT:    s_mov_b32 s13, s50
 ; GFX1064-DPP-NEXT:    s_mov_b32 s14, s33
-; GFX1064-DPP-NEXT:    s_mov_b64 s[2:3], s[50:51]
-; GFX1064-DPP-NEXT:    buffer_store_dword v4, off, s[48:51], 0 offset:12
-; GFX1064-DPP-NEXT:    buffer_store_dword v3, off, s[48:51], 0 offset:8
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v3, s45
+; GFX1064-DPP-NEXT:    s_mov_b64 s[2:3], s[66:67]
+; GFX1064-DPP-NEXT:    buffer_store_dword v4, off, s[64:67], 0 offset:12
+; GFX1064-DPP-NEXT:    buffer_store_dword v3, off, s[64:67], 0 offset:8
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v3, s53
 ; GFX1064-DPP-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX1064-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1064-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GFX1064-DPP-NEXT:    s_clause 0x1
-; GFX1064-DPP-NEXT:    buffer_load_dword v1, off, s[48:51], 0
-; GFX1064-DPP-NEXT:    buffer_load_dword v2, off, s[48:51], 0 offset:4
+; GFX1064-DPP-NEXT:    buffer_load_dword v1, off, s[64:67], 0
+; GFX1064-DPP-NEXT:    buffer_load_dword v2, off, s[64:67], 0 offset:4
 ; GFX1064-DPP-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX1064-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX1064-DPP-NEXT:    s_or_b64 s[46:47], vcc, s[46:47]
-; GFX1064-DPP-NEXT:    s_andn2_b64 exec, exec, s[46:47]
+; GFX1064-DPP-NEXT:    s_or_b64 s[54:55], vcc, s[54:55]
+; GFX1064-DPP-NEXT:    s_andn2_b64 exec, exec, s[54:55]
 ; GFX1064-DPP-NEXT:    s_cbranch_execnz .LBB9_2
 ; GFX1064-DPP-NEXT:  .LBB9_3:
 ; GFX1064-DPP-NEXT:    s_endpgm
 ;
 ; GFX1032-DPP-LABEL: global_atomic_fadd_double_uni_address_uni_value_agent_scope_unsafe:
 ; GFX1032-DPP:       ; %bb.0:
-; GFX1032-DPP-NEXT:    s_mov_b32 s42, s9
+; GFX1032-DPP-NEXT:    s_mov_b32 s50, s9
 ; GFX1032-DPP-NEXT:    s_mov_b32 s9, exec_lo
-; GFX1032-DPP-NEXT:    s_mov_b32 s48, SCRATCH_RSRC_DWORD0
+; GFX1032-DPP-NEXT:    s_mov_b32 s64, SCRATCH_RSRC_DWORD0
 ; GFX1032-DPP-NEXT:    v_mbcnt_lo_u32_b32 v3, s9, 0
-; GFX1032-DPP-NEXT:    s_mov_b32 s49, SCRATCH_RSRC_DWORD1
-; GFX1032-DPP-NEXT:    s_mov_b32 s50, -1
-; GFX1032-DPP-NEXT:    s_mov_b32 s51, 0x31c16000
-; GFX1032-DPP-NEXT:    s_add_u32 s48, s48, s11
+; GFX1032-DPP-NEXT:    s_mov_b32 s65, SCRATCH_RSRC_DWORD1
+; GFX1032-DPP-NEXT:    s_mov_b32 s66, -1
+; GFX1032-DPP-NEXT:    s_mov_b32 s67, 0x31c16000
+; GFX1032-DPP-NEXT:    s_add_u32 s64, s64, s11
 ; GFX1032-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v3
-; GFX1032-DPP-NEXT:    s_addc_u32 s49, s49, 0
-; GFX1032-DPP-NEXT:    s_mov_b64 s[40:41], s[0:1]
-; GFX1032-DPP-NEXT:    s_mov_b32 s46, 0
+; GFX1032-DPP-NEXT:    s_addc_u32 s65, s65, 0
+; GFX1032-DPP-NEXT:    s_mov_b64 s[48:49], s[0:1]
+; GFX1032-DPP-NEXT:    s_mov_b32 s54, 0
 ; GFX1032-DPP-NEXT:    s_movk_i32 s32, 0x400
 ; GFX1032-DPP-NEXT:    s_and_saveexec_b32 s0, vcc_lo
 ; GFX1032-DPP-NEXT:    s_cbranch_execz .LBB9_3
 ; GFX1032-DPP-NEXT:  ; %bb.1:
-; GFX1032-DPP-NEXT:    s_load_dwordx2 s[44:45], s[4:5], 0x24
+; GFX1032-DPP-NEXT:    s_load_dwordx2 s[52:53], s[4:5], 0x24
 ; GFX1032-DPP-NEXT:    s_bcnt1_i32_b32 s0, s9
 ; GFX1032-DPP-NEXT:    s_mov_b32 s33, s10
 ; GFX1032-DPP-NEXT:    v_cvt_f64_u32_e32 v[3:4], s0
-; GFX1032-DPP-NEXT:    s_mov_b32 s43, s8
+; GFX1032-DPP-NEXT:    s_mov_b32 s51, s8
 ; GFX1032-DPP-NEXT:    s_mov_b64 s[34:35], s[6:7]
 ; GFX1032-DPP-NEXT:    s_mov_b64 s[36:37], s[4:5]
 ; GFX1032-DPP-NEXT:    s_mov_b64 s[38:39], s[2:3]
 ; GFX1032-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT:    s_load_dwordx2 s[0:1], s[44:45], 0x0
+; GFX1032-DPP-NEXT:    s_load_dwordx2 s[0:1], s[52:53], 0x0
 ; GFX1032-DPP-NEXT:    v_mul_f64 v[41:42], v[3:4], 4.0
 ; GFX1032-DPP-NEXT:    v_lshlrev_b32_e32 v3, 20, v2
 ; GFX1032-DPP-NEXT:    v_lshlrev_b32_e32 v4, 10, v1
@@ -6488,37 +6488,37 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent
 ; GFX1032-DPP-NEXT:    s_getpc_b64 s[0:1]
 ; GFX1032-DPP-NEXT:    s_add_u32 s0, s0, __atomic_compare_exchange at gotpcrel32@lo+4
 ; GFX1032-DPP-NEXT:    s_addc_u32 s1, s1, __atomic_compare_exchange at gotpcrel32@hi+12
-; GFX1032-DPP-NEXT:    buffer_store_dword v2, off, s[48:51], 0 offset:4
-; GFX1032-DPP-NEXT:    buffer_store_dword v1, off, s[48:51], 0
+; GFX1032-DPP-NEXT:    buffer_store_dword v2, off, s[64:67], 0 offset:4
+; GFX1032-DPP-NEXT:    buffer_store_dword v1, off, s[64:67], 0
 ; GFX1032-DPP-NEXT:    s_load_dwordx2 s[16:17], s[0:1], 0x0
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v31, v40
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v0, 8
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v1, 0
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v2, s44
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v2, s52
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v5, 8
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v7, 0
-; GFX1032-DPP-NEXT:    s_mov_b64 s[0:1], s[48:49]
-; GFX1032-DPP-NEXT:    s_mov_b64 s[4:5], s[40:41]
+; GFX1032-DPP-NEXT:    s_mov_b64 s[0:1], s[64:65]
+; GFX1032-DPP-NEXT:    s_mov_b64 s[4:5], s[48:49]
 ; GFX1032-DPP-NEXT:    s_mov_b64 s[6:7], s[38:39]
 ; GFX1032-DPP-NEXT:    s_mov_b64 s[10:11], s[34:35]
-; GFX1032-DPP-NEXT:    s_mov_b32 s12, s43
-; GFX1032-DPP-NEXT:    s_mov_b32 s13, s42
+; GFX1032-DPP-NEXT:    s_mov_b32 s12, s51
+; GFX1032-DPP-NEXT:    s_mov_b32 s13, s50
 ; GFX1032-DPP-NEXT:    s_mov_b32 s14, s33
-; GFX1032-DPP-NEXT:    s_mov_b64 s[2:3], s[50:51]
-; GFX1032-DPP-NEXT:    buffer_store_dword v4, off, s[48:51], 0 offset:12
-; GFX1032-DPP-NEXT:    buffer_store_dword v3, off, s[48:51], 0 offset:8
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v3, s45
+; GFX1032-DPP-NEXT:    s_mov_b64 s[2:3], s[66:67]
+; GFX1032-DPP-NEXT:    buffer_store_dword v4, off, s[64:67], 0 offset:12
+; GFX1032-DPP-NEXT:    buffer_store_dword v3, off, s[64:67], 0 offset:8
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v3, s53
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX1032-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1032-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GFX1032-DPP-NEXT:    s_clause 0x1
-; GFX1032-DPP-NEXT:    buffer_load_dword v1, off, s[48:51], 0
-; GFX1032-DPP-NEXT:    buffer_load_dword v2, off, s[48:51], 0 offset:4
+; GFX1032-DPP-NEXT:    buffer_load_dword v1, off, s[64:67], 0
+; GFX1032-DPP-NEXT:    buffer_load_dword v2, off, s[64:67], 0 offset:4
 ; GFX1032-DPP-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX1032-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX1032-DPP-NEXT:    s_or_b32 s46, vcc_lo, s46
-; GFX1032-DPP-NEXT:    s_andn2_b32 exec_lo, exec_lo, s46
+; GFX1032-DPP-NEXT:    s_or_b32 s54, vcc_lo, s54
+; GFX1032-DPP-NEXT:    s_andn2_b32 exec_lo, exec_lo, s54
 ; GFX1032-DPP-NEXT:    s_cbranch_execnz .LBB9_2
 ; GFX1032-DPP-NEXT:  .LBB9_3:
 ; GFX1032-DPP-NEXT:    s_endpgm
@@ -6529,7 +6529,7 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent
 ; GFX1164-DPP-NEXT:    s_mov_b64 s[10:11], exec
 ; GFX1164-DPP-NEXT:    v_mov_b32_e32 v40, v0
 ; GFX1164-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, s10, 0
-; GFX1164-DPP-NEXT:    s_mov_b64 s[40:41], s[0:1]
+; GFX1164-DPP-NEXT:    s_mov_b64 s[48:49], s[0:1]
 ; GFX1164-DPP-NEXT:    s_mov_b32 s32, 32
 ; GFX1164-DPP-NEXT:    s_mov_b64 s[0:1], exec
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
@@ -6538,16 +6538,16 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent
 ; GFX1164-DPP-NEXT:    s_cbranch_execz .LBB9_3
 ; GFX1164-DPP-NEXT:  ; %bb.1:
 ; GFX1164-DPP-NEXT:    s_bcnt1_i32_b64 s0, s[10:11]
-; GFX1164-DPP-NEXT:    s_load_b64 s[44:45], s[4:5], 0x24
+; GFX1164-DPP-NEXT:    s_load_b64 s[52:53], s[4:5], 0x24
 ; GFX1164-DPP-NEXT:    v_cvt_f64_u32_e32 v[0:1], s0
-; GFX1164-DPP-NEXT:    s_mov_b32 s42, s9
-; GFX1164-DPP-NEXT:    s_mov_b32 s43, s8
+; GFX1164-DPP-NEXT:    s_mov_b32 s50, s9
+; GFX1164-DPP-NEXT:    s_mov_b32 s51, s8
 ; GFX1164-DPP-NEXT:    s_mov_b64 s[34:35], s[6:7]
 ; GFX1164-DPP-NEXT:    s_mov_b64 s[36:37], s[4:5]
 ; GFX1164-DPP-NEXT:    s_mov_b64 s[38:39], s[2:3]
-; GFX1164-DPP-NEXT:    s_mov_b64 s[46:47], 0
+; GFX1164-DPP-NEXT:    s_mov_b64 s[54:55], 0
 ; GFX1164-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT:    s_load_b64 s[0:1], s[44:45], 0x0
+; GFX1164-DPP-NEXT:    s_load_b64 s[0:1], s[52:53], 0x0
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1164-DPP-NEXT:    v_mul_f64 v[41:42], v[0:1], 4.0
 ; GFX1164-DPP-NEXT:    s_waitcnt lgkmcnt(0)
@@ -6570,18 +6570,18 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent
 ; GFX1164-DPP-NEXT:    v_mov_b32_e32 v31, v40
 ; GFX1164-DPP-NEXT:    v_mov_b32_e32 v0, 8
 ; GFX1164-DPP-NEXT:    v_mov_b32_e32 v1, 0
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v2, s44
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v2, s52
 ; GFX1164-DPP-NEXT:    v_mov_b32_e32 v5, 8
 ; GFX1164-DPP-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX1164-DPP-NEXT:    v_mov_b32_e32 v7, 0
-; GFX1164-DPP-NEXT:    s_mov_b64 s[4:5], s[40:41]
+; GFX1164-DPP-NEXT:    s_mov_b64 s[4:5], s[48:49]
 ; GFX1164-DPP-NEXT:    s_mov_b64 s[6:7], s[38:39]
 ; GFX1164-DPP-NEXT:    s_mov_b64 s[10:11], s[34:35]
-; GFX1164-DPP-NEXT:    s_mov_b32 s12, s43
-; GFX1164-DPP-NEXT:    s_mov_b32 s13, s42
+; GFX1164-DPP-NEXT:    s_mov_b32 s12, s51
+; GFX1164-DPP-NEXT:    s_mov_b32 s13, s50
 ; GFX1164-DPP-NEXT:    s_mov_b32 s14, s33
 ; GFX1164-DPP-NEXT:    scratch_store_b64 off, v[3:4], off offset:8
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v3, s45
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v3, s53
 ; GFX1164-DPP-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX1164-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1164-DPP-NEXT:    s_swappc_b64 s[30:31], s[0:1]
@@ -6589,8 +6589,8 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent
 ; GFX1164-DPP-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
 ; GFX1164-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX1164-DPP-NEXT:    s_or_b64 s[46:47], vcc, s[46:47]
-; GFX1164-DPP-NEXT:    s_and_not1_b64 exec, exec, s[46:47]
+; GFX1164-DPP-NEXT:    s_or_b64 s[54:55], vcc, s[54:55]
+; GFX1164-DPP-NEXT:    s_and_not1_b64 exec, exec, s[54:55]
 ; GFX1164-DPP-NEXT:    s_cbranch_execnz .LBB9_2
 ; GFX1164-DPP-NEXT:  .LBB9_3:
 ; GFX1164-DPP-NEXT:    s_set_inst_prefetch_distance 0x2
@@ -6601,24 +6601,24 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent
 ; GFX1132-DPP-NEXT:    s_mov_b32 s8, exec_lo
 ; GFX1132-DPP-NEXT:    v_mov_b32_e32 v40, v0
 ; GFX1132-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, s8, 0
-; GFX1132-DPP-NEXT:    s_mov_b64 s[40:41], s[0:1]
-; GFX1132-DPP-NEXT:    s_mov_b32 s46, 0
+; GFX1132-DPP-NEXT:    s_mov_b64 s[48:49], s[0:1]
+; GFX1132-DPP-NEXT:    s_mov_b32 s54, 0
 ; GFX1132-DPP-NEXT:    s_mov_b32 s32, 32
 ; GFX1132-DPP-NEXT:    s_mov_b32 s0, exec_lo
 ; GFX1132-DPP-NEXT:    v_cmpx_eq_u32_e32 0, v0
 ; GFX1132-DPP-NEXT:    s_cbranch_execz .LBB9_3
 ; GFX1132-DPP-NEXT:  ; %bb.1:
 ; GFX1132-DPP-NEXT:    s_bcnt1_i32_b32 s0, s8
-; GFX1132-DPP-NEXT:    s_load_b64 s[44:45], s[4:5], 0x24
+; GFX1132-DPP-NEXT:    s_load_b64 s[52:53], s[4:5], 0x24
 ; GFX1132-DPP-NEXT:    v_cvt_f64_u32_e32 v[0:1], s0
 ; GFX1132-DPP-NEXT:    s_mov_b32 s33, s15
-; GFX1132-DPP-NEXT:    s_mov_b32 s42, s14
-; GFX1132-DPP-NEXT:    s_mov_b32 s43, s13
+; GFX1132-DPP-NEXT:    s_mov_b32 s50, s14
+; GFX1132-DPP-NEXT:    s_mov_b32 s51, s13
 ; GFX1132-DPP-NEXT:    s_mov_b64 s[34:35], s[6:7]
 ; GFX1132-DPP-NEXT:    s_mov_b64 s[36:37], s[4:5]
 ; GFX1132-DPP-NEXT:    s_mov_b64 s[38:39], s[2:3]
 ; GFX1132-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT:    s_load_b64 s[0:1], s[44:45], 0x0
+; GFX1132-DPP-NEXT:    s_load_b64 s[0:1], s[52:53], 0x0
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1132-DPP-NEXT:    v_mul_f64 v[41:42], v[0:1], 4.0
 ; GFX1132-DPP-NEXT:    s_waitcnt lgkmcnt(0)
@@ -6638,25 +6638,25 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent
 ; GFX1132-DPP-NEXT:    scratch_store_b64 off, v[1:2], off
 ; GFX1132-DPP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
 ; GFX1132-DPP-NEXT:    v_dual_mov_b32 v31, v40 :: v_dual_mov_b32 v0, 8
-; GFX1132-DPP-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s44
+; GFX1132-DPP-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s52
 ; GFX1132-DPP-NEXT:    v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0
 ; GFX1132-DPP-NEXT:    v_mov_b32_e32 v7, 0
-; GFX1132-DPP-NEXT:    s_mov_b64 s[4:5], s[40:41]
+; GFX1132-DPP-NEXT:    s_mov_b64 s[4:5], s[48:49]
 ; GFX1132-DPP-NEXT:    s_mov_b64 s[6:7], s[38:39]
 ; GFX1132-DPP-NEXT:    s_mov_b64 s[10:11], s[34:35]
-; GFX1132-DPP-NEXT:    s_mov_b32 s12, s43
-; GFX1132-DPP-NEXT:    s_mov_b32 s13, s42
+; GFX1132-DPP-NEXT:    s_mov_b32 s12, s51
+; GFX1132-DPP-NEXT:    s_mov_b32 s13, s50
 ; GFX1132-DPP-NEXT:    s_mov_b32 s14, s33
 ; GFX1132-DPP-NEXT:    scratch_store_b64 off, v[3:4], off offset:8
-; GFX1132-DPP-NEXT:    v_dual_mov_b32 v3, s45 :: v_dual_mov_b32 v4, 0
+; GFX1132-DPP-NEXT:    v_dual_mov_b32 v3, s53 :: v_dual_mov_b32 v4, 0
 ; GFX1132-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132-DPP-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX1132-DPP-NEXT:    scratch_load_b64 v[1:2], off, off
 ; GFX1132-DPP-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
 ; GFX1132-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX1132-DPP-NEXT:    s_or_b32 s46, vcc_lo, s46
-; GFX1132-DPP-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s46
+; GFX1132-DPP-NEXT:    s_or_b32 s54, vcc_lo, s54
+; GFX1132-DPP-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s54
 ; GFX1132-DPP-NEXT:    s_cbranch_execnz .LBB9_2
 ; GFX1132-DPP-NEXT:  .LBB9_3:
 ; GFX1132-DPP-NEXT:    s_set_inst_prefetch_distance 0x2
@@ -6669,19 +6669,19 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
 ; GFX7LESS-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_align4_unsafe:
 ; GFX7LESS:       ; %bb.0:
 ; GFX7LESS-NEXT:    s_movk_i32 s32, 0x800
-; GFX7LESS-NEXT:    s_mov_b32 s48, SCRATCH_RSRC_DWORD0
-; GFX7LESS-NEXT:    s_mov_b32 s49, SCRATCH_RSRC_DWORD1
-; GFX7LESS-NEXT:    s_mov_b32 s50, -1
-; GFX7LESS-NEXT:    s_mov_b32 s51, 0xe8f000
-; GFX7LESS-NEXT:    s_add_u32 s48, s48, s11
-; GFX7LESS-NEXT:    s_addc_u32 s49, s49, 0
+; GFX7LESS-NEXT:    s_mov_b32 s64, SCRATCH_RSRC_DWORD0
+; GFX7LESS-NEXT:    s_mov_b32 s65, SCRATCH_RSRC_DWORD1
+; GFX7LESS-NEXT:    s_mov_b32 s66, -1
+; GFX7LESS-NEXT:    s_mov_b32 s67, 0xe8f000
+; GFX7LESS-NEXT:    s_add_u32 s64, s64, s11
+; GFX7LESS-NEXT:    s_addc_u32 s65, s65, 0
 ; GFX7LESS-NEXT:    s_mov_b32 s33, s10
-; GFX7LESS-NEXT:    s_mov_b32 s42, s9
-; GFX7LESS-NEXT:    s_mov_b32 s43, s8
+; GFX7LESS-NEXT:    s_mov_b32 s50, s9
+; GFX7LESS-NEXT:    s_mov_b32 s51, s8
 ; GFX7LESS-NEXT:    s_mov_b64 s[34:35], s[6:7]
 ; GFX7LESS-NEXT:    s_mov_b64 s[36:37], s[4:5]
 ; GFX7LESS-NEXT:    s_mov_b64 s[38:39], s[2:3]
-; GFX7LESS-NEXT:    s_mov_b64 s[40:41], s[0:1]
+; GFX7LESS-NEXT:    s_mov_b64 s[48:49], s[0:1]
 ; GFX7LESS-NEXT:    s_add_u32 s8, s36, 44
 ; GFX7LESS-NEXT:    s_addc_u32 s9, s37, 0
 ; GFX7LESS-NEXT:    s_getpc_b64 s[0:1]
@@ -6692,15 +6692,15 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
 ; GFX7LESS-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
 ; GFX7LESS-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GFX7LESS-NEXT:    v_or_b32_e32 v40, v0, v2
-; GFX7LESS-NEXT:    s_mov_b64 s[4:5], s[40:41]
+; GFX7LESS-NEXT:    s_mov_b64 s[4:5], s[48:49]
 ; GFX7LESS-NEXT:    s_mov_b64 s[6:7], s[2:3]
 ; GFX7LESS-NEXT:    s_mov_b64 s[10:11], s[34:35]
-; GFX7LESS-NEXT:    s_mov_b32 s12, s43
-; GFX7LESS-NEXT:    s_mov_b32 s13, s42
+; GFX7LESS-NEXT:    s_mov_b32 s12, s51
+; GFX7LESS-NEXT:    s_mov_b32 s13, s50
 ; GFX7LESS-NEXT:    s_mov_b32 s14, s33
 ; GFX7LESS-NEXT:    v_mov_b32_e32 v31, v40
-; GFX7LESS-NEXT:    s_mov_b64 s[0:1], s[48:49]
-; GFX7LESS-NEXT:    s_mov_b64 s[2:3], s[50:51]
+; GFX7LESS-NEXT:    s_mov_b64 s[0:1], s[64:65]
+; GFX7LESS-NEXT:    s_mov_b64 s[2:3], s[66:67]
 ; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7LESS-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GFX7LESS-NEXT:    s_mov_b64 s[0:1], exec
@@ -6725,21 +6725,21 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
 ; GFX7LESS-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
 ; GFX7LESS-NEXT:    s_cbranch_execz .LBB10_5
 ; GFX7LESS-NEXT:  ; %bb.3:
-; GFX7LESS-NEXT:    s_load_dwordx2 s[44:45], s[36:37], 0x9
-; GFX7LESS-NEXT:    s_mov_b32 s47, 0xf000
-; GFX7LESS-NEXT:    s_mov_b32 s46, -1
+; GFX7LESS-NEXT:    s_load_dwordx2 s[52:53], s[36:37], 0x9
+; GFX7LESS-NEXT:    s_mov_b32 s55, 0xf000
+; GFX7LESS-NEXT:    s_mov_b32 s54, -1
 ; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7LESS-NEXT:    buffer_load_dwordx2 v[0:1], off, s[44:47], 0
-; GFX7LESS-NEXT:    s_mov_b64 s[46:47], 0
+; GFX7LESS-NEXT:    buffer_load_dwordx2 v[0:1], off, s[52:55], 0
+; GFX7LESS-NEXT:    s_mov_b64 s[54:55], 0
 ; GFX7LESS-NEXT:  .LBB10_4: ; %atomicrmw.start
 ; GFX7LESS-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX7LESS-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7LESS-NEXT:    v_add_f64 v[2:3], v[0:1], v[41:42]
-; GFX7LESS-NEXT:    buffer_store_dword v1, off, s[48:51], 0 offset:4
-; GFX7LESS-NEXT:    buffer_store_dword v0, off, s[48:51], 0
+; GFX7LESS-NEXT:    buffer_store_dword v1, off, s[64:67], 0 offset:4
+; GFX7LESS-NEXT:    buffer_store_dword v0, off, s[64:67], 0
 ; GFX7LESS-NEXT:    s_add_u32 s8, s36, 44
-; GFX7LESS-NEXT:    buffer_store_dword v3, off, s[48:51], 0 offset:12
-; GFX7LESS-NEXT:    buffer_store_dword v2, off, s[48:51], 0 offset:8
+; GFX7LESS-NEXT:    buffer_store_dword v3, off, s[64:67], 0 offset:12
+; GFX7LESS-NEXT:    buffer_store_dword v2, off, s[64:67], 0 offset:8
 ; GFX7LESS-NEXT:    s_addc_u32 s9, s37, 0
 ; GFX7LESS-NEXT:    s_getpc_b64 s[0:1]
 ; GFX7LESS-NEXT:    s_add_u32 s0, s0, __atomic_compare_exchange at gotpcrel32@lo+4
@@ -6752,44 +6752,44 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
 ; GFX7LESS-NEXT:    v_mov_b32_e32 v5, 8
 ; GFX7LESS-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX7LESS-NEXT:    v_mov_b32_e32 v7, 0
-; GFX7LESS-NEXT:    s_mov_b64 s[4:5], s[40:41]
+; GFX7LESS-NEXT:    s_mov_b64 s[4:5], s[48:49]
 ; GFX7LESS-NEXT:    s_mov_b64 s[6:7], s[38:39]
 ; GFX7LESS-NEXT:    s_mov_b64 s[10:11], s[34:35]
-; GFX7LESS-NEXT:    s_mov_b32 s12, s43
-; GFX7LESS-NEXT:    s_mov_b32 s13, s42
+; GFX7LESS-NEXT:    s_mov_b32 s12, s51
+; GFX7LESS-NEXT:    s_mov_b32 s13, s50
 ; GFX7LESS-NEXT:    s_mov_b32 s14, s33
 ; GFX7LESS-NEXT:    v_mov_b32_e32 v31, v40
-; GFX7LESS-NEXT:    s_mov_b64 s[0:1], s[48:49]
-; GFX7LESS-NEXT:    s_mov_b64 s[2:3], s[50:51]
+; GFX7LESS-NEXT:    s_mov_b64 s[0:1], s[64:65]
+; GFX7LESS-NEXT:    s_mov_b64 s[2:3], s[66:67]
 ; GFX7LESS-NEXT:    s_waitcnt expcnt(0)
-; GFX7LESS-NEXT:    v_mov_b32_e32 v2, s44
-; GFX7LESS-NEXT:    v_mov_b32_e32 v3, s45
+; GFX7LESS-NEXT:    v_mov_b32_e32 v2, s52
+; GFX7LESS-NEXT:    v_mov_b32_e32 v3, s53
 ; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7LESS-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GFX7LESS-NEXT:    v_and_b32_e32 v2, 1, v0
-; GFX7LESS-NEXT:    buffer_load_dword v0, off, s[48:51], 0
-; GFX7LESS-NEXT:    buffer_load_dword v1, off, s[48:51], 0 offset:4
+; GFX7LESS-NEXT:    buffer_load_dword v0, off, s[64:67], 0
+; GFX7LESS-NEXT:    buffer_load_dword v1, off, s[64:67], 0 offset:4
 ; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v2
-; GFX7LESS-NEXT:    s_or_b64 s[46:47], vcc, s[46:47]
-; GFX7LESS-NEXT:    s_andn2_b64 exec, exec, s[46:47]
+; GFX7LESS-NEXT:    s_or_b64 s[54:55], vcc, s[54:55]
+; GFX7LESS-NEXT:    s_andn2_b64 exec, exec, s[54:55]
 ; GFX7LESS-NEXT:    s_cbranch_execnz .LBB10_4
 ; GFX7LESS-NEXT:  .LBB10_5:
 ; GFX7LESS-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_align4_unsafe:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_mov_b32 s48, SCRATCH_RSRC_DWORD0
-; GFX9-NEXT:    s_mov_b32 s49, SCRATCH_RSRC_DWORD1
-; GFX9-NEXT:    s_mov_b32 s50, -1
-; GFX9-NEXT:    s_mov_b32 s51, 0xe00000
-; GFX9-NEXT:    s_add_u32 s48, s48, s11
-; GFX9-NEXT:    s_addc_u32 s49, s49, 0
+; GFX9-NEXT:    s_mov_b32 s64, SCRATCH_RSRC_DWORD0
+; GFX9-NEXT:    s_mov_b32 s65, SCRATCH_RSRC_DWORD1
+; GFX9-NEXT:    s_mov_b32 s66, -1
+; GFX9-NEXT:    s_mov_b32 s67, 0xe00000
+; GFX9-NEXT:    s_add_u32 s64, s64, s11
+; GFX9-NEXT:    s_addc_u32 s65, s65, 0
 ; GFX9-NEXT:    s_mov_b64 s[36:37], s[4:5]
-; GFX9-NEXT:    s_mov_b32 s43, s8
+; GFX9-NEXT:    s_mov_b32 s51, s8
 ; GFX9-NEXT:    s_add_u32 s8, s36, 44
-; GFX9-NEXT:    s_mov_b32 s42, s9
+; GFX9-NEXT:    s_mov_b32 s50, s9
 ; GFX9-NEXT:    s_addc_u32 s9, s37, 0
-; GFX9-NEXT:    s_mov_b64 s[40:41], s[0:1]
+; GFX9-NEXT:    s_mov_b64 s[48:49], s[0:1]
 ; GFX9-NEXT:    s_getpc_b64 s[0:1]
 ; GFX9-NEXT:    s_add_u32 s0, s0, div.float.value at gotpcrel32@lo+4
 ; GFX9-NEXT:    s_addc_u32 s1, s1, div.float.value at gotpcrel32@hi+12
@@ -6801,14 +6801,14 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
 ; GFX9-NEXT:    s_mov_b64 s[38:39], s[2:3]
 ; GFX9-NEXT:    v_or3_b32 v40, v0, v1, v2
 ; GFX9-NEXT:    s_mov_b64 s[6:7], s[2:3]
-; GFX9-NEXT:    s_mov_b64 s[0:1], s[48:49]
-; GFX9-NEXT:    s_mov_b64 s[4:5], s[40:41]
+; GFX9-NEXT:    s_mov_b64 s[0:1], s[64:65]
+; GFX9-NEXT:    s_mov_b64 s[4:5], s[48:49]
 ; GFX9-NEXT:    s_mov_b64 s[10:11], s[34:35]
-; GFX9-NEXT:    s_mov_b32 s12, s43
-; GFX9-NEXT:    s_mov_b32 s13, s42
+; GFX9-NEXT:    s_mov_b32 s12, s51
+; GFX9-NEXT:    s_mov_b32 s13, s50
 ; GFX9-NEXT:    s_mov_b32 s14, s33
 ; GFX9-NEXT:    v_mov_b32_e32 v31, v40
-; GFX9-NEXT:    s_mov_b64 s[2:3], s[50:51]
+; GFX9-NEXT:    s_mov_b64 s[2:3], s[66:67]
 ; GFX9-NEXT:    s_movk_i32 s32, 0x800
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[16:17]
@@ -6833,11 +6833,11 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
 ; GFX9-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
 ; GFX9-NEXT:    s_cbranch_execz .LBB10_5
 ; GFX9-NEXT:  ; %bb.3:
-; GFX9-NEXT:    s_load_dwordx2 s[44:45], s[36:37], 0x24
+; GFX9-NEXT:    s_load_dwordx2 s[52:53], s[36:37], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
-; GFX9-NEXT:    s_mov_b64 s[46:47], 0
+; GFX9-NEXT:    s_mov_b64 s[54:55], 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dwordx2 v[1:2], v0, s[44:45]
+; GFX9-NEXT:    global_load_dwordx2 v[1:2], v0, s[52:53]
 ; GFX9-NEXT:  .LBB10_4: ; %atomicrmw.start
 ; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
@@ -6848,53 +6848,53 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
 ; GFX9-NEXT:    s_add_u32 s0, s0, __atomic_compare_exchange at gotpcrel32@lo+4
 ; GFX9-NEXT:    s_addc_u32 s1, s1, __atomic_compare_exchange at gotpcrel32@hi+12
 ; GFX9-NEXT:    s_load_dwordx2 s[16:17], s[0:1], 0x0
-; GFX9-NEXT:    s_mov_b64 s[0:1], s[48:49]
-; GFX9-NEXT:    buffer_store_dword v2, off, s[48:51], 0 offset:4
-; GFX9-NEXT:    buffer_store_dword v1, off, s[48:51], 0
-; GFX9-NEXT:    s_mov_b64 s[4:5], s[40:41]
-; GFX9-NEXT:    buffer_store_dword v4, off, s[48:51], 0 offset:12
-; GFX9-NEXT:    buffer_store_dword v3, off, s[48:51], 0 offset:8
+; GFX9-NEXT:    s_mov_b64 s[0:1], s[64:65]
+; GFX9-NEXT:    buffer_store_dword v2, off, s[64:67], 0 offset:4
+; GFX9-NEXT:    buffer_store_dword v1, off, s[64:67], 0
+; GFX9-NEXT:    s_mov_b64 s[4:5], s[48:49]
+; GFX9-NEXT:    buffer_store_dword v4, off, s[64:67], 0 offset:12
+; GFX9-NEXT:    buffer_store_dword v3, off, s[64:67], 0 offset:8
 ; GFX9-NEXT:    s_mov_b64 s[6:7], s[38:39]
 ; GFX9-NEXT:    s_mov_b64 s[10:11], s[34:35]
-; GFX9-NEXT:    s_mov_b32 s12, s43
-; GFX9-NEXT:    s_mov_b32 s13, s42
+; GFX9-NEXT:    s_mov_b32 s12, s51
+; GFX9-NEXT:    s_mov_b32 s13, s50
 ; GFX9-NEXT:    s_mov_b32 s14, s33
 ; GFX9-NEXT:    v_mov_b32_e32 v31, v40
-; GFX9-NEXT:    s_mov_b64 s[2:3], s[50:51]
+; GFX9-NEXT:    s_mov_b64 s[2:3], s[66:67]
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 8
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0
-; GFX9-NEXT:    v_mov_b32_e32 v2, s44
-; GFX9-NEXT:    v_mov_b32_e32 v3, s45
+; GFX9-NEXT:    v_mov_b32_e32 v2, s52
+; GFX9-NEXT:    v_mov_b32_e32 v3, s53
 ; GFX9-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v5, 8
 ; GFX9-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v7, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[16:17]
-; GFX9-NEXT:    buffer_load_dword v1, off, s[48:51], 0
-; GFX9-NEXT:    buffer_load_dword v2, off, s[48:51], 0 offset:4
+; GFX9-NEXT:    buffer_load_dword v1, off, s[64:67], 0
+; GFX9-NEXT:    buffer_load_dword v2, off, s[64:67], 0 offset:4
 ; GFX9-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX9-NEXT:    s_or_b64 s[46:47], vcc, s[46:47]
-; GFX9-NEXT:    s_andn2_b64 exec, exec, s[46:47]
+; GFX9-NEXT:    s_or_b64 s[54:55], vcc, s[54:55]
+; GFX9-NEXT:    s_andn2_b64 exec, exec, s[54:55]
 ; GFX9-NEXT:    s_cbranch_execnz .LBB10_4
 ; GFX9-NEXT:  .LBB10_5:
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX1064-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_align4_unsafe:
 ; GFX1064:       ; %bb.0:
-; GFX1064-NEXT:    s_mov_b32 s48, SCRATCH_RSRC_DWORD0
-; GFX1064-NEXT:    s_mov_b32 s49, SCRATCH_RSRC_DWORD1
-; GFX1064-NEXT:    s_mov_b32 s50, -1
-; GFX1064-NEXT:    s_mov_b32 s51, 0x31e16000
-; GFX1064-NEXT:    s_add_u32 s48, s48, s11
+; GFX1064-NEXT:    s_mov_b32 s64, SCRATCH_RSRC_DWORD0
+; GFX1064-NEXT:    s_mov_b32 s65, SCRATCH_RSRC_DWORD1
+; GFX1064-NEXT:    s_mov_b32 s66, -1
+; GFX1064-NEXT:    s_mov_b32 s67, 0x31e16000
+; GFX1064-NEXT:    s_add_u32 s64, s64, s11
 ; GFX1064-NEXT:    s_mov_b64 s[34:35], s[4:5]
-; GFX1064-NEXT:    s_addc_u32 s49, s49, 0
-; GFX1064-NEXT:    s_mov_b32 s43, s8
+; GFX1064-NEXT:    s_addc_u32 s65, s65, 0
+; GFX1064-NEXT:    s_mov_b32 s51, s8
 ; GFX1064-NEXT:    s_add_u32 s8, s34, 44
-; GFX1064-NEXT:    s_mov_b32 s42, s9
+; GFX1064-NEXT:    s_mov_b32 s50, s9
 ; GFX1064-NEXT:    s_addc_u32 s9, s35, 0
-; GFX1064-NEXT:    s_mov_b64 s[40:41], s[0:1]
+; GFX1064-NEXT:    s_mov_b64 s[48:49], s[0:1]
 ; GFX1064-NEXT:    s_getpc_b64 s[0:1]
 ; GFX1064-NEXT:    s_add_u32 s0, s0, div.float.value at gotpcrel32@lo+4
 ; GFX1064-NEXT:    s_addc_u32 s1, s1, div.float.value at gotpcrel32@hi+12
@@ -6906,14 +6906,14 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
 ; GFX1064-NEXT:    s_mov_b64 s[38:39], s[2:3]
 ; GFX1064-NEXT:    s_mov_b64 s[6:7], s[2:3]
 ; GFX1064-NEXT:    v_or3_b32 v40, v0, v1, v2
-; GFX1064-NEXT:    s_mov_b64 s[0:1], s[48:49]
-; GFX1064-NEXT:    s_mov_b64 s[4:5], s[40:41]
+; GFX1064-NEXT:    s_mov_b64 s[0:1], s[64:65]
+; GFX1064-NEXT:    s_mov_b64 s[4:5], s[48:49]
 ; GFX1064-NEXT:    s_mov_b64 s[10:11], s[36:37]
-; GFX1064-NEXT:    s_mov_b32 s12, s43
+; GFX1064-NEXT:    s_mov_b32 s12, s51
 ; GFX1064-NEXT:    v_mov_b32_e32 v31, v40
-; GFX1064-NEXT:    s_mov_b32 s13, s42
+; GFX1064-NEXT:    s_mov_b32 s13, s50
 ; GFX1064-NEXT:    s_mov_b32 s14, s33
-; GFX1064-NEXT:    s_mov_b64 s[2:3], s[50:51]
+; GFX1064-NEXT:    s_mov_b64 s[2:3], s[66:67]
 ; GFX1064-NEXT:    s_movk_i32 s32, 0x800
 ; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1064-NEXT:    s_swappc_b64 s[30:31], s[16:17]
@@ -6938,11 +6938,11 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
 ; GFX1064-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
 ; GFX1064-NEXT:    s_cbranch_execz .LBB10_5
 ; GFX1064-NEXT:  ; %bb.3:
-; GFX1064-NEXT:    s_load_dwordx2 s[44:45], s[34:35], 0x24
+; GFX1064-NEXT:    s_load_dwordx2 s[52:53], s[34:35], 0x24
 ; GFX1064-NEXT:    v_mov_b32_e32 v0, 0
-; GFX1064-NEXT:    s_mov_b64 s[46:47], 0
+; GFX1064-NEXT:    s_mov_b64 s[54:55], 0
 ; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT:    global_load_dwordx2 v[1:2], v0, s[44:45]
+; GFX1064-NEXT:    global_load_dwordx2 v[1:2], v0, s[52:53]
 ; GFX1064-NEXT:  .LBB10_4: ; %atomicrmw.start
 ; GFX1064-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1064-NEXT:    s_waitcnt vmcnt(0)
@@ -6952,55 +6952,55 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
 ; GFX1064-NEXT:    s_getpc_b64 s[0:1]
 ; GFX1064-NEXT:    s_add_u32 s0, s0, __atomic_compare_exchange at gotpcrel32@lo+4
 ; GFX1064-NEXT:    s_addc_u32 s1, s1, __atomic_compare_exchange at gotpcrel32@hi+12
-; GFX1064-NEXT:    buffer_store_dword v2, off, s[48:51], 0 offset:4
-; GFX1064-NEXT:    buffer_store_dword v1, off, s[48:51], 0
+; GFX1064-NEXT:    buffer_store_dword v2, off, s[64:67], 0 offset:4
+; GFX1064-NEXT:    buffer_store_dword v1, off, s[64:67], 0
 ; GFX1064-NEXT:    s_load_dwordx2 s[16:17], s[0:1], 0x0
 ; GFX1064-NEXT:    v_mov_b32_e32 v31, v40
 ; GFX1064-NEXT:    v_mov_b32_e32 v0, 8
 ; GFX1064-NEXT:    v_mov_b32_e32 v1, 0
-; GFX1064-NEXT:    v_mov_b32_e32 v2, s44
+; GFX1064-NEXT:    v_mov_b32_e32 v2, s52
 ; GFX1064-NEXT:    v_mov_b32_e32 v5, 8
 ; GFX1064-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX1064-NEXT:    v_mov_b32_e32 v7, 0
-; GFX1064-NEXT:    s_mov_b64 s[0:1], s[48:49]
-; GFX1064-NEXT:    s_mov_b64 s[4:5], s[40:41]
+; GFX1064-NEXT:    s_mov_b64 s[0:1], s[64:65]
+; GFX1064-NEXT:    s_mov_b64 s[4:5], s[48:49]
 ; GFX1064-NEXT:    s_mov_b64 s[6:7], s[38:39]
 ; GFX1064-NEXT:    s_mov_b64 s[10:11], s[36:37]
-; GFX1064-NEXT:    s_mov_b32 s12, s43
-; GFX1064-NEXT:    s_mov_b32 s13, s42
+; GFX1064-NEXT:    s_mov_b32 s12, s51
+; GFX1064-NEXT:    s_mov_b32 s13, s50
 ; GFX1064-NEXT:    s_mov_b32 s14, s33
-; GFX1064-NEXT:    s_mov_b64 s[2:3], s[50:51]
-; GFX1064-NEXT:    buffer_store_dword v4, off, s[48:51], 0 offset:12
-; GFX1064-NEXT:    buffer_store_dword v3, off, s[48:51], 0 offset:8
-; GFX1064-NEXT:    v_mov_b32_e32 v3, s45
+; GFX1064-NEXT:    s_mov_b64 s[2:3], s[66:67]
+; GFX1064-NEXT:    buffer_store_dword v4, off, s[64:67], 0 offset:12
+; GFX1064-NEXT:    buffer_store_dword v3, off, s[64:67], 0 offset:8
+; GFX1064-NEXT:    v_mov_b32_e32 v3, s53
 ; GFX1064-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1064-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GFX1064-NEXT:    s_clause 0x1
-; GFX1064-NEXT:    buffer_load_dword v1, off, s[48:51], 0
-; GFX1064-NEXT:    buffer_load_dword v2, off, s[48:51], 0 offset:4
+; GFX1064-NEXT:    buffer_load_dword v1, off, s[64:67], 0
+; GFX1064-NEXT:    buffer_load_dword v2, off, s[64:67], 0 offset:4
 ; GFX1064-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX1064-NEXT:    s_or_b64 s[46:47], vcc, s[46:47]
-; GFX1064-NEXT:    s_andn2_b64 exec, exec, s[46:47]
+; GFX1064-NEXT:    s_or_b64 s[54:55], vcc, s[54:55]
+; GFX1064-NEXT:    s_andn2_b64 exec, exec, s[54:55]
 ; GFX1064-NEXT:    s_cbranch_execnz .LBB10_4
 ; GFX1064-NEXT:  .LBB10_5:
 ; GFX1064-NEXT:    s_endpgm
 ;
 ; GFX1032-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_align4_unsafe:
 ; GFX1032:       ; %bb.0:
-; GFX1032-NEXT:    s_mov_b32 s48, SCRATCH_RSRC_DWORD0
-; GFX1032-NEXT:    s_mov_b32 s49, SCRATCH_RSRC_DWORD1
-; GFX1032-NEXT:    s_mov_b32 s50, -1
-; GFX1032-NEXT:    s_mov_b32 s51, 0x31c16000
-; GFX1032-NEXT:    s_add_u32 s48, s48, s11
+; GFX1032-NEXT:    s_mov_b32 s64, SCRATCH_RSRC_DWORD0
+; GFX1032-NEXT:    s_mov_b32 s65, SCRATCH_RSRC_DWORD1
+; GFX1032-NEXT:    s_mov_b32 s66, -1
+; GFX1032-NEXT:    s_mov_b32 s67, 0x31c16000
+; GFX1032-NEXT:    s_add_u32 s64, s64, s11
 ; GFX1032-NEXT:    s_mov_b64 s[34:35], s[4:5]
-; GFX1032-NEXT:    s_addc_u32 s49, s49, 0
-; GFX1032-NEXT:    s_mov_b32 s43, s8
+; GFX1032-NEXT:    s_addc_u32 s65, s65, 0
+; GFX1032-NEXT:    s_mov_b32 s51, s8
 ; GFX1032-NEXT:    s_add_u32 s8, s34, 44
-; GFX1032-NEXT:    s_mov_b32 s42, s9
+; GFX1032-NEXT:    s_mov_b32 s50, s9
 ; GFX1032-NEXT:    s_addc_u32 s9, s35, 0
-; GFX1032-NEXT:    s_mov_b64 s[40:41], s[0:1]
+; GFX1032-NEXT:    s_mov_b64 s[48:49], s[0:1]
 ; GFX1032-NEXT:    s_getpc_b64 s[0:1]
 ; GFX1032-NEXT:    s_add_u32 s0, s0, div.float.value at gotpcrel32@lo+4
 ; GFX1032-NEXT:    s_addc_u32 s1, s1, div.float.value at gotpcrel32@hi+12
@@ -7012,14 +7012,14 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
 ; GFX1032-NEXT:    s_mov_b64 s[38:39], s[2:3]
 ; GFX1032-NEXT:    s_mov_b64 s[6:7], s[2:3]
 ; GFX1032-NEXT:    v_or3_b32 v40, v0, v1, v2
-; GFX1032-NEXT:    s_mov_b64 s[0:1], s[48:49]
-; GFX1032-NEXT:    s_mov_b64 s[4:5], s[40:41]
+; GFX1032-NEXT:    s_mov_b64 s[0:1], s[64:65]
+; GFX1032-NEXT:    s_mov_b64 s[4:5], s[48:49]
 ; GFX1032-NEXT:    s_mov_b64 s[10:11], s[36:37]
-; GFX1032-NEXT:    s_mov_b32 s12, s43
+; GFX1032-NEXT:    s_mov_b32 s12, s51
 ; GFX1032-NEXT:    v_mov_b32_e32 v31, v40
-; GFX1032-NEXT:    s_mov_b32 s13, s42
+; GFX1032-NEXT:    s_mov_b32 s13, s50
 ; GFX1032-NEXT:    s_mov_b32 s14, s33
-; GFX1032-NEXT:    s_mov_b64 s[2:3], s[50:51]
+; GFX1032-NEXT:    s_mov_b64 s[2:3], s[66:67]
 ; GFX1032-NEXT:    s_movk_i32 s32, 0x400
 ; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1032-NEXT:    s_swappc_b64 s[30:31], s[16:17]
@@ -7038,16 +7038,16 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
 ; GFX1032-NEXT:    s_cbranch_scc1 .LBB10_1
 ; GFX1032-NEXT:  ; %bb.2: ; %ComputeEnd
 ; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1032-NEXT:    s_mov_b32 s46, 0
+; GFX1032-NEXT:    s_mov_b32 s54, 0
 ; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX1032-NEXT:    s_and_saveexec_b32 s0, vcc_lo
 ; GFX1032-NEXT:    s_xor_b32 s0, exec_lo, s0
 ; GFX1032-NEXT:    s_cbranch_execz .LBB10_5
 ; GFX1032-NEXT:  ; %bb.3:
-; GFX1032-NEXT:    s_load_dwordx2 s[44:45], s[34:35], 0x24
+; GFX1032-NEXT:    s_load_dwordx2 s[52:53], s[34:35], 0x24
 ; GFX1032-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT:    global_load_dwordx2 v[1:2], v0, s[44:45]
+; GFX1032-NEXT:    global_load_dwordx2 v[1:2], v0, s[52:53]
 ; GFX1032-NEXT:  .LBB10_4: ; %atomicrmw.start
 ; GFX1032-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1032-NEXT:    s_waitcnt vmcnt(0)
@@ -7057,37 +7057,37 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
 ; GFX1032-NEXT:    s_getpc_b64 s[0:1]
 ; GFX1032-NEXT:    s_add_u32 s0, s0, __atomic_compare_exchange at gotpcrel32@lo+4
 ; GFX1032-NEXT:    s_addc_u32 s1, s1, __atomic_compare_exchange at gotpcrel32@hi+12
-; GFX1032-NEXT:    buffer_store_dword v2, off, s[48:51], 0 offset:4
-; GFX1032-NEXT:    buffer_store_dword v1, off, s[48:51], 0
+; GFX1032-NEXT:    buffer_store_dword v2, off, s[64:67], 0 offset:4
+; GFX1032-NEXT:    buffer_store_dword v1, off, s[64:67], 0
 ; GFX1032-NEXT:    s_load_dwordx2 s[16:17], s[0:1], 0x0
 ; GFX1032-NEXT:    v_mov_b32_e32 v31, v40
 ; GFX1032-NEXT:    v_mov_b32_e32 v0, 8
 ; GFX1032-NEXT:    v_mov_b32_e32 v1, 0
-; GFX1032-NEXT:    v_mov_b32_e32 v2, s44
+; GFX1032-NEXT:    v_mov_b32_e32 v2, s52
 ; GFX1032-NEXT:    v_mov_b32_e32 v5, 8
 ; GFX1032-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX1032-NEXT:    v_mov_b32_e32 v7, 0
-; GFX1032-NEXT:    s_mov_b64 s[0:1], s[48:49]
-; GFX1032-NEXT:    s_mov_b64 s[4:5], s[40:41]
+; GFX1032-NEXT:    s_mov_b64 s[0:1], s[64:65]
+; GFX1032-NEXT:    s_mov_b64 s[4:5], s[48:49]
 ; GFX1032-NEXT:    s_mov_b64 s[6:7], s[38:39]
 ; GFX1032-NEXT:    s_mov_b64 s[10:11], s[36:37]
-; GFX1032-NEXT:    s_mov_b32 s12, s43
-; GFX1032-NEXT:    s_mov_b32 s13, s42
+; GFX1032-NEXT:    s_mov_b32 s12, s51
+; GFX1032-NEXT:    s_mov_b32 s13, s50
 ; GFX1032-NEXT:    s_mov_b32 s14, s33
-; GFX1032-NEXT:    s_mov_b64 s[2:3], s[50:51]
-; GFX1032-NEXT:    buffer_store_dword v4, off, s[48:51], 0 offset:12
-; GFX1032-NEXT:    buffer_store_dword v3, off, s[48:51], 0 offset:8
-; GFX1032-NEXT:    v_mov_b32_e32 v3, s45
+; GFX1032-NEXT:    s_mov_b64 s[2:3], s[66:67]
+; GFX1032-NEXT:    buffer_store_dword v4, off, s[64:67], 0 offset:12
+; GFX1032-NEXT:    buffer_store_dword v3, off, s[64:67], 0 offset:8
+; GFX1032-NEXT:    v_mov_b32_e32 v3, s53
 ; GFX1032-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1032-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GFX1032-NEXT:    s_clause 0x1
-; GFX1032-NEXT:    buffer_load_dword v1, off, s[48:51], 0
-; GFX1032-NEXT:    buffer_load_dword v2, off, s[48:51], 0 offset:4
+; GFX1032-NEXT:    buffer_load_dword v1, off, s[64:67], 0
+; GFX1032-NEXT:    buffer_load_dword v2, off, s[64:67], 0 offset:4
 ; GFX1032-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX1032-NEXT:    s_or_b32 s46, vcc_lo, s46
-; GFX1032-NEXT:    s_andn2_b32 exec_lo, exec_lo, s46
+; GFX1032-NEXT:    s_or_b32 s54, vcc_lo, s54
+; GFX1032-NEXT:    s_andn2_b32 exec_lo, exec_lo, s54
 ; GFX1032-NEXT:    s_cbranch_execnz .LBB10_4
 ; GFX1032-NEXT:  .LBB10_5:
 ; GFX1032-NEXT:    s_endpgm
@@ -7095,11 +7095,11 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
 ; GFX1164-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_align4_unsafe:
 ; GFX1164:       ; %bb.0:
 ; GFX1164-NEXT:    s_mov_b64 s[34:35], s[4:5]
-; GFX1164-NEXT:    s_mov_b32 s43, s8
+; GFX1164-NEXT:    s_mov_b32 s51, s8
 ; GFX1164-NEXT:    s_add_u32 s8, s34, 44
-; GFX1164-NEXT:    s_mov_b32 s42, s9
+; GFX1164-NEXT:    s_mov_b32 s50, s9
 ; GFX1164-NEXT:    s_addc_u32 s9, s35, 0
-; GFX1164-NEXT:    s_mov_b64 s[40:41], s[0:1]
+; GFX1164-NEXT:    s_mov_b64 s[48:49], s[0:1]
 ; GFX1164-NEXT:    s_getpc_b64 s[0:1]
 ; GFX1164-NEXT:    s_add_u32 s0, s0, div.float.value at gotpcrel32@lo+4
 ; GFX1164-NEXT:    s_addc_u32 s1, s1, div.float.value at gotpcrel32@hi+12
@@ -7107,11 +7107,11 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
 ; GFX1164-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
 ; GFX1164-NEXT:    s_mov_b32 s33, s10
 ; GFX1164-NEXT:    s_mov_b64 s[36:37], s[6:7]
-; GFX1164-NEXT:    s_mov_b64 s[4:5], s[40:41]
+; GFX1164-NEXT:    s_mov_b64 s[4:5], s[48:49]
 ; GFX1164-NEXT:    s_mov_b64 s[6:7], s[2:3]
 ; GFX1164-NEXT:    s_mov_b64 s[10:11], s[36:37]
-; GFX1164-NEXT:    s_mov_b32 s12, s43
-; GFX1164-NEXT:    s_mov_b32 s13, s42
+; GFX1164-NEXT:    s_mov_b32 s12, s51
+; GFX1164-NEXT:    s_mov_b32 s13, s50
 ; GFX1164-NEXT:    s_mov_b32 s14, s33
 ; GFX1164-NEXT:    s_mov_b32 s32, 32
 ; GFX1164-NEXT:    v_mov_b32_e32 v40, v0
@@ -7143,11 +7143,11 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
 ; GFX1164-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
 ; GFX1164-NEXT:    s_cbranch_execz .LBB10_5
 ; GFX1164-NEXT:  ; %bb.3:
-; GFX1164-NEXT:    s_load_b64 s[44:45], s[34:35], 0x24
+; GFX1164-NEXT:    s_load_b64 s[52:53], s[34:35], 0x24
 ; GFX1164-NEXT:    v_mov_b32_e32 v0, 0
-; GFX1164-NEXT:    s_mov_b64 s[46:47], 0
+; GFX1164-NEXT:    s_mov_b64 s[54:55], 0
 ; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT:    global_load_b64 v[1:2], v0, s[44:45]
+; GFX1164-NEXT:    global_load_b64 v[1:2], v0, s[52:53]
 ; GFX1164-NEXT:    s_set_inst_prefetch_distance 0x1
 ; GFX1164-NEXT:    .p2align 6
 ; GFX1164-NEXT:  .LBB10_4: ; %atomicrmw.start
@@ -7164,18 +7164,18 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
 ; GFX1164-NEXT:    v_mov_b32_e32 v31, v40
 ; GFX1164-NEXT:    v_mov_b32_e32 v0, 8
 ; GFX1164-NEXT:    v_mov_b32_e32 v1, 0
-; GFX1164-NEXT:    v_mov_b32_e32 v2, s44
+; GFX1164-NEXT:    v_mov_b32_e32 v2, s52
 ; GFX1164-NEXT:    v_mov_b32_e32 v5, 8
 ; GFX1164-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX1164-NEXT:    v_mov_b32_e32 v7, 0
-; GFX1164-NEXT:    s_mov_b64 s[4:5], s[40:41]
+; GFX1164-NEXT:    s_mov_b64 s[4:5], s[48:49]
 ; GFX1164-NEXT:    s_mov_b64 s[6:7], s[38:39]
 ; GFX1164-NEXT:    s_mov_b64 s[10:11], s[36:37]
-; GFX1164-NEXT:    s_mov_b32 s12, s43
-; GFX1164-NEXT:    s_mov_b32 s13, s42
+; GFX1164-NEXT:    s_mov_b32 s12, s51
+; GFX1164-NEXT:    s_mov_b32 s13, s50
 ; GFX1164-NEXT:    s_mov_b32 s14, s33
 ; GFX1164-NEXT:    scratch_store_b64 off, v[3:4], off offset:8
-; GFX1164-NEXT:    v_mov_b32_e32 v3, s45
+; GFX1164-NEXT:    v_mov_b32_e32 v3, s53
 ; GFX1164-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1164-NEXT:    s_swappc_b64 s[30:31], s[0:1]
@@ -7183,8 +7183,8 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
 ; GFX1164-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
 ; GFX1164-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX1164-NEXT:    s_or_b64 s[46:47], vcc, s[46:47]
-; GFX1164-NEXT:    s_and_not1_b64 exec, exec, s[46:47]
+; GFX1164-NEXT:    s_or_b64 s[54:55], vcc, s[54:55]
+; GFX1164-NEXT:    s_and_not1_b64 exec, exec, s[54:55]
 ; GFX1164-NEXT:    s_cbranch_execnz .LBB10_4
 ; GFX1164-NEXT:  .LBB10_5:
 ; GFX1164-NEXT:    s_set_inst_prefetch_distance 0x2
@@ -7193,7 +7193,7 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
 ; GFX1132-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_align4_unsafe:
 ; GFX1132:       ; %bb.0:
 ; GFX1132-NEXT:    s_mov_b64 s[34:35], s[4:5]
-; GFX1132-NEXT:    s_mov_b64 s[40:41], s[0:1]
+; GFX1132-NEXT:    s_mov_b64 s[48:49], s[0:1]
 ; GFX1132-NEXT:    s_add_u32 s8, s34, 44
 ; GFX1132-NEXT:    s_addc_u32 s9, s35, 0
 ; GFX1132-NEXT:    s_getpc_b64 s[0:1]
@@ -7202,9 +7202,9 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
 ; GFX1132-NEXT:    v_mov_b32_e32 v31, v0
 ; GFX1132-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
 ; GFX1132-NEXT:    s_mov_b64 s[36:37], s[6:7]
-; GFX1132-NEXT:    s_mov_b32 s42, s14
-; GFX1132-NEXT:    s_mov_b32 s43, s13
-; GFX1132-NEXT:    s_mov_b64 s[4:5], s[40:41]
+; GFX1132-NEXT:    s_mov_b32 s50, s14
+; GFX1132-NEXT:    s_mov_b32 s51, s13
+; GFX1132-NEXT:    s_mov_b64 s[4:5], s[48:49]
 ; GFX1132-NEXT:    s_mov_b64 s[6:7], s[2:3]
 ; GFX1132-NEXT:    s_mov_b64 s[10:11], s[36:37]
 ; GFX1132-NEXT:    s_mov_b32 s12, s13
@@ -7233,17 +7233,17 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
 ; GFX1132-NEXT:    s_cbranch_scc1 .LBB10_1
 ; GFX1132-NEXT:  ; %bb.2: ; %ComputeEnd
 ; GFX1132-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1132-NEXT:    s_mov_b32 s46, 0
+; GFX1132-NEXT:    s_mov_b32 s54, 0
 ; GFX1132-NEXT:    s_mov_b32 s0, exec_lo
 ; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1132-NEXT:    v_cmpx_eq_u32_e32 0, v0
 ; GFX1132-NEXT:    s_xor_b32 s0, exec_lo, s0
 ; GFX1132-NEXT:    s_cbranch_execz .LBB10_5
 ; GFX1132-NEXT:  ; %bb.3:
-; GFX1132-NEXT:    s_load_b64 s[44:45], s[34:35], 0x24
+; GFX1132-NEXT:    s_load_b64 s[52:53], s[34:35], 0x24
 ; GFX1132-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT:    global_load_b64 v[1:2], v0, s[44:45]
+; GFX1132-NEXT:    global_load_b64 v[1:2], v0, s[52:53]
 ; GFX1132-NEXT:    s_set_inst_prefetch_distance 0x1
 ; GFX1132-NEXT:    .p2align 6
 ; GFX1132-NEXT:  .LBB10_4: ; %atomicrmw.start
@@ -7258,25 +7258,25 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
 ; GFX1132-NEXT:    scratch_store_b64 off, v[1:2], off
 ; GFX1132-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
 ; GFX1132-NEXT:    v_dual_mov_b32 v31, v40 :: v_dual_mov_b32 v0, 8
-; GFX1132-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s44
+; GFX1132-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s52
 ; GFX1132-NEXT:    v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0
 ; GFX1132-NEXT:    v_mov_b32_e32 v7, 0
-; GFX1132-NEXT:    s_mov_b64 s[4:5], s[40:41]
+; GFX1132-NEXT:    s_mov_b64 s[4:5], s[48:49]
 ; GFX1132-NEXT:    s_mov_b64 s[6:7], s[38:39]
 ; GFX1132-NEXT:    s_mov_b64 s[10:11], s[36:37]
-; GFX1132-NEXT:    s_mov_b32 s12, s43
-; GFX1132-NEXT:    s_mov_b32 s13, s42
+; GFX1132-NEXT:    s_mov_b32 s12, s51
+; GFX1132-NEXT:    s_mov_b32 s13, s50
 ; GFX1132-NEXT:    s_mov_b32 s14, s33
 ; GFX1132-NEXT:    scratch_store_b64 off, v[3:4], off offset:8
-; GFX1132-NEXT:    v_dual_mov_b32 v3, s45 :: v_dual_mov_b32 v4, 0
+; GFX1132-NEXT:    v_dual_mov_b32 v3, s53 :: v_dual_mov_b32 v4, 0
 ; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX1132-NEXT:    scratch_load_b64 v[1:2], off, off
 ; GFX1132-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
 ; GFX1132-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX1132-NEXT:    s_or_b32 s46, vcc_lo, s46
-; GFX1132-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s46
+; GFX1132-NEXT:    s_or_b32 s54, vcc_lo, s54
+; GFX1132-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s54
 ; GFX1132-NEXT:    s_cbranch_execnz .LBB10_4
 ; GFX1132-NEXT:  .LBB10_5:
 ; GFX1132-NEXT:    s_set_inst_prefetch_distance 0x2
@@ -7285,22 +7285,22 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
 ; GFX7LESS-DPP-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_align4_unsafe:
 ; GFX7LESS-DPP:       ; %bb.0:
 ; GFX7LESS-DPP-NEXT:    s_movk_i32 s32, 0x800
-; GFX7LESS-DPP-NEXT:    s_mov_b32 s48, SCRATCH_RSRC_DWORD0
-; GFX7LESS-DPP-NEXT:    s_mov_b32 s49, SCRATCH_RSRC_DWORD1
-; GFX7LESS-DPP-NEXT:    s_mov_b32 s50, -1
-; GFX7LESS-DPP-NEXT:    s_mov_b32 s51, 0xe8f000
-; GFX7LESS-DPP-NEXT:    s_add_u32 s48, s48, s11
-; GFX7LESS-DPP-NEXT:    s_addc_u32 s49, s49, 0
+; GFX7LESS-DPP-NEXT:    s_mov_b32 s64, SCRATCH_RSRC_DWORD0
+; GFX7LESS-DPP-NEXT:    s_mov_b32 s65, SCRATCH_RSRC_DWORD1
+; GFX7LESS-DPP-NEXT:    s_mov_b32 s66, -1
+; GFX7LESS-DPP-NEXT:    s_mov_b32 s67, 0xe8f000
+; GFX7LESS-DPP-NEXT:    s_add_u32 s64, s64, s11
+; GFX7LESS-DPP-NEXT:    s_addc_u32 s65, s65, 0
 ; GFX7LESS-DPP-NEXT:    s_mov_b32 s33, s10
-; GFX7LESS-DPP-NEXT:    s_mov_b32 s42, s9
-; GFX7LESS-DPP-NEXT:    s_mov_b32 s43, s8
+; GFX7LESS-DPP-NEXT:    s_mov_b32 s50, s9
+; GFX7LESS-DPP-NEXT:    s_mov_b32 s51, s8
 ; GFX7LESS-DPP-NEXT:    s_mov_b64 s[34:35], s[6:7]
 ; GFX7LESS-DPP-NEXT:    s_mov_b64 s[36:37], s[4:5]
 ; GFX7LESS-DPP-NEXT:    s_mov_b64 s[38:39], s[2:3]
-; GFX7LESS-DPP-NEXT:    s_mov_b64 s[40:41], s[0:1]
-; GFX7LESS-DPP-NEXT:    s_load_dwordx2 s[44:45], s[4:5], 0x9
-; GFX7LESS-DPP-NEXT:    s_mov_b32 s47, 0xf000
-; GFX7LESS-DPP-NEXT:    s_mov_b32 s46, -1
+; GFX7LESS-DPP-NEXT:    s_mov_b64 s[48:49], s[0:1]
+; GFX7LESS-DPP-NEXT:    s_load_dwordx2 s[52:53], s[4:5], 0x9
+; GFX7LESS-DPP-NEXT:    s_mov_b32 s55, 0xf000
+; GFX7LESS-DPP-NEXT:    s_mov_b32 s54, -1
 ; GFX7LESS-DPP-NEXT:    s_add_u32 s8, s36, 44
 ; GFX7LESS-DPP-NEXT:    s_addc_u32 s9, s37, 0
 ; GFX7LESS-DPP-NEXT:    s_getpc_b64 s[0:1]
@@ -7311,30 +7311,30 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
 ; GFX7LESS-DPP-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
 ; GFX7LESS-DPP-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GFX7LESS-DPP-NEXT:    v_or_b32_e32 v42, v0, v2
-; GFX7LESS-DPP-NEXT:    s_mov_b64 s[4:5], s[40:41]
+; GFX7LESS-DPP-NEXT:    s_mov_b64 s[4:5], s[48:49]
 ; GFX7LESS-DPP-NEXT:    s_mov_b64 s[6:7], s[2:3]
 ; GFX7LESS-DPP-NEXT:    s_mov_b64 s[10:11], s[34:35]
-; GFX7LESS-DPP-NEXT:    s_mov_b32 s12, s43
-; GFX7LESS-DPP-NEXT:    s_mov_b32 s13, s42
+; GFX7LESS-DPP-NEXT:    s_mov_b32 s12, s51
+; GFX7LESS-DPP-NEXT:    s_mov_b32 s13, s50
 ; GFX7LESS-DPP-NEXT:    s_mov_b32 s14, s33
 ; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v31, v42
-; GFX7LESS-DPP-NEXT:    s_mov_b64 s[0:1], s[48:49]
-; GFX7LESS-DPP-NEXT:    s_mov_b64 s[2:3], s[50:51]
+; GFX7LESS-DPP-NEXT:    s_mov_b64 s[0:1], s[64:65]
+; GFX7LESS-DPP-NEXT:    s_mov_b64 s[2:3], s[66:67]
 ; GFX7LESS-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7LESS-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v40, v0
 ; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v41, v1
-; GFX7LESS-DPP-NEXT:    buffer_load_dwordx2 v[0:1], off, s[44:47], 0
-; GFX7LESS-DPP-NEXT:    s_mov_b64 s[46:47], 0
+; GFX7LESS-DPP-NEXT:    buffer_load_dwordx2 v[0:1], off, s[52:55], 0
+; GFX7LESS-DPP-NEXT:    s_mov_b64 s[54:55], 0
 ; GFX7LESS-DPP-NEXT:  .LBB10_1: ; %atomicrmw.start
 ; GFX7LESS-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX7LESS-DPP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7LESS-DPP-NEXT:    v_add_f64 v[2:3], v[0:1], v[40:41]
-; GFX7LESS-DPP-NEXT:    buffer_store_dword v1, off, s[48:51], 0 offset:4
-; GFX7LESS-DPP-NEXT:    buffer_store_dword v0, off, s[48:51], 0
+; GFX7LESS-DPP-NEXT:    buffer_store_dword v1, off, s[64:67], 0 offset:4
+; GFX7LESS-DPP-NEXT:    buffer_store_dword v0, off, s[64:67], 0
 ; GFX7LESS-DPP-NEXT:    s_add_u32 s8, s36, 44
-; GFX7LESS-DPP-NEXT:    buffer_store_dword v3, off, s[48:51], 0 offset:12
-; GFX7LESS-DPP-NEXT:    buffer_store_dword v2, off, s[48:51], 0 offset:8
+; GFX7LESS-DPP-NEXT:    buffer_store_dword v3, off, s[64:67], 0 offset:12
+; GFX7LESS-DPP-NEXT:    buffer_store_dword v2, off, s[64:67], 0 offset:8
 ; GFX7LESS-DPP-NEXT:    s_addc_u32 s9, s37, 0
 ; GFX7LESS-DPP-NEXT:    s_getpc_b64 s[0:1]
 ; GFX7LESS-DPP-NEXT:    s_add_u32 s0, s0, __atomic_compare_exchange at gotpcrel32@lo+4
@@ -7347,44 +7347,44 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
 ; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v5, 8
 ; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v7, 0
-; GFX7LESS-DPP-NEXT:    s_mov_b64 s[4:5], s[40:41]
+; GFX7LESS-DPP-NEXT:    s_mov_b64 s[4:5], s[48:49]
 ; GFX7LESS-DPP-NEXT:    s_mov_b64 s[6:7], s[38:39]
 ; GFX7LESS-DPP-NEXT:    s_mov_b64 s[10:11], s[34:35]
-; GFX7LESS-DPP-NEXT:    s_mov_b32 s12, s43
-; GFX7LESS-DPP-NEXT:    s_mov_b32 s13, s42
+; GFX7LESS-DPP-NEXT:    s_mov_b32 s12, s51
+; GFX7LESS-DPP-NEXT:    s_mov_b32 s13, s50
 ; GFX7LESS-DPP-NEXT:    s_mov_b32 s14, s33
 ; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v31, v42
-; GFX7LESS-DPP-NEXT:    s_mov_b64 s[0:1], s[48:49]
-; GFX7LESS-DPP-NEXT:    s_mov_b64 s[2:3], s[50:51]
+; GFX7LESS-DPP-NEXT:    s_mov_b64 s[0:1], s[64:65]
+; GFX7LESS-DPP-NEXT:    s_mov_b64 s[2:3], s[66:67]
 ; GFX7LESS-DPP-NEXT:    s_waitcnt expcnt(0)
-; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v2, s44
-; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v3, s45
+; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v2, s52
+; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v3, s53
 ; GFX7LESS-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7LESS-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GFX7LESS-DPP-NEXT:    v_and_b32_e32 v2, 1, v0
-; GFX7LESS-DPP-NEXT:    buffer_load_dword v0, off, s[48:51], 0
-; GFX7LESS-DPP-NEXT:    buffer_load_dword v1, off, s[48:51], 0 offset:4
+; GFX7LESS-DPP-NEXT:    buffer_load_dword v0, off, s[64:67], 0
+; GFX7LESS-DPP-NEXT:    buffer_load_dword v1, off, s[64:67], 0 offset:4
 ; GFX7LESS-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v2
-; GFX7LESS-DPP-NEXT:    s_or_b64 s[46:47], vcc, s[46:47]
-; GFX7LESS-DPP-NEXT:    s_andn2_b64 exec, exec, s[46:47]
+; GFX7LESS-DPP-NEXT:    s_or_b64 s[54:55], vcc, s[54:55]
+; GFX7LESS-DPP-NEXT:    s_andn2_b64 exec, exec, s[54:55]
 ; GFX7LESS-DPP-NEXT:    s_cbranch_execnz .LBB10_1
 ; GFX7LESS-DPP-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX7LESS-DPP-NEXT:    s_endpgm
 ;
 ; GFX9-DPP-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_align4_unsafe:
 ; GFX9-DPP:       ; %bb.0:
-; GFX9-DPP-NEXT:    s_mov_b32 s52, SCRATCH_RSRC_DWORD0
-; GFX9-DPP-NEXT:    s_mov_b32 s53, SCRATCH_RSRC_DWORD1
-; GFX9-DPP-NEXT:    s_mov_b32 s54, -1
-; GFX9-DPP-NEXT:    s_mov_b32 s55, 0xe00000
-; GFX9-DPP-NEXT:    s_add_u32 s52, s52, s11
-; GFX9-DPP-NEXT:    s_addc_u32 s53, s53, 0
+; GFX9-DPP-NEXT:    s_mov_b32 s68, SCRATCH_RSRC_DWORD0
+; GFX9-DPP-NEXT:    s_mov_b32 s69, SCRATCH_RSRC_DWORD1
+; GFX9-DPP-NEXT:    s_mov_b32 s70, -1
+; GFX9-DPP-NEXT:    s_mov_b32 s71, 0xe00000
+; GFX9-DPP-NEXT:    s_add_u32 s68, s68, s11
+; GFX9-DPP-NEXT:    s_addc_u32 s69, s69, 0
 ; GFX9-DPP-NEXT:    s_mov_b64 s[36:37], s[4:5]
-; GFX9-DPP-NEXT:    s_mov_b32 s43, s8
+; GFX9-DPP-NEXT:    s_mov_b32 s51, s8
 ; GFX9-DPP-NEXT:    s_add_u32 s8, s36, 44
-; GFX9-DPP-NEXT:    s_mov_b32 s42, s9
+; GFX9-DPP-NEXT:    s_mov_b32 s50, s9
 ; GFX9-DPP-NEXT:    s_addc_u32 s9, s37, 0
-; GFX9-DPP-NEXT:    s_mov_b64 s[40:41], s[0:1]
+; GFX9-DPP-NEXT:    s_mov_b64 s[48:49], s[0:1]
 ; GFX9-DPP-NEXT:    s_getpc_b64 s[0:1]
 ; GFX9-DPP-NEXT:    s_add_u32 s0, s0, div.float.value at gotpcrel32@lo+4
 ; GFX9-DPP-NEXT:    s_addc_u32 s1, s1, div.float.value at gotpcrel32@hi+12
@@ -7396,14 +7396,14 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
 ; GFX9-DPP-NEXT:    s_mov_b64 s[38:39], s[2:3]
 ; GFX9-DPP-NEXT:    v_or3_b32 v40, v0, v1, v2
 ; GFX9-DPP-NEXT:    s_mov_b64 s[6:7], s[2:3]
-; GFX9-DPP-NEXT:    s_mov_b64 s[0:1], s[52:53]
-; GFX9-DPP-NEXT:    s_mov_b64 s[4:5], s[40:41]
+; GFX9-DPP-NEXT:    s_mov_b64 s[0:1], s[68:69]
+; GFX9-DPP-NEXT:    s_mov_b64 s[4:5], s[48:49]
 ; GFX9-DPP-NEXT:    s_mov_b64 s[10:11], s[34:35]
-; GFX9-DPP-NEXT:    s_mov_b32 s12, s43
-; GFX9-DPP-NEXT:    s_mov_b32 s13, s42
+; GFX9-DPP-NEXT:    s_mov_b32 s12, s51
+; GFX9-DPP-NEXT:    s_mov_b32 s13, s50
 ; GFX9-DPP-NEXT:    s_mov_b32 s14, s33
 ; GFX9-DPP-NEXT:    v_mov_b32_e32 v31, v40
-; GFX9-DPP-NEXT:    s_mov_b64 s[2:3], s[54:55]
+; GFX9-DPP-NEXT:    s_mov_b64 s[2:3], s[70:71]
 ; GFX9-DPP-NEXT:    s_movk_i32 s32, 0x800
 ; GFX9-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
@@ -7450,74 +7450,74 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
 ; GFX9-DPP-NEXT:    v_mbcnt_lo_u32_b32 v1, exec_lo, 0
 ; GFX9-DPP-NEXT:    v_mbcnt_hi_u32_b32 v1, exec_hi, v1
 ; GFX9-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
-; GFX9-DPP-NEXT:    v_readlane_b32 s45, v9, 63
-; GFX9-DPP-NEXT:    v_readlane_b32 s44, v8, 63
+; GFX9-DPP-NEXT:    v_readlane_b32 s53, v9, 63
+; GFX9-DPP-NEXT:    v_readlane_b32 s52, v8, 63
 ; GFX9-DPP-NEXT:    s_mov_b64 exec, s[0:1]
 ; GFX9-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
 ; GFX9-DPP-NEXT:    s_and_saveexec_b64 s[0:1], vcc
 ; GFX9-DPP-NEXT:    s_cbranch_execz .LBB10_3
 ; GFX9-DPP-NEXT:  ; %bb.1:
-; GFX9-DPP-NEXT:    s_load_dwordx2 s[46:47], s[36:37], 0x24
-; GFX9-DPP-NEXT:    s_mov_b64 s[48:49], 0
+; GFX9-DPP-NEXT:    s_load_dwordx2 s[54:55], s[36:37], 0x24
+; GFX9-DPP-NEXT:    s_mov_b64 s[64:65], 0
 ; GFX9-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-DPP-NEXT:    global_load_dwordx2 v[1:2], v0, s[46:47]
+; GFX9-DPP-NEXT:    global_load_dwordx2 v[1:2], v0, s[54:55]
 ; GFX9-DPP-NEXT:  .LBB10_2: ; %atomicrmw.start
 ; GFX9-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX9-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-DPP-NEXT:    v_add_f64 v[3:4], v[1:2], s[44:45]
+; GFX9-DPP-NEXT:    v_add_f64 v[3:4], v[1:2], s[52:53]
 ; GFX9-DPP-NEXT:    s_add_u32 s8, s36, 44
 ; GFX9-DPP-NEXT:    s_addc_u32 s9, s37, 0
 ; GFX9-DPP-NEXT:    s_getpc_b64 s[0:1]
 ; GFX9-DPP-NEXT:    s_add_u32 s0, s0, __atomic_compare_exchange at gotpcrel32@lo+4
 ; GFX9-DPP-NEXT:    s_addc_u32 s1, s1, __atomic_compare_exchange at gotpcrel32@hi+12
 ; GFX9-DPP-NEXT:    s_load_dwordx2 s[16:17], s[0:1], 0x0
-; GFX9-DPP-NEXT:    s_mov_b64 s[0:1], s[52:53]
-; GFX9-DPP-NEXT:    buffer_store_dword v2, off, s[52:55], 0 offset:4
-; GFX9-DPP-NEXT:    buffer_store_dword v1, off, s[52:55], 0
-; GFX9-DPP-NEXT:    s_mov_b64 s[4:5], s[40:41]
-; GFX9-DPP-NEXT:    buffer_store_dword v4, off, s[52:55], 0 offset:12
-; GFX9-DPP-NEXT:    buffer_store_dword v3, off, s[52:55], 0 offset:8
+; GFX9-DPP-NEXT:    s_mov_b64 s[0:1], s[68:69]
+; GFX9-DPP-NEXT:    buffer_store_dword v2, off, s[68:71], 0 offset:4
+; GFX9-DPP-NEXT:    buffer_store_dword v1, off, s[68:71], 0
+; GFX9-DPP-NEXT:    s_mov_b64 s[4:5], s[48:49]
+; GFX9-DPP-NEXT:    buffer_store_dword v4, off, s[68:71], 0 offset:12
+; GFX9-DPP-NEXT:    buffer_store_dword v3, off, s[68:71], 0 offset:8
 ; GFX9-DPP-NEXT:    s_mov_b64 s[6:7], s[38:39]
 ; GFX9-DPP-NEXT:    s_mov_b64 s[10:11], s[34:35]
-; GFX9-DPP-NEXT:    s_mov_b32 s12, s43
-; GFX9-DPP-NEXT:    s_mov_b32 s13, s42
+; GFX9-DPP-NEXT:    s_mov_b32 s12, s51
+; GFX9-DPP-NEXT:    s_mov_b32 s13, s50
 ; GFX9-DPP-NEXT:    s_mov_b32 s14, s33
 ; GFX9-DPP-NEXT:    v_mov_b32_e32 v31, v40
-; GFX9-DPP-NEXT:    s_mov_b64 s[2:3], s[54:55]
+; GFX9-DPP-NEXT:    s_mov_b64 s[2:3], s[70:71]
 ; GFX9-DPP-NEXT:    v_mov_b32_e32 v0, 8
 ; GFX9-DPP-NEXT:    v_mov_b32_e32 v1, 0
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v2, s46
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v3, s47
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v2, s54
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v3, s55
 ; GFX9-DPP-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX9-DPP-NEXT:    v_mov_b32_e32 v5, 8
 ; GFX9-DPP-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX9-DPP-NEXT:    v_mov_b32_e32 v7, 0
 ; GFX9-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
-; GFX9-DPP-NEXT:    buffer_load_dword v1, off, s[52:55], 0
-; GFX9-DPP-NEXT:    buffer_load_dword v2, off, s[52:55], 0 offset:4
+; GFX9-DPP-NEXT:    buffer_load_dword v1, off, s[68:71], 0
+; GFX9-DPP-NEXT:    buffer_load_dword v2, off, s[68:71], 0 offset:4
 ; GFX9-DPP-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX9-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX9-DPP-NEXT:    s_or_b64 s[48:49], vcc, s[48:49]
-; GFX9-DPP-NEXT:    s_andn2_b64 exec, exec, s[48:49]
+; GFX9-DPP-NEXT:    s_or_b64 s[64:65], vcc, s[64:65]
+; GFX9-DPP-NEXT:    s_andn2_b64 exec, exec, s[64:65]
 ; GFX9-DPP-NEXT:    s_cbranch_execnz .LBB10_2
 ; GFX9-DPP-NEXT:  .LBB10_3:
 ; GFX9-DPP-NEXT:    s_endpgm
 ;
 ; GFX1064-DPP-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_align4_unsafe:
 ; GFX1064-DPP:       ; %bb.0:
-; GFX1064-DPP-NEXT:    s_mov_b32 s48, SCRATCH_RSRC_DWORD0
-; GFX1064-DPP-NEXT:    s_mov_b32 s49, SCRATCH_RSRC_DWORD1
-; GFX1064-DPP-NEXT:    s_mov_b32 s50, -1
-; GFX1064-DPP-NEXT:    s_mov_b32 s51, 0x31e16000
-; GFX1064-DPP-NEXT:    s_add_u32 s48, s48, s11
+; GFX1064-DPP-NEXT:    s_mov_b32 s64, SCRATCH_RSRC_DWORD0
+; GFX1064-DPP-NEXT:    s_mov_b32 s65, SCRATCH_RSRC_DWORD1
+; GFX1064-DPP-NEXT:    s_mov_b32 s66, -1
+; GFX1064-DPP-NEXT:    s_mov_b32 s67, 0x31e16000
+; GFX1064-DPP-NEXT:    s_add_u32 s64, s64, s11
 ; GFX1064-DPP-NEXT:    s_mov_b64 s[34:35], s[4:5]
-; GFX1064-DPP-NEXT:    s_addc_u32 s49, s49, 0
-; GFX1064-DPP-NEXT:    s_mov_b32 s43, s8
+; GFX1064-DPP-NEXT:    s_addc_u32 s65, s65, 0
+; GFX1064-DPP-NEXT:    s_mov_b32 s51, s8
 ; GFX1064-DPP-NEXT:    s_add_u32 s8, s34, 44
-; GFX1064-DPP-NEXT:    s_mov_b32 s42, s9
+; GFX1064-DPP-NEXT:    s_mov_b32 s50, s9
 ; GFX1064-DPP-NEXT:    s_addc_u32 s9, s35, 0
-; GFX1064-DPP-NEXT:    s_mov_b64 s[40:41], s[0:1]
+; GFX1064-DPP-NEXT:    s_mov_b64 s[48:49], s[0:1]
 ; GFX1064-DPP-NEXT:    s_getpc_b64 s[0:1]
 ; GFX1064-DPP-NEXT:    s_add_u32 s0, s0, div.float.value at gotpcrel32@lo+4
 ; GFX1064-DPP-NEXT:    s_addc_u32 s1, s1, div.float.value at gotpcrel32@hi+12
@@ -7529,14 +7529,14 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
 ; GFX1064-DPP-NEXT:    s_mov_b64 s[38:39], s[2:3]
 ; GFX1064-DPP-NEXT:    s_mov_b64 s[6:7], s[2:3]
 ; GFX1064-DPP-NEXT:    v_or3_b32 v40, v0, v1, v2
-; GFX1064-DPP-NEXT:    s_mov_b64 s[0:1], s[48:49]
-; GFX1064-DPP-NEXT:    s_mov_b64 s[4:5], s[40:41]
+; GFX1064-DPP-NEXT:    s_mov_b64 s[0:1], s[64:65]
+; GFX1064-DPP-NEXT:    s_mov_b64 s[4:5], s[48:49]
 ; GFX1064-DPP-NEXT:    s_mov_b64 s[10:11], s[36:37]
-; GFX1064-DPP-NEXT:    s_mov_b32 s12, s43
+; GFX1064-DPP-NEXT:    s_mov_b32 s12, s51
 ; GFX1064-DPP-NEXT:    v_mov_b32_e32 v31, v40
-; GFX1064-DPP-NEXT:    s_mov_b32 s13, s42
+; GFX1064-DPP-NEXT:    s_mov_b32 s13, s50
 ; GFX1064-DPP-NEXT:    s_mov_b32 s14, s33
-; GFX1064-DPP-NEXT:    s_mov_b64 s[2:3], s[50:51]
+; GFX1064-DPP-NEXT:    s_mov_b64 s[2:3], s[66:67]
 ; GFX1064-DPP-NEXT:    s_movk_i32 s32, 0x800
 ; GFX1064-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1064-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
@@ -7581,10 +7581,10 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
 ; GFX1064-DPP-NEXT:    s_and_saveexec_b64 s[0:1], vcc
 ; GFX1064-DPP-NEXT:    s_cbranch_execz .LBB10_3
 ; GFX1064-DPP-NEXT:  ; %bb.1:
-; GFX1064-DPP-NEXT:    s_load_dwordx2 s[44:45], s[34:35], 0x24
-; GFX1064-DPP-NEXT:    s_mov_b64 s[46:47], 0
+; GFX1064-DPP-NEXT:    s_load_dwordx2 s[52:53], s[34:35], 0x24
+; GFX1064-DPP-NEXT:    s_mov_b64 s[54:55], 0
 ; GFX1064-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1064-DPP-NEXT:    global_load_dwordx2 v[1:2], v0, s[44:45]
+; GFX1064-DPP-NEXT:    global_load_dwordx2 v[1:2], v0, s[52:53]
 ; GFX1064-DPP-NEXT:  .LBB10_2: ; %atomicrmw.start
 ; GFX1064-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1064-DPP-NEXT:    s_waitcnt vmcnt(0)
@@ -7594,55 +7594,55 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
 ; GFX1064-DPP-NEXT:    s_getpc_b64 s[0:1]
 ; GFX1064-DPP-NEXT:    s_add_u32 s0, s0, __atomic_compare_exchange at gotpcrel32@lo+4
 ; GFX1064-DPP-NEXT:    s_addc_u32 s1, s1, __atomic_compare_exchange at gotpcrel32@hi+12
-; GFX1064-DPP-NEXT:    buffer_store_dword v2, off, s[48:51], 0 offset:4
-; GFX1064-DPP-NEXT:    buffer_store_dword v1, off, s[48:51], 0
+; GFX1064-DPP-NEXT:    buffer_store_dword v2, off, s[64:67], 0 offset:4
+; GFX1064-DPP-NEXT:    buffer_store_dword v1, off, s[64:67], 0
 ; GFX1064-DPP-NEXT:    s_load_dwordx2 s[16:17], s[0:1], 0x0
 ; GFX1064-DPP-NEXT:    v_mov_b32_e32 v31, v40
 ; GFX1064-DPP-NEXT:    v_mov_b32_e32 v0, 8
 ; GFX1064-DPP-NEXT:    v_mov_b32_e32 v1, 0
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v2, s44
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v2, s52
 ; GFX1064-DPP-NEXT:    v_mov_b32_e32 v5, 8
 ; GFX1064-DPP-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX1064-DPP-NEXT:    v_mov_b32_e32 v7, 0
-; GFX1064-DPP-NEXT:    s_mov_b64 s[0:1], s[48:49]
-; GFX1064-DPP-NEXT:    s_mov_b64 s[4:5], s[40:41]
+; GFX1064-DPP-NEXT:    s_mov_b64 s[0:1], s[64:65]
+; GFX1064-DPP-NEXT:    s_mov_b64 s[4:5], s[48:49]
 ; GFX1064-DPP-NEXT:    s_mov_b64 s[6:7], s[38:39]
 ; GFX1064-DPP-NEXT:    s_mov_b64 s[10:11], s[36:37]
-; GFX1064-DPP-NEXT:    s_mov_b32 s12, s43
-; GFX1064-DPP-NEXT:    s_mov_b32 s13, s42
+; GFX1064-DPP-NEXT:    s_mov_b32 s12, s51
+; GFX1064-DPP-NEXT:    s_mov_b32 s13, s50
 ; GFX1064-DPP-NEXT:    s_mov_b32 s14, s33
-; GFX1064-DPP-NEXT:    s_mov_b64 s[2:3], s[50:51]
-; GFX1064-DPP-NEXT:    buffer_store_dword v4, off, s[48:51], 0 offset:12
-; GFX1064-DPP-NEXT:    buffer_store_dword v3, off, s[48:51], 0 offset:8
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v3, s45
+; GFX1064-DPP-NEXT:    s_mov_b64 s[2:3], s[66:67]
+; GFX1064-DPP-NEXT:    buffer_store_dword v4, off, s[64:67], 0 offset:12
+; GFX1064-DPP-NEXT:    buffer_store_dword v3, off, s[64:67], 0 offset:8
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v3, s53
 ; GFX1064-DPP-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX1064-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1064-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GFX1064-DPP-NEXT:    s_clause 0x1
-; GFX1064-DPP-NEXT:    buffer_load_dword v1, off, s[48:51], 0
-; GFX1064-DPP-NEXT:    buffer_load_dword v2, off, s[48:51], 0 offset:4
+; GFX1064-DPP-NEXT:    buffer_load_dword v1, off, s[64:67], 0
+; GFX1064-DPP-NEXT:    buffer_load_dword v2, off, s[64:67], 0 offset:4
 ; GFX1064-DPP-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX1064-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX1064-DPP-NEXT:    s_or_b64 s[46:47], vcc, s[46:47]
-; GFX1064-DPP-NEXT:    s_andn2_b64 exec, exec, s[46:47]
+; GFX1064-DPP-NEXT:    s_or_b64 s[54:55], vcc, s[54:55]
+; GFX1064-DPP-NEXT:    s_andn2_b64 exec, exec, s[54:55]
 ; GFX1064-DPP-NEXT:    s_cbranch_execnz .LBB10_2
 ; GFX1064-DPP-NEXT:  .LBB10_3:
 ; GFX1064-DPP-NEXT:    s_endpgm
 ;
 ; GFX1032-DPP-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_align4_unsafe:
 ; GFX1032-DPP:       ; %bb.0:
-; GFX1032-DPP-NEXT:    s_mov_b32 s48, SCRATCH_RSRC_DWORD0
-; GFX1032-DPP-NEXT:    s_mov_b32 s49, SCRATCH_RSRC_DWORD1
-; GFX1032-DPP-NEXT:    s_mov_b32 s50, -1
-; GFX1032-DPP-NEXT:    s_mov_b32 s51, 0x31c16000
-; GFX1032-DPP-NEXT:    s_add_u32 s48, s48, s11
+; GFX1032-DPP-NEXT:    s_mov_b32 s64, SCRATCH_RSRC_DWORD0
+; GFX1032-DPP-NEXT:    s_mov_b32 s65, SCRATCH_RSRC_DWORD1
+; GFX1032-DPP-NEXT:    s_mov_b32 s66, -1
+; GFX1032-DPP-NEXT:    s_mov_b32 s67, 0x31c16000
+; GFX1032-DPP-NEXT:    s_add_u32 s64, s64, s11
 ; GFX1032-DPP-NEXT:    s_mov_b64 s[34:35], s[4:5]
-; GFX1032-DPP-NEXT:    s_addc_u32 s49, s49, 0
-; GFX1032-DPP-NEXT:    s_mov_b32 s43, s8
+; GFX1032-DPP-NEXT:    s_addc_u32 s65, s65, 0
+; GFX1032-DPP-NEXT:    s_mov_b32 s51, s8
 ; GFX1032-DPP-NEXT:    s_add_u32 s8, s34, 44
-; GFX1032-DPP-NEXT:    s_mov_b32 s42, s9
+; GFX1032-DPP-NEXT:    s_mov_b32 s50, s9
 ; GFX1032-DPP-NEXT:    s_addc_u32 s9, s35, 0
-; GFX1032-DPP-NEXT:    s_mov_b64 s[40:41], s[0:1]
+; GFX1032-DPP-NEXT:    s_mov_b64 s[48:49], s[0:1]
 ; GFX1032-DPP-NEXT:    s_getpc_b64 s[0:1]
 ; GFX1032-DPP-NEXT:    s_add_u32 s0, s0, div.float.value at gotpcrel32@lo+4
 ; GFX1032-DPP-NEXT:    s_addc_u32 s1, s1, div.float.value at gotpcrel32@hi+12
@@ -7654,14 +7654,14 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
 ; GFX1032-DPP-NEXT:    s_mov_b64 s[38:39], s[2:3]
 ; GFX1032-DPP-NEXT:    s_mov_b64 s[6:7], s[2:3]
 ; GFX1032-DPP-NEXT:    v_or3_b32 v40, v0, v1, v2
-; GFX1032-DPP-NEXT:    s_mov_b64 s[0:1], s[48:49]
-; GFX1032-DPP-NEXT:    s_mov_b64 s[4:5], s[40:41]
+; GFX1032-DPP-NEXT:    s_mov_b64 s[0:1], s[64:65]
+; GFX1032-DPP-NEXT:    s_mov_b64 s[4:5], s[48:49]
 ; GFX1032-DPP-NEXT:    s_mov_b64 s[10:11], s[36:37]
-; GFX1032-DPP-NEXT:    s_mov_b32 s12, s43
+; GFX1032-DPP-NEXT:    s_mov_b32 s12, s51
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v31, v40
-; GFX1032-DPP-NEXT:    s_mov_b32 s13, s42
+; GFX1032-DPP-NEXT:    s_mov_b32 s13, s50
 ; GFX1032-DPP-NEXT:    s_mov_b32 s14, s33
-; GFX1032-DPP-NEXT:    s_mov_b64 s[2:3], s[50:51]
+; GFX1032-DPP-NEXT:    s_mov_b64 s[2:3], s[66:67]
 ; GFX1032-DPP-NEXT:    s_movk_i32 s32, 0x400
 ; GFX1032-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1032-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
@@ -7696,14 +7696,14 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v41, v8
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v42, v9
-; GFX1032-DPP-NEXT:    s_mov_b32 s46, 0
+; GFX1032-DPP-NEXT:    s_mov_b32 s54, 0
 ; GFX1032-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
 ; GFX1032-DPP-NEXT:    s_and_saveexec_b32 s0, vcc_lo
 ; GFX1032-DPP-NEXT:    s_cbranch_execz .LBB10_3
 ; GFX1032-DPP-NEXT:  ; %bb.1:
-; GFX1032-DPP-NEXT:    s_load_dwordx2 s[44:45], s[34:35], 0x24
+; GFX1032-DPP-NEXT:    s_load_dwordx2 s[52:53], s[34:35], 0x24
 ; GFX1032-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT:    global_load_dwordx2 v[1:2], v0, s[44:45]
+; GFX1032-DPP-NEXT:    global_load_dwordx2 v[1:2], v0, s[52:53]
 ; GFX1032-DPP-NEXT:  .LBB10_2: ; %atomicrmw.start
 ; GFX1032-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1032-DPP-NEXT:    s_waitcnt vmcnt(0)
@@ -7713,37 +7713,37 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
 ; GFX1032-DPP-NEXT:    s_getpc_b64 s[0:1]
 ; GFX1032-DPP-NEXT:    s_add_u32 s0, s0, __atomic_compare_exchange at gotpcrel32@lo+4
 ; GFX1032-DPP-NEXT:    s_addc_u32 s1, s1, __atomic_compare_exchange at gotpcrel32@hi+12
-; GFX1032-DPP-NEXT:    buffer_store_dword v2, off, s[48:51], 0 offset:4
-; GFX1032-DPP-NEXT:    buffer_store_dword v1, off, s[48:51], 0
+; GFX1032-DPP-NEXT:    buffer_store_dword v2, off, s[64:67], 0 offset:4
+; GFX1032-DPP-NEXT:    buffer_store_dword v1, off, s[64:67], 0
 ; GFX1032-DPP-NEXT:    s_load_dwordx2 s[16:17], s[0:1], 0x0
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v31, v40
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v0, 8
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v1, 0
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v2, s44
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v2, s52
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v5, 8
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v7, 0
-; GFX1032-DPP-NEXT:    s_mov_b64 s[0:1], s[48:49]
-; GFX1032-DPP-NEXT:    s_mov_b64 s[4:5], s[40:41]
+; GFX1032-DPP-NEXT:    s_mov_b64 s[0:1], s[64:65]
+; GFX1032-DPP-NEXT:    s_mov_b64 s[4:5], s[48:49]
 ; GFX1032-DPP-NEXT:    s_mov_b64 s[6:7], s[38:39]
 ; GFX1032-DPP-NEXT:    s_mov_b64 s[10:11], s[36:37]
-; GFX1032-DPP-NEXT:    s_mov_b32 s12, s43
-; GFX1032-DPP-NEXT:    s_mov_b32 s13, s42
+; GFX1032-DPP-NEXT:    s_mov_b32 s12, s51
+; GFX1032-DPP-NEXT:    s_mov_b32 s13, s50
 ; GFX1032-DPP-NEXT:    s_mov_b32 s14, s33
-; GFX1032-DPP-NEXT:    s_mov_b64 s[2:3], s[50:51]
-; GFX1032-DPP-NEXT:    buffer_store_dword v4, off, s[48:51], 0 offset:12
-; GFX1032-DPP-NEXT:    buffer_store_dword v3, off, s[48:51], 0 offset:8
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v3, s45
+; GFX1032-DPP-NEXT:    s_mov_b64 s[2:3], s[66:67]
+; GFX1032-DPP-NEXT:    buffer_store_dword v4, off, s[64:67], 0 offset:12
+; GFX1032-DPP-NEXT:    buffer_store_dword v3, off, s[64:67], 0 offset:8
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v3, s53
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX1032-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1032-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GFX1032-DPP-NEXT:    s_clause 0x1
-; GFX1032-DPP-NEXT:    buffer_load_dword v1, off, s[48:51], 0
-; GFX1032-DPP-NEXT:    buffer_load_dword v2, off, s[48:51], 0 offset:4
+; GFX1032-DPP-NEXT:    buffer_load_dword v1, off, s[64:67], 0
+; GFX1032-DPP-NEXT:    buffer_load_dword v2, off, s[64:67], 0 offset:4
 ; GFX1032-DPP-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX1032-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX1032-DPP-NEXT:    s_or_b32 s46, vcc_lo, s46
-; GFX1032-DPP-NEXT:    s_andn2_b32 exec_lo, exec_lo, s46
+; GFX1032-DPP-NEXT:    s_or_b32 s54, vcc_lo, s54
+; GFX1032-DPP-NEXT:    s_andn2_b32 exec_lo, exec_lo, s54
 ; GFX1032-DPP-NEXT:    s_cbranch_execnz .LBB10_2
 ; GFX1032-DPP-NEXT:  .LBB10_3:
 ; GFX1032-DPP-NEXT:    s_endpgm
@@ -7751,11 +7751,11 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
 ; GFX1164-DPP-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_align4_unsafe:
 ; GFX1164-DPP:       ; %bb.0:
 ; GFX1164-DPP-NEXT:    s_mov_b64 s[34:35], s[4:5]
-; GFX1164-DPP-NEXT:    s_mov_b32 s43, s8
+; GFX1164-DPP-NEXT:    s_mov_b32 s51, s8
 ; GFX1164-DPP-NEXT:    s_add_u32 s8, s34, 44
-; GFX1164-DPP-NEXT:    s_mov_b32 s42, s9
+; GFX1164-DPP-NEXT:    s_mov_b32 s50, s9
 ; GFX1164-DPP-NEXT:    s_addc_u32 s9, s35, 0
-; GFX1164-DPP-NEXT:    s_mov_b64 s[40:41], s[0:1]
+; GFX1164-DPP-NEXT:    s_mov_b64 s[48:49], s[0:1]
 ; GFX1164-DPP-NEXT:    s_getpc_b64 s[0:1]
 ; GFX1164-DPP-NEXT:    s_add_u32 s0, s0, div.float.value at gotpcrel32@lo+4
 ; GFX1164-DPP-NEXT:    s_addc_u32 s1, s1, div.float.value at gotpcrel32@hi+12
@@ -7763,11 +7763,11 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
 ; GFX1164-DPP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
 ; GFX1164-DPP-NEXT:    s_mov_b32 s33, s10
 ; GFX1164-DPP-NEXT:    s_mov_b64 s[36:37], s[6:7]
-; GFX1164-DPP-NEXT:    s_mov_b64 s[4:5], s[40:41]
+; GFX1164-DPP-NEXT:    s_mov_b64 s[4:5], s[48:49]
 ; GFX1164-DPP-NEXT:    s_mov_b64 s[6:7], s[2:3]
 ; GFX1164-DPP-NEXT:    s_mov_b64 s[10:11], s[36:37]
-; GFX1164-DPP-NEXT:    s_mov_b32 s12, s43
-; GFX1164-DPP-NEXT:    s_mov_b32 s13, s42
+; GFX1164-DPP-NEXT:    s_mov_b32 s12, s51
+; GFX1164-DPP-NEXT:    s_mov_b32 s13, s50
 ; GFX1164-DPP-NEXT:    s_mov_b32 s14, s33
 ; GFX1164-DPP-NEXT:    s_mov_b32 s32, 32
 ; GFX1164-DPP-NEXT:    v_mov_b32_e32 v40, v0
@@ -7825,10 +7825,10 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
 ; GFX1164-DPP-NEXT:    v_cmpx_eq_u32_e32 0, v1
 ; GFX1164-DPP-NEXT:    s_cbranch_execz .LBB10_3
 ; GFX1164-DPP-NEXT:  ; %bb.1:
-; GFX1164-DPP-NEXT:    s_load_b64 s[44:45], s[34:35], 0x24
-; GFX1164-DPP-NEXT:    s_mov_b64 s[46:47], 0
+; GFX1164-DPP-NEXT:    s_load_b64 s[52:53], s[34:35], 0x24
+; GFX1164-DPP-NEXT:    s_mov_b64 s[54:55], 0
 ; GFX1164-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT:    global_load_b64 v[1:2], v0, s[44:45]
+; GFX1164-DPP-NEXT:    global_load_b64 v[1:2], v0, s[52:53]
 ; GFX1164-DPP-NEXT:    s_set_inst_prefetch_distance 0x1
 ; GFX1164-DPP-NEXT:    .p2align 6
 ; GFX1164-DPP-NEXT:  .LBB10_2: ; %atomicrmw.start
@@ -7845,18 +7845,18 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
 ; GFX1164-DPP-NEXT:    v_mov_b32_e32 v31, v40
 ; GFX1164-DPP-NEXT:    v_mov_b32_e32 v0, 8
 ; GFX1164-DPP-NEXT:    v_mov_b32_e32 v1, 0
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v2, s44
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v2, s52
 ; GFX1164-DPP-NEXT:    v_mov_b32_e32 v5, 8
 ; GFX1164-DPP-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX1164-DPP-NEXT:    v_mov_b32_e32 v7, 0
-; GFX1164-DPP-NEXT:    s_mov_b64 s[4:5], s[40:41]
+; GFX1164-DPP-NEXT:    s_mov_b64 s[4:5], s[48:49]
 ; GFX1164-DPP-NEXT:    s_mov_b64 s[6:7], s[38:39]
 ; GFX1164-DPP-NEXT:    s_mov_b64 s[10:11], s[36:37]
-; GFX1164-DPP-NEXT:    s_mov_b32 s12, s43
-; GFX1164-DPP-NEXT:    s_mov_b32 s13, s42
+; GFX1164-DPP-NEXT:    s_mov_b32 s12, s51
+; GFX1164-DPP-NEXT:    s_mov_b32 s13, s50
 ; GFX1164-DPP-NEXT:    s_mov_b32 s14, s33
 ; GFX1164-DPP-NEXT:    scratch_store_b64 off, v[3:4], off offset:8
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v3, s45
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v3, s53
 ; GFX1164-DPP-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX1164-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1164-DPP-NEXT:    s_swappc_b64 s[30:31], s[0:1]
@@ -7864,8 +7864,8 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
 ; GFX1164-DPP-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
 ; GFX1164-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX1164-DPP-NEXT:    s_or_b64 s[46:47], vcc, s[46:47]
-; GFX1164-DPP-NEXT:    s_and_not1_b64 exec, exec, s[46:47]
+; GFX1164-DPP-NEXT:    s_or_b64 s[54:55], vcc, s[54:55]
+; GFX1164-DPP-NEXT:    s_and_not1_b64 exec, exec, s[54:55]
 ; GFX1164-DPP-NEXT:    s_cbranch_execnz .LBB10_2
 ; GFX1164-DPP-NEXT:  .LBB10_3:
 ; GFX1164-DPP-NEXT:    s_set_inst_prefetch_distance 0x2
@@ -7874,7 +7874,7 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
 ; GFX1132-DPP-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_align4_unsafe:
 ; GFX1132-DPP:       ; %bb.0:
 ; GFX1132-DPP-NEXT:    s_mov_b64 s[34:35], s[4:5]
-; GFX1132-DPP-NEXT:    s_mov_b64 s[40:41], s[0:1]
+; GFX1132-DPP-NEXT:    s_mov_b64 s[48:49], s[0:1]
 ; GFX1132-DPP-NEXT:    s_add_u32 s8, s34, 44
 ; GFX1132-DPP-NEXT:    s_addc_u32 s9, s35, 0
 ; GFX1132-DPP-NEXT:    s_getpc_b64 s[0:1]
@@ -7883,9 +7883,9 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
 ; GFX1132-DPP-NEXT:    v_mov_b32_e32 v31, v0
 ; GFX1132-DPP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
 ; GFX1132-DPP-NEXT:    s_mov_b64 s[36:37], s[6:7]
-; GFX1132-DPP-NEXT:    s_mov_b32 s42, s14
-; GFX1132-DPP-NEXT:    s_mov_b32 s43, s13
-; GFX1132-DPP-NEXT:    s_mov_b64 s[4:5], s[40:41]
+; GFX1132-DPP-NEXT:    s_mov_b32 s50, s14
+; GFX1132-DPP-NEXT:    s_mov_b32 s51, s13
+; GFX1132-DPP-NEXT:    s_mov_b64 s[4:5], s[48:49]
 ; GFX1132-DPP-NEXT:    s_mov_b64 s[6:7], s[2:3]
 ; GFX1132-DPP-NEXT:    s_mov_b64 s[10:11], s[36:37]
 ; GFX1132-DPP-NEXT:    s_mov_b32 s12, s13
@@ -7936,14 +7936,14 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
 ; GFX1132-DPP-NEXT:    v_mbcnt_lo_u32_b32 v1, exec_lo, 0
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2)
 ; GFX1132-DPP-NEXT:    v_mov_b32_e32 v42, v9
-; GFX1132-DPP-NEXT:    s_mov_b32 s46, 0
+; GFX1132-DPP-NEXT:    s_mov_b32 s54, 0
 ; GFX1132-DPP-NEXT:    s_mov_b32 s0, exec_lo
 ; GFX1132-DPP-NEXT:    v_cmpx_eq_u32_e32 0, v1
 ; GFX1132-DPP-NEXT:    s_cbranch_execz .LBB10_3
 ; GFX1132-DPP-NEXT:  ; %bb.1:
-; GFX1132-DPP-NEXT:    s_load_b64 s[44:45], s[34:35], 0x24
+; GFX1132-DPP-NEXT:    s_load_b64 s[52:53], s[34:35], 0x24
 ; GFX1132-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT:    global_load_b64 v[1:2], v0, s[44:45]
+; GFX1132-DPP-NEXT:    global_load_b64 v[1:2], v0, s[52:53]
 ; GFX1132-DPP-NEXT:    s_set_inst_prefetch_distance 0x1
 ; GFX1132-DPP-NEXT:    .p2align 6
 ; GFX1132-DPP-NEXT:  .LBB10_2: ; %atomicrmw.start
@@ -7958,25 +7958,25 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
 ; GFX1132-DPP-NEXT:    scratch_store_b64 off, v[1:2], off
 ; GFX1132-DPP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
 ; GFX1132-DPP-NEXT:    v_dual_mov_b32 v31, v40 :: v_dual_mov_b32 v0, 8
-; GFX1132-DPP-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s44
+; GFX1132-DPP-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s52
 ; GFX1132-DPP-NEXT:    v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0
 ; GFX1132-DPP-NEXT:    v_mov_b32_e32 v7, 0
-; GFX1132-DPP-NEXT:    s_mov_b64 s[4:5], s[40:41]
+; GFX1132-DPP-NEXT:    s_mov_b64 s[4:5], s[48:49]
 ; GFX1132-DPP-NEXT:    s_mov_b64 s[6:7], s[38:39]
 ; GFX1132-DPP-NEXT:    s_mov_b64 s[10:11], s[36:37]
-; GFX1132-DPP-NEXT:    s_mov_b32 s12, s43
-; GFX1132-DPP-NEXT:    s_mov_b32 s13, s42
+; GFX1132-DPP-NEXT:    s_mov_b32 s12, s51
+; GFX1132-DPP-NEXT:    s_mov_b32 s13, s50
 ; GFX1132-DPP-NEXT:    s_mov_b32 s14, s33
 ; GFX1132-DPP-NEXT:    scratch_store_b64 off, v[3:4], off offset:8
-; GFX1132-DPP-NEXT:    v_dual_mov_b32 v3, s45 :: v_dual_mov_b32 v4, 0
+; GFX1132-DPP-NEXT:    v_dual_mov_b32 v3, s53 :: v_dual_mov_b32 v4, 0
 ; GFX1132-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132-DPP-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX1132-DPP-NEXT:    scratch_load_b64 v[1:2], off, off
 ; GFX1132-DPP-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
 ; GFX1132-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX1132-DPP-NEXT:    s_or_b32 s46, vcc_lo, s46
-; GFX1132-DPP-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s46
+; GFX1132-DPP-NEXT:    s_or_b32 s54, vcc_lo, s54
+; GFX1132-DPP-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s54
 ; GFX1132-DPP-NEXT:    s_cbranch_execnz .LBB10_2
 ; GFX1132-DPP-NEXT:  .LBB10_3:
 ; GFX1132-DPP-NEXT:    s_set_inst_prefetch_distance 0x2
@@ -8922,12 +8922,12 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_one_a
 ; GFX7LESS-DPP-LABEL: global_atomic_fadd_double_uni_address_div_value_one_as_scope_unsafe_structfp:
 ; GFX7LESS-DPP:       ; %bb.0:
 ; GFX7LESS-DPP-NEXT:    s_mov_b32 s32, 0
-; GFX7LESS-DPP-NEXT:    s_mov_b32 s40, SCRATCH_RSRC_DWORD0
-; GFX7LESS-DPP-NEXT:    s_mov_b32 s41, SCRATCH_RSRC_DWORD1
-; GFX7LESS-DPP-NEXT:    s_mov_b32 s42, -1
-; GFX7LESS-DPP-NEXT:    s_mov_b32 s43, 0xe8f000
-; GFX7LESS-DPP-NEXT:    s_add_u32 s40, s40, s11
-; GFX7LESS-DPP-NEXT:    s_addc_u32 s41, s41, 0
+; GFX7LESS-DPP-NEXT:    s_mov_b32 s48, SCRATCH_RSRC_DWORD0
+; GFX7LESS-DPP-NEXT:    s_mov_b32 s49, SCRATCH_RSRC_DWORD1
+; GFX7LESS-DPP-NEXT:    s_mov_b32 s50, -1
+; GFX7LESS-DPP-NEXT:    s_mov_b32 s51, 0xe8f000
+; GFX7LESS-DPP-NEXT:    s_add_u32 s48, s48, s11
+; GFX7LESS-DPP-NEXT:    s_addc_u32 s49, s49, 0
 ; GFX7LESS-DPP-NEXT:    s_mov_b32 s14, s10
 ; GFX7LESS-DPP-NEXT:    s_mov_b32 s13, s9
 ; GFX7LESS-DPP-NEXT:    s_mov_b32 s12, s8
@@ -8947,8 +8947,8 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_one_a
 ; GFX7LESS-DPP-NEXT:    v_or_b32_e32 v31, v0, v2
 ; GFX7LESS-DPP-NEXT:    s_mov_b64 s[4:5], s[0:1]
 ; GFX7LESS-DPP-NEXT:    s_mov_b64 s[6:7], s[2:3]
-; GFX7LESS-DPP-NEXT:    s_mov_b64 s[0:1], s[40:41]
-; GFX7LESS-DPP-NEXT:    s_mov_b64 s[2:3], s[42:43]
+; GFX7LESS-DPP-NEXT:    s_mov_b64 s[0:1], s[48:49]
+; GFX7LESS-DPP-NEXT:    s_mov_b64 s[2:3], s[50:51]
 ; GFX7LESS-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7LESS-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GFX7LESS-DPP-NEXT:    buffer_load_dwordx2 v[4:5], off, s[36:39], 0
@@ -10355,12 +10355,12 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
 ; GFX7LESS-DPP-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_unsafe:
 ; GFX7LESS-DPP:       ; %bb.0:
 ; GFX7LESS-DPP-NEXT:    s_mov_b32 s32, 0
-; GFX7LESS-DPP-NEXT:    s_mov_b32 s40, SCRATCH_RSRC_DWORD0
-; GFX7LESS-DPP-NEXT:    s_mov_b32 s41, SCRATCH_RSRC_DWORD1
-; GFX7LESS-DPP-NEXT:    s_mov_b32 s42, -1
-; GFX7LESS-DPP-NEXT:    s_mov_b32 s43, 0xe8f000
-; GFX7LESS-DPP-NEXT:    s_add_u32 s40, s40, s11
-; GFX7LESS-DPP-NEXT:    s_addc_u32 s41, s41, 0
+; GFX7LESS-DPP-NEXT:    s_mov_b32 s48, SCRATCH_RSRC_DWORD0
+; GFX7LESS-DPP-NEXT:    s_mov_b32 s49, SCRATCH_RSRC_DWORD1
+; GFX7LESS-DPP-NEXT:    s_mov_b32 s50, -1
+; GFX7LESS-DPP-NEXT:    s_mov_b32 s51, 0xe8f000
+; GFX7LESS-DPP-NEXT:    s_add_u32 s48, s48, s11
+; GFX7LESS-DPP-NEXT:    s_addc_u32 s49, s49, 0
 ; GFX7LESS-DPP-NEXT:    s_mov_b32 s14, s10
 ; GFX7LESS-DPP-NEXT:    s_mov_b32 s13, s9
 ; GFX7LESS-DPP-NEXT:    s_mov_b32 s12, s8
@@ -10380,8 +10380,8 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
 ; GFX7LESS-DPP-NEXT:    v_or_b32_e32 v31, v0, v2
 ; GFX7LESS-DPP-NEXT:    s_mov_b64 s[4:5], s[0:1]
 ; GFX7LESS-DPP-NEXT:    s_mov_b64 s[6:7], s[2:3]
-; GFX7LESS-DPP-NEXT:    s_mov_b64 s[0:1], s[40:41]
-; GFX7LESS-DPP-NEXT:    s_mov_b64 s[2:3], s[42:43]
+; GFX7LESS-DPP-NEXT:    s_mov_b64 s[0:1], s[48:49]
+; GFX7LESS-DPP-NEXT:    s_mov_b64 s[2:3], s[50:51]
 ; GFX7LESS-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7LESS-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GFX7LESS-DPP-NEXT:    buffer_load_dwordx2 v[4:5], off, s[36:39], 0
@@ -11270,12 +11270,12 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
 ; GFX7LESS-DPP-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_unsafe_structfp:
 ; GFX7LESS-DPP:       ; %bb.0:
 ; GFX7LESS-DPP-NEXT:    s_mov_b32 s32, 0
-; GFX7LESS-DPP-NEXT:    s_mov_b32 s40, SCRATCH_RSRC_DWORD0
-; GFX7LESS-DPP-NEXT:    s_mov_b32 s41, SCRATCH_RSRC_DWORD1
-; GFX7LESS-DPP-NEXT:    s_mov_b32 s42, -1
-; GFX7LESS-DPP-NEXT:    s_mov_b32 s43, 0xe8f000
-; GFX7LESS-DPP-NEXT:    s_add_u32 s40, s40, s11
-; GFX7LESS-DPP-NEXT:    s_addc_u32 s41, s41, 0
+; GFX7LESS-DPP-NEXT:    s_mov_b32 s48, SCRATCH_RSRC_DWORD0
+; GFX7LESS-DPP-NEXT:    s_mov_b32 s49, SCRATCH_RSRC_DWORD1
+; GFX7LESS-DPP-NEXT:    s_mov_b32 s50, -1
+; GFX7LESS-DPP-NEXT:    s_mov_b32 s51, 0xe8f000
+; GFX7LESS-DPP-NEXT:    s_add_u32 s48, s48, s11
+; GFX7LESS-DPP-NEXT:    s_addc_u32 s49, s49, 0
 ; GFX7LESS-DPP-NEXT:    s_mov_b32 s14, s10
 ; GFX7LESS-DPP-NEXT:    s_mov_b32 s13, s9
 ; GFX7LESS-DPP-NEXT:    s_mov_b32 s12, s8
@@ -11295,8 +11295,8 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
 ; GFX7LESS-DPP-NEXT:    v_or_b32_e32 v31, v0, v2
 ; GFX7LESS-DPP-NEXT:    s_mov_b64 s[4:5], s[0:1]
 ; GFX7LESS-DPP-NEXT:    s_mov_b64 s[6:7], s[2:3]
-; GFX7LESS-DPP-NEXT:    s_mov_b64 s[0:1], s[40:41]
-; GFX7LESS-DPP-NEXT:    s_mov_b64 s[2:3], s[42:43]
+; GFX7LESS-DPP-NEXT:    s_mov_b64 s[0:1], s[48:49]
+; GFX7LESS-DPP-NEXT:    s_mov_b64 s[2:3], s[50:51]
 ; GFX7LESS-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7LESS-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GFX7LESS-DPP-NEXT:    buffer_load_dwordx2 v[4:5], off, s[36:39], 0
@@ -11771,13 +11771,13 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_defau
 ; GFX7LESS-LABEL: global_atomic_fadd_double_uni_address_uni_value_default_scope_strictfp:
 ; GFX7LESS:       ; %bb.0:
 ; GFX7LESS-NEXT:    s_movk_i32 s32, 0x800
-; GFX7LESS-NEXT:    s_mov_b32 s48, SCRATCH_RSRC_DWORD0
-; GFX7LESS-NEXT:    s_mov_b32 s49, SCRATCH_RSRC_DWORD1
-; GFX7LESS-NEXT:    s_mov_b32 s50, -1
-; GFX7LESS-NEXT:    s_mov_b32 s51, 0xe8f000
-; GFX7LESS-NEXT:    s_add_u32 s48, s48, s11
-; GFX7LESS-NEXT:    s_addc_u32 s49, s49, 0
-; GFX7LESS-NEXT:    s_mov_b64 s[40:41], s[0:1]
+; GFX7LESS-NEXT:    s_mov_b32 s64, SCRATCH_RSRC_DWORD0
+; GFX7LESS-NEXT:    s_mov_b32 s65, SCRATCH_RSRC_DWORD1
+; GFX7LESS-NEXT:    s_mov_b32 s66, -1
+; GFX7LESS-NEXT:    s_mov_b32 s67, 0xe8f000
+; GFX7LESS-NEXT:    s_add_u32 s64, s64, s11
+; GFX7LESS-NEXT:    s_addc_u32 s65, s65, 0
+; GFX7LESS-NEXT:    s_mov_b64 s[48:49], s[0:1]
 ; GFX7LESS-NEXT:    v_mbcnt_lo_u32_b32_e64 v5, exec_lo, 0
 ; GFX7LESS-NEXT:    s_bcnt1_i32_b64 s0, exec
 ; GFX7LESS-NEXT:    s_mov_b32 s1, 0x43300000
@@ -11791,15 +11791,15 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_defau
 ; GFX7LESS-NEXT:    s_cbranch_execz .LBB16_3
 ; GFX7LESS-NEXT:  ; %bb.1:
 ; GFX7LESS-NEXT:    s_mov_b32 s33, s10
-; GFX7LESS-NEXT:    s_mov_b32 s42, s9
-; GFX7LESS-NEXT:    s_mov_b32 s43, s8
+; GFX7LESS-NEXT:    s_mov_b32 s50, s9
+; GFX7LESS-NEXT:    s_mov_b32 s51, s8
 ; GFX7LESS-NEXT:    s_mov_b64 s[34:35], s[6:7]
 ; GFX7LESS-NEXT:    s_mov_b64 s[36:37], s[4:5]
 ; GFX7LESS-NEXT:    s_mov_b64 s[38:39], s[2:3]
-; GFX7LESS-NEXT:    s_load_dwordx2 s[44:45], s[4:5], 0x9
+; GFX7LESS-NEXT:    s_load_dwordx2 s[52:53], s[4:5], 0x9
 ; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7LESS-NEXT:    s_load_dwordx2 s[0:1], s[44:45], 0x0
-; GFX7LESS-NEXT:    s_mov_b64 s[46:47], 0
+; GFX7LESS-NEXT:    s_load_dwordx2 s[0:1], s[52:53], 0x0
+; GFX7LESS-NEXT:    s_mov_b64 s[54:55], 0
 ; GFX7LESS-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
 ; GFX7LESS-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
 ; GFX7LESS-NEXT:    v_or_b32_e32 v3, v0, v1
@@ -11811,11 +11811,11 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_defau
 ; GFX7LESS-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX7LESS-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7LESS-NEXT:    v_add_f64 v[2:3], v[0:1], v[41:42]
-; GFX7LESS-NEXT:    buffer_store_dword v1, off, s[48:51], 0 offset:4
-; GFX7LESS-NEXT:    buffer_store_dword v0, off, s[48:51], 0
+; GFX7LESS-NEXT:    buffer_store_dword v1, off, s[64:67], 0 offset:4
+; GFX7LESS-NEXT:    buffer_store_dword v0, off, s[64:67], 0
 ; GFX7LESS-NEXT:    s_add_u32 s8, s36, 44
-; GFX7LESS-NEXT:    buffer_store_dword v3, off, s[48:51], 0 offset:12
-; GFX7LESS-NEXT:    buffer_store_dword v2, off, s[48:51], 0 offset:8
+; GFX7LESS-NEXT:    buffer_store_dword v3, off, s[64:67], 0 offset:12
+; GFX7LESS-NEXT:    buffer_store_dword v2, off, s[64:67], 0 offset:8
 ; GFX7LESS-NEXT:    s_addc_u32 s9, s37, 0
 ; GFX7LESS-NEXT:    s_getpc_b64 s[0:1]
 ; GFX7LESS-NEXT:    s_add_u32 s0, s0, __atomic_compare_exchange at gotpcrel32@lo+4
@@ -11828,40 +11828,40 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_defau
 ; GFX7LESS-NEXT:    v_mov_b32_e32 v5, 8
 ; GFX7LESS-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX7LESS-NEXT:    v_mov_b32_e32 v7, 0
-; GFX7LESS-NEXT:    s_mov_b64 s[4:5], s[40:41]
+; GFX7LESS-NEXT:    s_mov_b64 s[4:5], s[48:49]
 ; GFX7LESS-NEXT:    s_mov_b64 s[6:7], s[38:39]
 ; GFX7LESS-NEXT:    s_mov_b64 s[10:11], s[34:35]
-; GFX7LESS-NEXT:    s_mov_b32 s12, s43
-; GFX7LESS-NEXT:    s_mov_b32 s13, s42
+; GFX7LESS-NEXT:    s_mov_b32 s12, s51
+; GFX7LESS-NEXT:    s_mov_b32 s13, s50
 ; GFX7LESS-NEXT:    s_mov_b32 s14, s33
 ; GFX7LESS-NEXT:    v_mov_b32_e32 v31, v40
-; GFX7LESS-NEXT:    s_mov_b64 s[0:1], s[48:49]
-; GFX7LESS-NEXT:    s_mov_b64 s[2:3], s[50:51]
+; GFX7LESS-NEXT:    s_mov_b64 s[0:1], s[64:65]
+; GFX7LESS-NEXT:    s_mov_b64 s[2:3], s[66:67]
 ; GFX7LESS-NEXT:    s_waitcnt expcnt(0)
-; GFX7LESS-NEXT:    v_mov_b32_e32 v2, s44
-; GFX7LESS-NEXT:    v_mov_b32_e32 v3, s45
+; GFX7LESS-NEXT:    v_mov_b32_e32 v2, s52
+; GFX7LESS-NEXT:    v_mov_b32_e32 v3, s53
 ; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7LESS-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GFX7LESS-NEXT:    v_and_b32_e32 v2, 1, v0
-; GFX7LESS-NEXT:    buffer_load_dword v0, off, s[48:51], 0
-; GFX7LESS-NEXT:    buffer_load_dword v1, off, s[48:51], 0 offset:4
+; GFX7LESS-NEXT:    buffer_load_dword v0, off, s[64:67], 0
+; GFX7LESS-NEXT:    buffer_load_dword v1, off, s[64:67], 0 offset:4
 ; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v2
-; GFX7LESS-NEXT:    s_or_b64 s[46:47], vcc, s[46:47]
-; GFX7LESS-NEXT:    s_andn2_b64 exec, exec, s[46:47]
+; GFX7LESS-NEXT:    s_or_b64 s[54:55], vcc, s[54:55]
+; GFX7LESS-NEXT:    s_andn2_b64 exec, exec, s[54:55]
 ; GFX7LESS-NEXT:    s_cbranch_execnz .LBB16_2
 ; GFX7LESS-NEXT:  .LBB16_3:
 ; GFX7LESS-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: global_atomic_fadd_double_uni_address_uni_value_default_scope_strictfp:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_mov_b32 s48, SCRATCH_RSRC_DWORD0
-; GFX9-NEXT:    s_mov_b32 s49, SCRATCH_RSRC_DWORD1
-; GFX9-NEXT:    s_mov_b32 s50, -1
-; GFX9-NEXT:    s_mov_b32 s51, 0xe00000
-; GFX9-NEXT:    s_add_u32 s48, s48, s11
+; GFX9-NEXT:    s_mov_b32 s64, SCRATCH_RSRC_DWORD0
+; GFX9-NEXT:    s_mov_b32 s65, SCRATCH_RSRC_DWORD1
+; GFX9-NEXT:    s_mov_b32 s66, -1
+; GFX9-NEXT:    s_mov_b32 s67, 0xe00000
+; GFX9-NEXT:    s_add_u32 s64, s64, s11
 ; GFX9-NEXT:    v_mov_b32_e32 v3, 0
-; GFX9-NEXT:    s_addc_u32 s49, s49, 0
-; GFX9-NEXT:    s_mov_b64 s[40:41], s[0:1]
+; GFX9-NEXT:    s_addc_u32 s65, s65, 0
+; GFX9-NEXT:    s_mov_b64 s[48:49], s[0:1]
 ; GFX9-NEXT:    s_bcnt1_i32_b64 s0, exec
 ; GFX9-NEXT:    v_mov_b32_e32 v4, 0xc3300000
 ; GFX9-NEXT:    s_mov_b32 s1, 0x43300000
@@ -11874,20 +11874,20 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_defau
 ; GFX9-NEXT:    s_and_saveexec_b64 s[0:1], vcc
 ; GFX9-NEXT:    s_cbranch_execz .LBB16_3
 ; GFX9-NEXT:  ; %bb.1:
-; GFX9-NEXT:    s_load_dwordx2 s[44:45], s[4:5], 0x24
+; GFX9-NEXT:    s_load_dwordx2 s[52:53], s[4:5], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v3, 20, v2
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 10, v1
 ; GFX9-NEXT:    s_mov_b32 s33, s10
-; GFX9-NEXT:    s_mov_b32 s42, s9
+; GFX9-NEXT:    s_mov_b32 s50, s9
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[44:45], 0x0
-; GFX9-NEXT:    s_mov_b32 s43, s8
+; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[52:53], 0x0
+; GFX9-NEXT:    s_mov_b32 s51, s8
 ; GFX9-NEXT:    s_mov_b64 s[34:35], s[6:7]
 ; GFX9-NEXT:    s_mov_b64 s[36:37], s[4:5]
 ; GFX9-NEXT:    s_mov_b64 s[38:39], s[2:3]
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v2, s1
-; GFX9-NEXT:    s_mov_b64 s[46:47], 0
+; GFX9-NEXT:    s_mov_b64 s[54:55], 0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX9-NEXT:    v_or3_b32 v40, v0, v4, v3
 ; GFX9-NEXT:  .LBB16_2: ; %atomicrmw.start
@@ -11900,48 +11900,48 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_defau
 ; GFX9-NEXT:    s_add_u32 s0, s0, __atomic_compare_exchange at gotpcrel32@lo+4
 ; GFX9-NEXT:    s_addc_u32 s1, s1, __atomic_compare_exchange at gotpcrel32@hi+12
 ; GFX9-NEXT:    s_load_dwordx2 s[16:17], s[0:1], 0x0
-; GFX9-NEXT:    s_mov_b64 s[0:1], s[48:49]
-; GFX9-NEXT:    buffer_store_dword v2, off, s[48:51], 0 offset:4
-; GFX9-NEXT:    buffer_store_dword v1, off, s[48:51], 0
-; GFX9-NEXT:    s_mov_b64 s[4:5], s[40:41]
-; GFX9-NEXT:    buffer_store_dword v4, off, s[48:51], 0 offset:12
-; GFX9-NEXT:    buffer_store_dword v3, off, s[48:51], 0 offset:8
+; GFX9-NEXT:    s_mov_b64 s[0:1], s[64:65]
+; GFX9-NEXT:    buffer_store_dword v2, off, s[64:67], 0 offset:4
+; GFX9-NEXT:    buffer_store_dword v1, off, s[64:67], 0
+; GFX9-NEXT:    s_mov_b64 s[4:5], s[48:49]
+; GFX9-NEXT:    buffer_store_dword v4, off, s[64:67], 0 offset:12
+; GFX9-NEXT:    buffer_store_dword v3, off, s[64:67], 0 offset:8
 ; GFX9-NEXT:    s_mov_b64 s[6:7], s[38:39]
 ; GFX9-NEXT:    s_mov_b64 s[10:11], s[34:35]
-; GFX9-NEXT:    s_mov_b32 s12, s43
-; GFX9-NEXT:    s_mov_b32 s13, s42
+; GFX9-NEXT:    s_mov_b32 s12, s51
+; GFX9-NEXT:    s_mov_b32 s13, s50
 ; GFX9-NEXT:    s_mov_b32 s14, s33
 ; GFX9-NEXT:    v_mov_b32_e32 v31, v40
-; GFX9-NEXT:    s_mov_b64 s[2:3], s[50:51]
+; GFX9-NEXT:    s_mov_b64 s[2:3], s[66:67]
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 8
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0
-; GFX9-NEXT:    v_mov_b32_e32 v2, s44
-; GFX9-NEXT:    v_mov_b32_e32 v3, s45
+; GFX9-NEXT:    v_mov_b32_e32 v2, s52
+; GFX9-NEXT:    v_mov_b32_e32 v3, s53
 ; GFX9-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v5, 8
 ; GFX9-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v7, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[16:17]
-; GFX9-NEXT:    buffer_load_dword v1, off, s[48:51], 0
-; GFX9-NEXT:    buffer_load_dword v2, off, s[48:51], 0 offset:4
+; GFX9-NEXT:    buffer_load_dword v1, off, s[64:67], 0
+; GFX9-NEXT:    buffer_load_dword v2, off, s[64:67], 0 offset:4
 ; GFX9-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX9-NEXT:    s_or_b64 s[46:47], vcc, s[46:47]
-; GFX9-NEXT:    s_andn2_b64 exec, exec, s[46:47]
+; GFX9-NEXT:    s_or_b64 s[54:55], vcc, s[54:55]
+; GFX9-NEXT:    s_andn2_b64 exec, exec, s[54:55]
 ; GFX9-NEXT:    s_cbranch_execnz .LBB16_2
 ; GFX9-NEXT:  .LBB16_3:
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX1064-LABEL: global_atomic_fadd_double_uni_address_uni_value_default_scope_strictfp:
 ; GFX1064:       ; %bb.0:
-; GFX1064-NEXT:    s_mov_b32 s48, SCRATCH_RSRC_DWORD0
-; GFX1064-NEXT:    s_mov_b32 s49, SCRATCH_RSRC_DWORD1
-; GFX1064-NEXT:    s_mov_b32 s50, -1
-; GFX1064-NEXT:    s_mov_b32 s51, 0x31e16000
-; GFX1064-NEXT:    s_add_u32 s48, s48, s11
-; GFX1064-NEXT:    s_addc_u32 s49, s49, 0
-; GFX1064-NEXT:    s_mov_b64 s[40:41], s[0:1]
+; GFX1064-NEXT:    s_mov_b32 s64, SCRATCH_RSRC_DWORD0
+; GFX1064-NEXT:    s_mov_b32 s65, SCRATCH_RSRC_DWORD1
+; GFX1064-NEXT:    s_mov_b32 s66, -1
+; GFX1064-NEXT:    s_mov_b32 s67, 0x31e16000
+; GFX1064-NEXT:    s_add_u32 s64, s64, s11
+; GFX1064-NEXT:    s_addc_u32 s65, s65, 0
+; GFX1064-NEXT:    s_mov_b64 s[48:49], s[0:1]
 ; GFX1064-NEXT:    s_bcnt1_i32_b64 s0, exec
 ; GFX1064-NEXT:    s_mov_b32 s1, 0x43300000
 ; GFX1064-NEXT:    s_movk_i32 s32, 0x800
@@ -11953,19 +11953,19 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_defau
 ; GFX1064-NEXT:    s_and_saveexec_b64 s[0:1], vcc
 ; GFX1064-NEXT:    s_cbranch_execz .LBB16_3
 ; GFX1064-NEXT:  ; %bb.1:
-; GFX1064-NEXT:    s_load_dwordx2 s[44:45], s[4:5], 0x24
+; GFX1064-NEXT:    s_load_dwordx2 s[52:53], s[4:5], 0x24
 ; GFX1064-NEXT:    v_lshlrev_b32_e32 v3, 20, v2
 ; GFX1064-NEXT:    v_lshlrev_b32_e32 v4, 10, v1
 ; GFX1064-NEXT:    s_mov_b32 s33, s10
-; GFX1064-NEXT:    s_mov_b32 s42, s9
-; GFX1064-NEXT:    s_mov_b32 s43, s8
+; GFX1064-NEXT:    s_mov_b32 s50, s9
+; GFX1064-NEXT:    s_mov_b32 s51, s8
 ; GFX1064-NEXT:    s_mov_b64 s[34:35], s[6:7]
 ; GFX1064-NEXT:    v_or3_b32 v40, v0, v4, v3
 ; GFX1064-NEXT:    s_mov_b64 s[36:37], s[4:5]
 ; GFX1064-NEXT:    s_mov_b64 s[38:39], s[2:3]
-; GFX1064-NEXT:    s_mov_b64 s[46:47], 0
+; GFX1064-NEXT:    s_mov_b64 s[54:55], 0
 ; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[44:45], 0x0
+; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[52:53], 0x0
 ; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1064-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX1064-NEXT:    v_mov_b32_e32 v1, s0
@@ -11978,53 +11978,53 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_defau
 ; GFX1064-NEXT:    s_getpc_b64 s[0:1]
 ; GFX1064-NEXT:    s_add_u32 s0, s0, __atomic_compare_exchange at gotpcrel32@lo+4
 ; GFX1064-NEXT:    s_addc_u32 s1, s1, __atomic_compare_exchange at gotpcrel32@hi+12
-; GFX1064-NEXT:    buffer_store_dword v2, off, s[48:51], 0 offset:4
-; GFX1064-NEXT:    buffer_store_dword v1, off, s[48:51], 0
+; GFX1064-NEXT:    buffer_store_dword v2, off, s[64:67], 0 offset:4
+; GFX1064-NEXT:    buffer_store_dword v1, off, s[64:67], 0
 ; GFX1064-NEXT:    s_load_dwordx2 s[16:17], s[0:1], 0x0
 ; GFX1064-NEXT:    v_mov_b32_e32 v31, v40
 ; GFX1064-NEXT:    v_mov_b32_e32 v0, 8
 ; GFX1064-NEXT:    v_mov_b32_e32 v1, 0
-; GFX1064-NEXT:    v_mov_b32_e32 v2, s44
+; GFX1064-NEXT:    v_mov_b32_e32 v2, s52
 ; GFX1064-NEXT:    v_mov_b32_e32 v5, 8
 ; GFX1064-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX1064-NEXT:    v_mov_b32_e32 v7, 0
-; GFX1064-NEXT:    s_mov_b64 s[0:1], s[48:49]
-; GFX1064-NEXT:    s_mov_b64 s[4:5], s[40:41]
+; GFX1064-NEXT:    s_mov_b64 s[0:1], s[64:65]
+; GFX1064-NEXT:    s_mov_b64 s[4:5], s[48:49]
 ; GFX1064-NEXT:    s_mov_b64 s[6:7], s[38:39]
 ; GFX1064-NEXT:    s_mov_b64 s[10:11], s[34:35]
-; GFX1064-NEXT:    s_mov_b32 s12, s43
-; GFX1064-NEXT:    s_mov_b32 s13, s42
+; GFX1064-NEXT:    s_mov_b32 s12, s51
+; GFX1064-NEXT:    s_mov_b32 s13, s50
 ; GFX1064-NEXT:    s_mov_b32 s14, s33
-; GFX1064-NEXT:    s_mov_b64 s[2:3], s[50:51]
-; GFX1064-NEXT:    buffer_store_dword v4, off, s[48:51], 0 offset:12
-; GFX1064-NEXT:    buffer_store_dword v3, off, s[48:51], 0 offset:8
-; GFX1064-NEXT:    v_mov_b32_e32 v3, s45
+; GFX1064-NEXT:    s_mov_b64 s[2:3], s[66:67]
+; GFX1064-NEXT:    buffer_store_dword v4, off, s[64:67], 0 offset:12
+; GFX1064-NEXT:    buffer_store_dword v3, off, s[64:67], 0 offset:8
+; GFX1064-NEXT:    v_mov_b32_e32 v3, s53
 ; GFX1064-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1064-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GFX1064-NEXT:    s_clause 0x1
-; GFX1064-NEXT:    buffer_load_dword v1, off, s[48:51], 0
-; GFX1064-NEXT:    buffer_load_dword v2, off, s[48:51], 0 offset:4
+; GFX1064-NEXT:    buffer_load_dword v1, off, s[64:67], 0
+; GFX1064-NEXT:    buffer_load_dword v2, off, s[64:67], 0 offset:4
 ; GFX1064-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX1064-NEXT:    s_or_b64 s[46:47], vcc, s[46:47]
-; GFX1064-NEXT:    s_andn2_b64 exec, exec, s[46:47]
+; GFX1064-NEXT:    s_or_b64 s[54:55], vcc, s[54:55]
+; GFX1064-NEXT:    s_andn2_b64 exec, exec, s[54:55]
 ; GFX1064-NEXT:    s_cbranch_execnz .LBB16_2
 ; GFX1064-NEXT:  .LBB16_3:
 ; GFX1064-NEXT:    s_endpgm
 ;
 ; GFX1032-LABEL: global_atomic_fadd_double_uni_address_uni_value_default_scope_strictfp:
 ; GFX1032:       ; %bb.0:
-; GFX1032-NEXT:    s_mov_b32 s48, SCRATCH_RSRC_DWORD0
-; GFX1032-NEXT:    s_mov_b32 s49, SCRATCH_RSRC_DWORD1
-; GFX1032-NEXT:    s_mov_b32 s50, -1
-; GFX1032-NEXT:    s_mov_b32 s51, 0x31c16000
-; GFX1032-NEXT:    s_add_u32 s48, s48, s11
-; GFX1032-NEXT:    s_addc_u32 s49, s49, 0
-; GFX1032-NEXT:    s_mov_b64 s[40:41], s[0:1]
+; GFX1032-NEXT:    s_mov_b32 s64, SCRATCH_RSRC_DWORD0
+; GFX1032-NEXT:    s_mov_b32 s65, SCRATCH_RSRC_DWORD1
+; GFX1032-NEXT:    s_mov_b32 s66, -1
+; GFX1032-NEXT:    s_mov_b32 s67, 0x31c16000
+; GFX1032-NEXT:    s_add_u32 s64, s64, s11
+; GFX1032-NEXT:    s_addc_u32 s65, s65, 0
+; GFX1032-NEXT:    s_mov_b64 s[48:49], s[0:1]
 ; GFX1032-NEXT:    s_bcnt1_i32_b32 s0, exec_lo
 ; GFX1032-NEXT:    s_mov_b32 s1, 0x43300000
-; GFX1032-NEXT:    s_mov_b32 s46, 0
+; GFX1032-NEXT:    s_mov_b32 s54, 0
 ; GFX1032-NEXT:    v_add_f64 v[3:4], 0xc3300000, s[0:1]
 ; GFX1032-NEXT:    s_movk_i32 s32, 0x400
 ; GFX1032-NEXT:    v_mul_f64 v[41:42], 4.0, v[3:4]
@@ -12033,18 +12033,18 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_defau
 ; GFX1032-NEXT:    s_and_saveexec_b32 s0, vcc_lo
 ; GFX1032-NEXT:    s_cbranch_execz .LBB16_3
 ; GFX1032-NEXT:  ; %bb.1:
-; GFX1032-NEXT:    s_load_dwordx2 s[44:45], s[4:5], 0x24
+; GFX1032-NEXT:    s_load_dwordx2 s[52:53], s[4:5], 0x24
 ; GFX1032-NEXT:    v_lshlrev_b32_e32 v3, 20, v2
 ; GFX1032-NEXT:    v_lshlrev_b32_e32 v4, 10, v1
 ; GFX1032-NEXT:    s_mov_b32 s33, s10
-; GFX1032-NEXT:    s_mov_b32 s42, s9
-; GFX1032-NEXT:    s_mov_b32 s43, s8
+; GFX1032-NEXT:    s_mov_b32 s50, s9
+; GFX1032-NEXT:    s_mov_b32 s51, s8
 ; GFX1032-NEXT:    s_mov_b64 s[34:35], s[6:7]
 ; GFX1032-NEXT:    v_or3_b32 v40, v0, v4, v3
 ; GFX1032-NEXT:    s_mov_b64 s[36:37], s[4:5]
 ; GFX1032-NEXT:    s_mov_b64 s[38:39], s[2:3]
 ; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[44:45], 0x0
+; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[52:53], 0x0
 ; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1032-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX1032-NEXT:    v_mov_b32_e32 v1, s0
@@ -12057,44 +12057,44 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_defau
 ; GFX1032-NEXT:    s_getpc_b64 s[0:1]
 ; GFX1032-NEXT:    s_add_u32 s0, s0, __atomic_compare_exchange at gotpcrel32@lo+4
 ; GFX1032-NEXT:    s_addc_u32 s1, s1, __atomic_compare_exchange at gotpcrel32@hi+12
-; GFX1032-NEXT:    buffer_store_dword v2, off, s[48:51], 0 offset:4
-; GFX1032-NEXT:    buffer_store_dword v1, off, s[48:51], 0
+; GFX1032-NEXT:    buffer_store_dword v2, off, s[64:67], 0 offset:4
+; GFX1032-NEXT:    buffer_store_dword v1, off, s[64:67], 0
 ; GFX1032-NEXT:    s_load_dwordx2 s[16:17], s[0:1], 0x0
 ; GFX1032-NEXT:    v_mov_b32_e32 v31, v40
 ; GFX1032-NEXT:    v_mov_b32_e32 v0, 8
 ; GFX1032-NEXT:    v_mov_b32_e32 v1, 0
-; GFX1032-NEXT:    v_mov_b32_e32 v2, s44
+; GFX1032-NEXT:    v_mov_b32_e32 v2, s52
 ; GFX1032-NEXT:    v_mov_b32_e32 v5, 8
 ; GFX1032-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX1032-NEXT:    v_mov_b32_e32 v7, 0
-; GFX1032-NEXT:    s_mov_b64 s[0:1], s[48:49]
-; GFX1032-NEXT:    s_mov_b64 s[4:5], s[40:41]
+; GFX1032-NEXT:    s_mov_b64 s[0:1], s[64:65]
+; GFX1032-NEXT:    s_mov_b64 s[4:5], s[48:49]
 ; GFX1032-NEXT:    s_mov_b64 s[6:7], s[38:39]
 ; GFX1032-NEXT:    s_mov_b64 s[10:11], s[34:35]
-; GFX1032-NEXT:    s_mov_b32 s12, s43
-; GFX1032-NEXT:    s_mov_b32 s13, s42
+; GFX1032-NEXT:    s_mov_b32 s12, s51
+; GFX1032-NEXT:    s_mov_b32 s13, s50
 ; GFX1032-NEXT:    s_mov_b32 s14, s33
-; GFX1032-NEXT:    s_mov_b64 s[2:3], s[50:51]
-; GFX1032-NEXT:    buffer_store_dword v4, off, s[48:51], 0 offset:12
-; GFX1032-NEXT:    buffer_store_dword v3, off, s[48:51], 0 offset:8
-; GFX1032-NEXT:    v_mov_b32_e32 v3, s45
+; GFX1032-NEXT:    s_mov_b64 s[2:3], s[66:67]
+; GFX1032-NEXT:    buffer_store_dword v4, off, s[64:67], 0 offset:12
+; GFX1032-NEXT:    buffer_store_dword v3, off, s[64:67], 0 offset:8
+; GFX1032-NEXT:    v_mov_b32_e32 v3, s53
 ; GFX1032-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1032-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GFX1032-NEXT:    s_clause 0x1
-; GFX1032-NEXT:    buffer_load_dword v1, off, s[48:51], 0
-; GFX1032-NEXT:    buffer_load_dword v2, off, s[48:51], 0 offset:4
+; GFX1032-NEXT:    buffer_load_dword v1, off, s[64:67], 0
+; GFX1032-NEXT:    buffer_load_dword v2, off, s[64:67], 0 offset:4
 ; GFX1032-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX1032-NEXT:    s_or_b32 s46, vcc_lo, s46
-; GFX1032-NEXT:    s_andn2_b32 exec_lo, exec_lo, s46
+; GFX1032-NEXT:    s_or_b32 s54, vcc_lo, s54
+; GFX1032-NEXT:    s_andn2_b32 exec_lo, exec_lo, s54
 ; GFX1032-NEXT:    s_cbranch_execnz .LBB16_2
 ; GFX1032-NEXT:  .LBB16_3:
 ; GFX1032-NEXT:    s_endpgm
 ;
 ; GFX1164-LABEL: global_atomic_fadd_double_uni_address_uni_value_default_scope_strictfp:
 ; GFX1164:       ; %bb.0:
-; GFX1164-NEXT:    s_mov_b64 s[40:41], s[0:1]
+; GFX1164-NEXT:    s_mov_b64 s[48:49], s[0:1]
 ; GFX1164-NEXT:    s_bcnt1_i32_b64 s0, exec
 ; GFX1164-NEXT:    v_mov_b32_e32 v40, v0
 ; GFX1164-NEXT:    v_mov_b32_e32 v0, 0x43300000
@@ -12115,16 +12115,16 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_defau
 ; GFX1164-NEXT:    v_cmpx_eq_u32_e32 0, v0
 ; GFX1164-NEXT:    s_cbranch_execz .LBB16_3
 ; GFX1164-NEXT:  ; %bb.1:
-; GFX1164-NEXT:    s_load_b64 s[44:45], s[4:5], 0x24
+; GFX1164-NEXT:    s_load_b64 s[52:53], s[4:5], 0x24
 ; GFX1164-NEXT:    s_mov_b32 s33, s10
-; GFX1164-NEXT:    s_mov_b32 s42, s9
-; GFX1164-NEXT:    s_mov_b32 s43, s8
+; GFX1164-NEXT:    s_mov_b32 s50, s9
+; GFX1164-NEXT:    s_mov_b32 s51, s8
 ; GFX1164-NEXT:    s_mov_b64 s[34:35], s[6:7]
 ; GFX1164-NEXT:    s_mov_b64 s[36:37], s[4:5]
 ; GFX1164-NEXT:    s_mov_b64 s[38:39], s[2:3]
-; GFX1164-NEXT:    s_mov_b64 s[46:47], 0
+; GFX1164-NEXT:    s_mov_b64 s[54:55], 0
 ; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT:    s_load_b64 s[0:1], s[44:45], 0x0
+; GFX1164-NEXT:    s_load_b64 s[0:1], s[52:53], 0x0
 ; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1164-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX1164-NEXT:    v_mov_b32_e32 v1, s0
@@ -12145,18 +12145,18 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_defau
 ; GFX1164-NEXT:    v_mov_b32_e32 v31, v40
 ; GFX1164-NEXT:    v_mov_b32_e32 v0, 8
 ; GFX1164-NEXT:    v_mov_b32_e32 v1, 0
-; GFX1164-NEXT:    v_mov_b32_e32 v2, s44
+; GFX1164-NEXT:    v_mov_b32_e32 v2, s52
 ; GFX1164-NEXT:    v_mov_b32_e32 v5, 8
 ; GFX1164-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX1164-NEXT:    v_mov_b32_e32 v7, 0
-; GFX1164-NEXT:    s_mov_b64 s[4:5], s[40:41]
+; GFX1164-NEXT:    s_mov_b64 s[4:5], s[48:49]
 ; GFX1164-NEXT:    s_mov_b64 s[6:7], s[38:39]
 ; GFX1164-NEXT:    s_mov_b64 s[10:11], s[34:35]
-; GFX1164-NEXT:    s_mov_b32 s12, s43
-; GFX1164-NEXT:    s_mov_b32 s13, s42
+; GFX1164-NEXT:    s_mov_b32 s12, s51
+; GFX1164-NEXT:    s_mov_b32 s13, s50
 ; GFX1164-NEXT:    s_mov_b32 s14, s33
 ; GFX1164-NEXT:    scratch_store_b64 off, v[3:4], off offset:8
-; GFX1164-NEXT:    v_mov_b32_e32 v3, s45
+; GFX1164-NEXT:    v_mov_b32_e32 v3, s53
 ; GFX1164-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1164-NEXT:    s_swappc_b64 s[30:31], s[0:1]
@@ -12164,8 +12164,8 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_defau
 ; GFX1164-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
 ; GFX1164-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX1164-NEXT:    s_or_b64 s[46:47], vcc, s[46:47]
-; GFX1164-NEXT:    s_and_not1_b64 exec, exec, s[46:47]
+; GFX1164-NEXT:    s_or_b64 s[54:55], vcc, s[54:55]
+; GFX1164-NEXT:    s_and_not1_b64 exec, exec, s[54:55]
 ; GFX1164-NEXT:    s_cbranch_execnz .LBB16_2
 ; GFX1164-NEXT:  .LBB16_3:
 ; GFX1164-NEXT:    s_set_inst_prefetch_distance 0x2
@@ -12173,12 +12173,12 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_defau
 ;
 ; GFX1132-LABEL: global_atomic_fadd_double_uni_address_uni_value_default_scope_strictfp:
 ; GFX1132:       ; %bb.0:
-; GFX1132-NEXT:    s_mov_b64 s[40:41], s[0:1]
+; GFX1132-NEXT:    s_mov_b64 s[48:49], s[0:1]
 ; GFX1132-NEXT:    s_bcnt1_i32_b32 s0, exec_lo
 ; GFX1132-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX1132-NEXT:    v_dual_mov_b32 v40, v0 :: v_dual_mov_b32 v1, s0
 ; GFX1132-NEXT:    v_mov_b32_e32 v0, 0x43300000
-; GFX1132-NEXT:    s_mov_b32 s46, 0
+; GFX1132-NEXT:    s_mov_b32 s54, 0
 ; GFX1132-NEXT:    s_clause 0x1
 ; GFX1132-NEXT:    scratch_store_b32 off, v0, off offset:20
 ; GFX1132-NEXT:    scratch_store_b32 off, v1, off offset:16
@@ -12193,15 +12193,15 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_defau
 ; GFX1132-NEXT:    v_cmpx_eq_u32_e32 0, v0
 ; GFX1132-NEXT:    s_cbranch_execz .LBB16_3
 ; GFX1132-NEXT:  ; %bb.1:
-; GFX1132-NEXT:    s_load_b64 s[44:45], s[4:5], 0x24
+; GFX1132-NEXT:    s_load_b64 s[52:53], s[4:5], 0x24
 ; GFX1132-NEXT:    s_mov_b32 s33, s15
-; GFX1132-NEXT:    s_mov_b32 s42, s14
-; GFX1132-NEXT:    s_mov_b32 s43, s13
+; GFX1132-NEXT:    s_mov_b32 s50, s14
+; GFX1132-NEXT:    s_mov_b32 s51, s13
 ; GFX1132-NEXT:    s_mov_b64 s[34:35], s[6:7]
 ; GFX1132-NEXT:    s_mov_b64 s[36:37], s[4:5]
 ; GFX1132-NEXT:    s_mov_b64 s[38:39], s[2:3]
 ; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT:    s_load_b64 s[0:1], s[44:45], 0x0
+; GFX1132-NEXT:    s_load_b64 s[0:1], s[52:53], 0x0
 ; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132-NEXT:    v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v1, s0
 ; GFX1132-NEXT:    s_set_inst_prefetch_distance 0x1
@@ -12219,25 +12219,25 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_defau
 ; GFX1132-NEXT:    scratch_store_b64 off, v[1:2], off
 ; GFX1132-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
 ; GFX1132-NEXT:    v_dual_mov_b32 v31, v40 :: v_dual_mov_b32 v0, 8
-; GFX1132-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s44
+; GFX1132-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s52
 ; GFX1132-NEXT:    v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0
 ; GFX1132-NEXT:    v_mov_b32_e32 v7, 0
-; GFX1132-NEXT:    s_mov_b64 s[4:5], s[40:41]
+; GFX1132-NEXT:    s_mov_b64 s[4:5], s[48:49]
 ; GFX1132-NEXT:    s_mov_b64 s[6:7], s[38:39]
 ; GFX1132-NEXT:    s_mov_b64 s[10:11], s[34:35]
-; GFX1132-NEXT:    s_mov_b32 s12, s43
-; GFX1132-NEXT:    s_mov_b32 s13, s42
+; GFX1132-NEXT:    s_mov_b32 s12, s51
+; GFX1132-NEXT:    s_mov_b32 s13, s50
 ; GFX1132-NEXT:    s_mov_b32 s14, s33
 ; GFX1132-NEXT:    scratch_store_b64 off, v[3:4], off offset:8
-; GFX1132-NEXT:    v_dual_mov_b32 v3, s45 :: v_dual_mov_b32 v4, 0
+; GFX1132-NEXT:    v_dual_mov_b32 v3, s53 :: v_dual_mov_b32 v4, 0
 ; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX1132-NEXT:    scratch_load_b64 v[1:2], off, off
 ; GFX1132-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
 ; GFX1132-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX1132-NEXT:    s_or_b32 s46, vcc_lo, s46
-; GFX1132-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s46
+; GFX1132-NEXT:    s_or_b32 s54, vcc_lo, s54
+; GFX1132-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s54
 ; GFX1132-NEXT:    s_cbranch_execnz .LBB16_2
 ; GFX1132-NEXT:  .LBB16_3:
 ; GFX1132-NEXT:    s_set_inst_prefetch_distance 0x2
@@ -12246,13 +12246,13 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_defau
 ; GFX7LESS-DPP-LABEL: global_atomic_fadd_double_uni_address_uni_value_default_scope_strictfp:
 ; GFX7LESS-DPP:       ; %bb.0:
 ; GFX7LESS-DPP-NEXT:    s_movk_i32 s32, 0x800
-; GFX7LESS-DPP-NEXT:    s_mov_b32 s48, SCRATCH_RSRC_DWORD0
-; GFX7LESS-DPP-NEXT:    s_mov_b32 s49, SCRATCH_RSRC_DWORD1
-; GFX7LESS-DPP-NEXT:    s_mov_b32 s50, -1
-; GFX7LESS-DPP-NEXT:    s_mov_b32 s51, 0xe8f000
-; GFX7LESS-DPP-NEXT:    s_add_u32 s48, s48, s11
-; GFX7LESS-DPP-NEXT:    s_addc_u32 s49, s49, 0
-; GFX7LESS-DPP-NEXT:    s_mov_b64 s[40:41], s[0:1]
+; GFX7LESS-DPP-NEXT:    s_mov_b32 s64, SCRATCH_RSRC_DWORD0
+; GFX7LESS-DPP-NEXT:    s_mov_b32 s65, SCRATCH_RSRC_DWORD1
+; GFX7LESS-DPP-NEXT:    s_mov_b32 s66, -1
+; GFX7LESS-DPP-NEXT:    s_mov_b32 s67, 0xe8f000
+; GFX7LESS-DPP-NEXT:    s_add_u32 s64, s64, s11
+; GFX7LESS-DPP-NEXT:    s_addc_u32 s65, s65, 0
+; GFX7LESS-DPP-NEXT:    s_mov_b64 s[48:49], s[0:1]
 ; GFX7LESS-DPP-NEXT:    v_mbcnt_lo_u32_b32_e64 v5, exec_lo, 0
 ; GFX7LESS-DPP-NEXT:    s_bcnt1_i32_b64 s0, exec
 ; GFX7LESS-DPP-NEXT:    s_mov_b32 s1, 0x43300000
@@ -12266,15 +12266,15 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_defau
 ; GFX7LESS-DPP-NEXT:    s_cbranch_execz .LBB16_3
 ; GFX7LESS-DPP-NEXT:  ; %bb.1:
 ; GFX7LESS-DPP-NEXT:    s_mov_b32 s33, s10
-; GFX7LESS-DPP-NEXT:    s_mov_b32 s42, s9
-; GFX7LESS-DPP-NEXT:    s_mov_b32 s43, s8
+; GFX7LESS-DPP-NEXT:    s_mov_b32 s50, s9
+; GFX7LESS-DPP-NEXT:    s_mov_b32 s51, s8
 ; GFX7LESS-DPP-NEXT:    s_mov_b64 s[34:35], s[6:7]
 ; GFX7LESS-DPP-NEXT:    s_mov_b64 s[36:37], s[4:5]
 ; GFX7LESS-DPP-NEXT:    s_mov_b64 s[38:39], s[2:3]
-; GFX7LESS-DPP-NEXT:    s_load_dwordx2 s[44:45], s[4:5], 0x9
+; GFX7LESS-DPP-NEXT:    s_load_dwordx2 s[52:53], s[4:5], 0x9
 ; GFX7LESS-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7LESS-DPP-NEXT:    s_load_dwordx2 s[0:1], s[44:45], 0x0
-; GFX7LESS-DPP-NEXT:    s_mov_b64 s[46:47], 0
+; GFX7LESS-DPP-NEXT:    s_load_dwordx2 s[0:1], s[52:53], 0x0
+; GFX7LESS-DPP-NEXT:    s_mov_b64 s[54:55], 0
 ; GFX7LESS-DPP-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
 ; GFX7LESS-DPP-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
 ; GFX7LESS-DPP-NEXT:    v_or_b32_e32 v3, v0, v1
@@ -12286,11 +12286,11 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_defau
 ; GFX7LESS-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX7LESS-DPP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7LESS-DPP-NEXT:    v_add_f64 v[2:3], v[0:1], v[41:42]
-; GFX7LESS-DPP-NEXT:    buffer_store_dword v1, off, s[48:51], 0 offset:4
-; GFX7LESS-DPP-NEXT:    buffer_store_dword v0, off, s[48:51], 0
+; GFX7LESS-DPP-NEXT:    buffer_store_dword v1, off, s[64:67], 0 offset:4
+; GFX7LESS-DPP-NEXT:    buffer_store_dword v0, off, s[64:67], 0
 ; GFX7LESS-DPP-NEXT:    s_add_u32 s8, s36, 44
-; GFX7LESS-DPP-NEXT:    buffer_store_dword v3, off, s[48:51], 0 offset:12
-; GFX7LESS-DPP-NEXT:    buffer_store_dword v2, off, s[48:51], 0 offset:8
+; GFX7LESS-DPP-NEXT:    buffer_store_dword v3, off, s[64:67], 0 offset:12
+; GFX7LESS-DPP-NEXT:    buffer_store_dword v2, off, s[64:67], 0 offset:8
 ; GFX7LESS-DPP-NEXT:    s_addc_u32 s9, s37, 0
 ; GFX7LESS-DPP-NEXT:    s_getpc_b64 s[0:1]
 ; GFX7LESS-DPP-NEXT:    s_add_u32 s0, s0, __atomic_compare_exchange at gotpcrel32@lo+4
@@ -12303,40 +12303,40 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_defau
 ; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v5, 8
 ; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v7, 0
-; GFX7LESS-DPP-NEXT:    s_mov_b64 s[4:5], s[40:41]
+; GFX7LESS-DPP-NEXT:    s_mov_b64 s[4:5], s[48:49]
 ; GFX7LESS-DPP-NEXT:    s_mov_b64 s[6:7], s[38:39]
 ; GFX7LESS-DPP-NEXT:    s_mov_b64 s[10:11], s[34:35]
-; GFX7LESS-DPP-NEXT:    s_mov_b32 s12, s43
-; GFX7LESS-DPP-NEXT:    s_mov_b32 s13, s42
+; GFX7LESS-DPP-NEXT:    s_mov_b32 s12, s51
+; GFX7LESS-DPP-NEXT:    s_mov_b32 s13, s50
 ; GFX7LESS-DPP-NEXT:    s_mov_b32 s14, s33
 ; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v31, v40
-; GFX7LESS-DPP-NEXT:    s_mov_b64 s[0:1], s[48:49]
-; GFX7LESS-DPP-NEXT:    s_mov_b64 s[2:3], s[50:51]
+; GFX7LESS-DPP-NEXT:    s_mov_b64 s[0:1], s[64:65]
+; GFX7LESS-DPP-NEXT:    s_mov_b64 s[2:3], s[66:67]
 ; GFX7LESS-DPP-NEXT:    s_waitcnt expcnt(0)
-; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v2, s44
-; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v3, s45
+; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v2, s52
+; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v3, s53
 ; GFX7LESS-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7LESS-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GFX7LESS-DPP-NEXT:    v_and_b32_e32 v2, 1, v0
-; GFX7LESS-DPP-NEXT:    buffer_load_dword v0, off, s[48:51], 0
-; GFX7LESS-DPP-NEXT:    buffer_load_dword v1, off, s[48:51], 0 offset:4
+; GFX7LESS-DPP-NEXT:    buffer_load_dword v0, off, s[64:67], 0
+; GFX7LESS-DPP-NEXT:    buffer_load_dword v1, off, s[64:67], 0 offset:4
 ; GFX7LESS-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v2
-; GFX7LESS-DPP-NEXT:    s_or_b64 s[46:47], vcc, s[46:47]
-; GFX7LESS-DPP-NEXT:    s_andn2_b64 exec, exec, s[46:47]
+; GFX7LESS-DPP-NEXT:    s_or_b64 s[54:55], vcc, s[54:55]
+; GFX7LESS-DPP-NEXT:    s_andn2_b64 exec, exec, s[54:55]
 ; GFX7LESS-DPP-NEXT:    s_cbranch_execnz .LBB16_2
 ; GFX7LESS-DPP-NEXT:  .LBB16_3:
 ; GFX7LESS-DPP-NEXT:    s_endpgm
 ;
 ; GFX9-DPP-LABEL: global_atomic_fadd_double_uni_address_uni_value_default_scope_strictfp:
 ; GFX9-DPP:       ; %bb.0:
-; GFX9-DPP-NEXT:    s_mov_b32 s48, SCRATCH_RSRC_DWORD0
-; GFX9-DPP-NEXT:    s_mov_b32 s49, SCRATCH_RSRC_DWORD1
-; GFX9-DPP-NEXT:    s_mov_b32 s50, -1
-; GFX9-DPP-NEXT:    s_mov_b32 s51, 0xe00000
-; GFX9-DPP-NEXT:    s_add_u32 s48, s48, s11
+; GFX9-DPP-NEXT:    s_mov_b32 s64, SCRATCH_RSRC_DWORD0
+; GFX9-DPP-NEXT:    s_mov_b32 s65, SCRATCH_RSRC_DWORD1
+; GFX9-DPP-NEXT:    s_mov_b32 s66, -1
+; GFX9-DPP-NEXT:    s_mov_b32 s67, 0xe00000
+; GFX9-DPP-NEXT:    s_add_u32 s64, s64, s11
 ; GFX9-DPP-NEXT:    v_mov_b32_e32 v3, 0
-; GFX9-DPP-NEXT:    s_addc_u32 s49, s49, 0
-; GFX9-DPP-NEXT:    s_mov_b64 s[40:41], s[0:1]
+; GFX9-DPP-NEXT:    s_addc_u32 s65, s65, 0
+; GFX9-DPP-NEXT:    s_mov_b64 s[48:49], s[0:1]
 ; GFX9-DPP-NEXT:    s_bcnt1_i32_b64 s0, exec
 ; GFX9-DPP-NEXT:    v_mov_b32_e32 v4, 0xc3300000
 ; GFX9-DPP-NEXT:    s_mov_b32 s1, 0x43300000
@@ -12349,20 +12349,20 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_defau
 ; GFX9-DPP-NEXT:    s_and_saveexec_b64 s[0:1], vcc
 ; GFX9-DPP-NEXT:    s_cbranch_execz .LBB16_3
 ; GFX9-DPP-NEXT:  ; %bb.1:
-; GFX9-DPP-NEXT:    s_load_dwordx2 s[44:45], s[4:5], 0x24
+; GFX9-DPP-NEXT:    s_load_dwordx2 s[52:53], s[4:5], 0x24
 ; GFX9-DPP-NEXT:    v_lshlrev_b32_e32 v3, 20, v2
 ; GFX9-DPP-NEXT:    v_lshlrev_b32_e32 v4, 10, v1
 ; GFX9-DPP-NEXT:    s_mov_b32 s33, s10
-; GFX9-DPP-NEXT:    s_mov_b32 s42, s9
+; GFX9-DPP-NEXT:    s_mov_b32 s50, s9
 ; GFX9-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-DPP-NEXT:    s_load_dwordx2 s[0:1], s[44:45], 0x0
-; GFX9-DPP-NEXT:    s_mov_b32 s43, s8
+; GFX9-DPP-NEXT:    s_load_dwordx2 s[0:1], s[52:53], 0x0
+; GFX9-DPP-NEXT:    s_mov_b32 s51, s8
 ; GFX9-DPP-NEXT:    s_mov_b64 s[34:35], s[6:7]
 ; GFX9-DPP-NEXT:    s_mov_b64 s[36:37], s[4:5]
 ; GFX9-DPP-NEXT:    s_mov_b64 s[38:39], s[2:3]
 ; GFX9-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-DPP-NEXT:    v_mov_b32_e32 v2, s1
-; GFX9-DPP-NEXT:    s_mov_b64 s[46:47], 0
+; GFX9-DPP-NEXT:    s_mov_b64 s[54:55], 0
 ; GFX9-DPP-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX9-DPP-NEXT:    v_or3_b32 v40, v0, v4, v3
 ; GFX9-DPP-NEXT:  .LBB16_2: ; %atomicrmw.start
@@ -12375,48 +12375,48 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_defau
 ; GFX9-DPP-NEXT:    s_add_u32 s0, s0, __atomic_compare_exchange at gotpcrel32@lo+4
 ; GFX9-DPP-NEXT:    s_addc_u32 s1, s1, __atomic_compare_exchange at gotpcrel32@hi+12
 ; GFX9-DPP-NEXT:    s_load_dwordx2 s[16:17], s[0:1], 0x0
-; GFX9-DPP-NEXT:    s_mov_b64 s[0:1], s[48:49]
-; GFX9-DPP-NEXT:    buffer_store_dword v2, off, s[48:51], 0 offset:4
-; GFX9-DPP-NEXT:    buffer_store_dword v1, off, s[48:51], 0
-; GFX9-DPP-NEXT:    s_mov_b64 s[4:5], s[40:41]
-; GFX9-DPP-NEXT:    buffer_store_dword v4, off, s[48:51], 0 offset:12
-; GFX9-DPP-NEXT:    buffer_store_dword v3, off, s[48:51], 0 offset:8
+; GFX9-DPP-NEXT:    s_mov_b64 s[0:1], s[64:65]
+; GFX9-DPP-NEXT:    buffer_store_dword v2, off, s[64:67], 0 offset:4
+; GFX9-DPP-NEXT:    buffer_store_dword v1, off, s[64:67], 0
+; GFX9-DPP-NEXT:    s_mov_b64 s[4:5], s[48:49]
+; GFX9-DPP-NEXT:    buffer_store_dword v4, off, s[64:67], 0 offset:12
+; GFX9-DPP-NEXT:    buffer_store_dword v3, off, s[64:67], 0 offset:8
 ; GFX9-DPP-NEXT:    s_mov_b64 s[6:7], s[38:39]
 ; GFX9-DPP-NEXT:    s_mov_b64 s[10:11], s[34:35]
-; GFX9-DPP-NEXT:    s_mov_b32 s12, s43
-; GFX9-DPP-NEXT:    s_mov_b32 s13, s42
+; GFX9-DPP-NEXT:    s_mov_b32 s12, s51
+; GFX9-DPP-NEXT:    s_mov_b32 s13, s50
 ; GFX9-DPP-NEXT:    s_mov_b32 s14, s33
 ; GFX9-DPP-NEXT:    v_mov_b32_e32 v31, v40
-; GFX9-DPP-NEXT:    s_mov_b64 s[2:3], s[50:51]
+; GFX9-DPP-NEXT:    s_mov_b64 s[2:3], s[66:67]
 ; GFX9-DPP-NEXT:    v_mov_b32_e32 v0, 8
 ; GFX9-DPP-NEXT:    v_mov_b32_e32 v1, 0
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v2, s44
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v3, s45
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v2, s52
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v3, s53
 ; GFX9-DPP-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX9-DPP-NEXT:    v_mov_b32_e32 v5, 8
 ; GFX9-DPP-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX9-DPP-NEXT:    v_mov_b32_e32 v7, 0
 ; GFX9-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
-; GFX9-DPP-NEXT:    buffer_load_dword v1, off, s[48:51], 0
-; GFX9-DPP-NEXT:    buffer_load_dword v2, off, s[48:51], 0 offset:4
+; GFX9-DPP-NEXT:    buffer_load_dword v1, off, s[64:67], 0
+; GFX9-DPP-NEXT:    buffer_load_dword v2, off, s[64:67], 0 offset:4
 ; GFX9-DPP-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX9-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX9-DPP-NEXT:    s_or_b64 s[46:47], vcc, s[46:47]
-; GFX9-DPP-NEXT:    s_andn2_b64 exec, exec, s[46:47]
+; GFX9-DPP-NEXT:    s_or_b64 s[54:55], vcc, s[54:55]
+; GFX9-DPP-NEXT:    s_andn2_b64 exec, exec, s[54:55]
 ; GFX9-DPP-NEXT:    s_cbranch_execnz .LBB16_2
 ; GFX9-DPP-NEXT:  .LBB16_3:
 ; GFX9-DPP-NEXT:    s_endpgm
 ;
 ; GFX1064-DPP-LABEL: global_atomic_fadd_double_uni_address_uni_value_default_scope_strictfp:
 ; GFX1064-DPP:       ; %bb.0:
-; GFX1064-DPP-NEXT:    s_mov_b32 s48, SCRATCH_RSRC_DWORD0
-; GFX1064-DPP-NEXT:    s_mov_b32 s49, SCRATCH_RSRC_DWORD1
-; GFX1064-DPP-NEXT:    s_mov_b32 s50, -1
-; GFX1064-DPP-NEXT:    s_mov_b32 s51, 0x31e16000
-; GFX1064-DPP-NEXT:    s_add_u32 s48, s48, s11
-; GFX1064-DPP-NEXT:    s_addc_u32 s49, s49, 0
-; GFX1064-DPP-NEXT:    s_mov_b64 s[40:41], s[0:1]
+; GFX1064-DPP-NEXT:    s_mov_b32 s64, SCRATCH_RSRC_DWORD0
+; GFX1064-DPP-NEXT:    s_mov_b32 s65, SCRATCH_RSRC_DWORD1
+; GFX1064-DPP-NEXT:    s_mov_b32 s66, -1
+; GFX1064-DPP-NEXT:    s_mov_b32 s67, 0x31e16000
+; GFX1064-DPP-NEXT:    s_add_u32 s64, s64, s11
+; GFX1064-DPP-NEXT:    s_addc_u32 s65, s65, 0
+; GFX1064-DPP-NEXT:    s_mov_b64 s[48:49], s[0:1]
 ; GFX1064-DPP-NEXT:    s_bcnt1_i32_b64 s0, exec
 ; GFX1064-DPP-NEXT:    s_mov_b32 s1, 0x43300000
 ; GFX1064-DPP-NEXT:    s_movk_i32 s32, 0x800
@@ -12428,19 +12428,19 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_defau
 ; GFX1064-DPP-NEXT:    s_and_saveexec_b64 s[0:1], vcc
 ; GFX1064-DPP-NEXT:    s_cbranch_execz .LBB16_3
 ; GFX1064-DPP-NEXT:  ; %bb.1:
-; GFX1064-DPP-NEXT:    s_load_dwordx2 s[44:45], s[4:5], 0x24
+; GFX1064-DPP-NEXT:    s_load_dwordx2 s[52:53], s[4:5], 0x24
 ; GFX1064-DPP-NEXT:    v_lshlrev_b32_e32 v3, 20, v2
 ; GFX1064-DPP-NEXT:    v_lshlrev_b32_e32 v4, 10, v1
 ; GFX1064-DPP-NEXT:    s_mov_b32 s33, s10
-; GFX1064-DPP-NEXT:    s_mov_b32 s42, s9
-; GFX1064-DPP-NEXT:    s_mov_b32 s43, s8
+; GFX1064-DPP-NEXT:    s_mov_b32 s50, s9
+; GFX1064-DPP-NEXT:    s_mov_b32 s51, s8
 ; GFX1064-DPP-NEXT:    s_mov_b64 s[34:35], s[6:7]
 ; GFX1064-DPP-NEXT:    v_or3_b32 v40, v0, v4, v3
 ; GFX1064-DPP-NEXT:    s_mov_b64 s[36:37], s[4:5]
 ; GFX1064-DPP-NEXT:    s_mov_b64 s[38:39], s[2:3]
-; GFX1064-DPP-NEXT:    s_mov_b64 s[46:47], 0
+; GFX1064-DPP-NEXT:    s_mov_b64 s[54:55], 0
 ; GFX1064-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1064-DPP-NEXT:    s_load_dwordx2 s[0:1], s[44:45], 0x0
+; GFX1064-DPP-NEXT:    s_load_dwordx2 s[0:1], s[52:53], 0x0
 ; GFX1064-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1064-DPP-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX1064-DPP-NEXT:    v_mov_b32_e32 v1, s0
@@ -12453,53 +12453,53 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_defau
 ; GFX1064-DPP-NEXT:    s_getpc_b64 s[0:1]
 ; GFX1064-DPP-NEXT:    s_add_u32 s0, s0, __atomic_compare_exchange at gotpcrel32@lo+4
 ; GFX1064-DPP-NEXT:    s_addc_u32 s1, s1, __atomic_compare_exchange at gotpcrel32@hi+12
-; GFX1064-DPP-NEXT:    buffer_store_dword v2, off, s[48:51], 0 offset:4
-; GFX1064-DPP-NEXT:    buffer_store_dword v1, off, s[48:51], 0
+; GFX1064-DPP-NEXT:    buffer_store_dword v2, off, s[64:67], 0 offset:4
+; GFX1064-DPP-NEXT:    buffer_store_dword v1, off, s[64:67], 0
 ; GFX1064-DPP-NEXT:    s_load_dwordx2 s[16:17], s[0:1], 0x0
 ; GFX1064-DPP-NEXT:    v_mov_b32_e32 v31, v40
 ; GFX1064-DPP-NEXT:    v_mov_b32_e32 v0, 8
 ; GFX1064-DPP-NEXT:    v_mov_b32_e32 v1, 0
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v2, s44
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v2, s52
 ; GFX1064-DPP-NEXT:    v_mov_b32_e32 v5, 8
 ; GFX1064-DPP-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX1064-DPP-NEXT:    v_mov_b32_e32 v7, 0
-; GFX1064-DPP-NEXT:    s_mov_b64 s[0:1], s[48:49]
-; GFX1064-DPP-NEXT:    s_mov_b64 s[4:5], s[40:41]
+; GFX1064-DPP-NEXT:    s_mov_b64 s[0:1], s[64:65]
+; GFX1064-DPP-NEXT:    s_mov_b64 s[4:5], s[48:49]
 ; GFX1064-DPP-NEXT:    s_mov_b64 s[6:7], s[38:39]
 ; GFX1064-DPP-NEXT:    s_mov_b64 s[10:11], s[34:35]
-; GFX1064-DPP-NEXT:    s_mov_b32 s12, s43
-; GFX1064-DPP-NEXT:    s_mov_b32 s13, s42
+; GFX1064-DPP-NEXT:    s_mov_b32 s12, s51
+; GFX1064-DPP-NEXT:    s_mov_b32 s13, s50
 ; GFX1064-DPP-NEXT:    s_mov_b32 s14, s33
-; GFX1064-DPP-NEXT:    s_mov_b64 s[2:3], s[50:51]
-; GFX1064-DPP-NEXT:    buffer_store_dword v4, off, s[48:51], 0 offset:12
-; GFX1064-DPP-NEXT:    buffer_store_dword v3, off, s[48:51], 0 offset:8
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v3, s45
+; GFX1064-DPP-NEXT:    s_mov_b64 s[2:3], s[66:67]
+; GFX1064-DPP-NEXT:    buffer_store_dword v4, off, s[64:67], 0 offset:12
+; GFX1064-DPP-NEXT:    buffer_store_dword v3, off, s[64:67], 0 offset:8
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v3, s53
 ; GFX1064-DPP-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX1064-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1064-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GFX1064-DPP-NEXT:    s_clause 0x1
-; GFX1064-DPP-NEXT:    buffer_load_dword v1, off, s[48:51], 0
-; GFX1064-DPP-NEXT:    buffer_load_dword v2, off, s[48:51], 0 offset:4
+; GFX1064-DPP-NEXT:    buffer_load_dword v1, off, s[64:67], 0
+; GFX1064-DPP-NEXT:    buffer_load_dword v2, off, s[64:67], 0 offset:4
 ; GFX1064-DPP-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX1064-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX1064-DPP-NEXT:    s_or_b64 s[46:47], vcc, s[46:47]
-; GFX1064-DPP-NEXT:    s_andn2_b64 exec, exec, s[46:47]
+; GFX1064-DPP-NEXT:    s_or_b64 s[54:55], vcc, s[54:55]
+; GFX1064-DPP-NEXT:    s_andn2_b64 exec, exec, s[54:55]
 ; GFX1064-DPP-NEXT:    s_cbranch_execnz .LBB16_2
 ; GFX1064-DPP-NEXT:  .LBB16_3:
 ; GFX1064-DPP-NEXT:    s_endpgm
 ;
 ; GFX1032-DPP-LABEL: global_atomic_fadd_double_uni_address_uni_value_default_scope_strictfp:
 ; GFX1032-DPP:       ; %bb.0:
-; GFX1032-DPP-NEXT:    s_mov_b32 s48, SCRATCH_RSRC_DWORD0
-; GFX1032-DPP-NEXT:    s_mov_b32 s49, SCRATCH_RSRC_DWORD1
-; GFX1032-DPP-NEXT:    s_mov_b32 s50, -1
-; GFX1032-DPP-NEXT:    s_mov_b32 s51, 0x31c16000
-; GFX1032-DPP-NEXT:    s_add_u32 s48, s48, s11
-; GFX1032-DPP-NEXT:    s_addc_u32 s49, s49, 0
-; GFX1032-DPP-NEXT:    s_mov_b64 s[40:41], s[0:1]
+; GFX1032-DPP-NEXT:    s_mov_b32 s64, SCRATCH_RSRC_DWORD0
+; GFX1032-DPP-NEXT:    s_mov_b32 s65, SCRATCH_RSRC_DWORD1
+; GFX1032-DPP-NEXT:    s_mov_b32 s66, -1
+; GFX1032-DPP-NEXT:    s_mov_b32 s67, 0x31c16000
+; GFX1032-DPP-NEXT:    s_add_u32 s64, s64, s11
+; GFX1032-DPP-NEXT:    s_addc_u32 s65, s65, 0
+; GFX1032-DPP-NEXT:    s_mov_b64 s[48:49], s[0:1]
 ; GFX1032-DPP-NEXT:    s_bcnt1_i32_b32 s0, exec_lo
 ; GFX1032-DPP-NEXT:    s_mov_b32 s1, 0x43300000
-; GFX1032-DPP-NEXT:    s_mov_b32 s46, 0
+; GFX1032-DPP-NEXT:    s_mov_b32 s54, 0
 ; GFX1032-DPP-NEXT:    v_add_f64 v[3:4], 0xc3300000, s[0:1]
 ; GFX1032-DPP-NEXT:    s_movk_i32 s32, 0x400
 ; GFX1032-DPP-NEXT:    v_mul_f64 v[41:42], 4.0, v[3:4]
@@ -12508,18 +12508,18 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_defau
 ; GFX1032-DPP-NEXT:    s_and_saveexec_b32 s0, vcc_lo
 ; GFX1032-DPP-NEXT:    s_cbranch_execz .LBB16_3
 ; GFX1032-DPP-NEXT:  ; %bb.1:
-; GFX1032-DPP-NEXT:    s_load_dwordx2 s[44:45], s[4:5], 0x24
+; GFX1032-DPP-NEXT:    s_load_dwordx2 s[52:53], s[4:5], 0x24
 ; GFX1032-DPP-NEXT:    v_lshlrev_b32_e32 v3, 20, v2
 ; GFX1032-DPP-NEXT:    v_lshlrev_b32_e32 v4, 10, v1
 ; GFX1032-DPP-NEXT:    s_mov_b32 s33, s10
-; GFX1032-DPP-NEXT:    s_mov_b32 s42, s9
-; GFX1032-DPP-NEXT:    s_mov_b32 s43, s8
+; GFX1032-DPP-NEXT:    s_mov_b32 s50, s9
+; GFX1032-DPP-NEXT:    s_mov_b32 s51, s8
 ; GFX1032-DPP-NEXT:    s_mov_b64 s[34:35], s[6:7]
 ; GFX1032-DPP-NEXT:    v_or3_b32 v40, v0, v4, v3
 ; GFX1032-DPP-NEXT:    s_mov_b64 s[36:37], s[4:5]
 ; GFX1032-DPP-NEXT:    s_mov_b64 s[38:39], s[2:3]
 ; GFX1032-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT:    s_load_dwordx2 s[0:1], s[44:45], 0x0
+; GFX1032-DPP-NEXT:    s_load_dwordx2 s[0:1], s[52:53], 0x0
 ; GFX1032-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v1, s0
@@ -12532,44 +12532,44 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_defau
 ; GFX1032-DPP-NEXT:    s_getpc_b64 s[0:1]
 ; GFX1032-DPP-NEXT:    s_add_u32 s0, s0, __atomic_compare_exchange at gotpcrel32@lo+4
 ; GFX1032-DPP-NEXT:    s_addc_u32 s1, s1, __atomic_compare_exchange at gotpcrel32@hi+12
-; GFX1032-DPP-NEXT:    buffer_store_dword v2, off, s[48:51], 0 offset:4
-; GFX1032-DPP-NEXT:    buffer_store_dword v1, off, s[48:51], 0
+; GFX1032-DPP-NEXT:    buffer_store_dword v2, off, s[64:67], 0 offset:4
+; GFX1032-DPP-NEXT:    buffer_store_dword v1, off, s[64:67], 0
 ; GFX1032-DPP-NEXT:    s_load_dwordx2 s[16:17], s[0:1], 0x0
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v31, v40
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v0, 8
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v1, 0
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v2, s44
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v2, s52
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v5, 8
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v7, 0
-; GFX1032-DPP-NEXT:    s_mov_b64 s[0:1], s[48:49]
-; GFX1032-DPP-NEXT:    s_mov_b64 s[4:5], s[40:41]
+; GFX1032-DPP-NEXT:    s_mov_b64 s[0:1], s[64:65]
+; GFX1032-DPP-NEXT:    s_mov_b64 s[4:5], s[48:49]
 ; GFX1032-DPP-NEXT:    s_mov_b64 s[6:7], s[38:39]
 ; GFX1032-DPP-NEXT:    s_mov_b64 s[10:11], s[34:35]
-; GFX1032-DPP-NEXT:    s_mov_b32 s12, s43
-; GFX1032-DPP-NEXT:    s_mov_b32 s13, s42
+; GFX1032-DPP-NEXT:    s_mov_b32 s12, s51
+; GFX1032-DPP-NEXT:    s_mov_b32 s13, s50
 ; GFX1032-DPP-NEXT:    s_mov_b32 s14, s33
-; GFX1032-DPP-NEXT:    s_mov_b64 s[2:3], s[50:51]
-; GFX1032-DPP-NEXT:    buffer_store_dword v4, off, s[48:51], 0 offset:12
-; GFX1032-DPP-NEXT:    buffer_store_dword v3, off, s[48:51], 0 offset:8
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v3, s45
+; GFX1032-DPP-NEXT:    s_mov_b64 s[2:3], s[66:67]
+; GFX1032-DPP-NEXT:    buffer_store_dword v4, off, s[64:67], 0 offset:12
+; GFX1032-DPP-NEXT:    buffer_store_dword v3, off, s[64:67], 0 offset:8
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v3, s53
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX1032-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1032-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GFX1032-DPP-NEXT:    s_clause 0x1
-; GFX1032-DPP-NEXT:    buffer_load_dword v1, off, s[48:51], 0
-; GFX1032-DPP-NEXT:    buffer_load_dword v2, off, s[48:51], 0 offset:4
+; GFX1032-DPP-NEXT:    buffer_load_dword v1, off, s[64:67], 0
+; GFX1032-DPP-NEXT:    buffer_load_dword v2, off, s[64:67], 0 offset:4
 ; GFX1032-DPP-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX1032-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX1032-DPP-NEXT:    s_or_b32 s46, vcc_lo, s46
-; GFX1032-DPP-NEXT:    s_andn2_b32 exec_lo, exec_lo, s46
+; GFX1032-DPP-NEXT:    s_or_b32 s54, vcc_lo, s54
+; GFX1032-DPP-NEXT:    s_andn2_b32 exec_lo, exec_lo, s54
 ; GFX1032-DPP-NEXT:    s_cbranch_execnz .LBB16_2
 ; GFX1032-DPP-NEXT:  .LBB16_3:
 ; GFX1032-DPP-NEXT:    s_endpgm
 ;
 ; GFX1164-DPP-LABEL: global_atomic_fadd_double_uni_address_uni_value_default_scope_strictfp:
 ; GFX1164-DPP:       ; %bb.0:
-; GFX1164-DPP-NEXT:    s_mov_b64 s[40:41], s[0:1]
+; GFX1164-DPP-NEXT:    s_mov_b64 s[48:49], s[0:1]
 ; GFX1164-DPP-NEXT:    s_bcnt1_i32_b64 s0, exec
 ; GFX1164-DPP-NEXT:    v_mov_b32_e32 v40, v0
 ; GFX1164-DPP-NEXT:    v_mov_b32_e32 v0, 0x43300000
@@ -12590,16 +12590,16 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_defau
 ; GFX1164-DPP-NEXT:    v_cmpx_eq_u32_e32 0, v0
 ; GFX1164-DPP-NEXT:    s_cbranch_execz .LBB16_3
 ; GFX1164-DPP-NEXT:  ; %bb.1:
-; GFX1164-DPP-NEXT:    s_load_b64 s[44:45], s[4:5], 0x24
+; GFX1164-DPP-NEXT:    s_load_b64 s[52:53], s[4:5], 0x24
 ; GFX1164-DPP-NEXT:    s_mov_b32 s33, s10
-; GFX1164-DPP-NEXT:    s_mov_b32 s42, s9
-; GFX1164-DPP-NEXT:    s_mov_b32 s43, s8
+; GFX1164-DPP-NEXT:    s_mov_b32 s50, s9
+; GFX1164-DPP-NEXT:    s_mov_b32 s51, s8
 ; GFX1164-DPP-NEXT:    s_mov_b64 s[34:35], s[6:7]
 ; GFX1164-DPP-NEXT:    s_mov_b64 s[36:37], s[4:5]
 ; GFX1164-DPP-NEXT:    s_mov_b64 s[38:39], s[2:3]
-; GFX1164-DPP-NEXT:    s_mov_b64 s[46:47], 0
+; GFX1164-DPP-NEXT:    s_mov_b64 s[54:55], 0
 ; GFX1164-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT:    s_load_b64 s[0:1], s[44:45], 0x0
+; GFX1164-DPP-NEXT:    s_load_b64 s[0:1], s[52:53], 0x0
 ; GFX1164-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1164-DPP-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX1164-DPP-NEXT:    v_mov_b32_e32 v1, s0
@@ -12620,18 +12620,18 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_defau
 ; GFX1164-DPP-NEXT:    v_mov_b32_e32 v31, v40
 ; GFX1164-DPP-NEXT:    v_mov_b32_e32 v0, 8
 ; GFX1164-DPP-NEXT:    v_mov_b32_e32 v1, 0
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v2, s44
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v2, s52
 ; GFX1164-DPP-NEXT:    v_mov_b32_e32 v5, 8
 ; GFX1164-DPP-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX1164-DPP-NEXT:    v_mov_b32_e32 v7, 0
-; GFX1164-DPP-NEXT:    s_mov_b64 s[4:5], s[40:41]
+; GFX1164-DPP-NEXT:    s_mov_b64 s[4:5], s[48:49]
 ; GFX1164-DPP-NEXT:    s_mov_b64 s[6:7], s[38:39]
 ; GFX1164-DPP-NEXT:    s_mov_b64 s[10:11], s[34:35]
-; GFX1164-DPP-NEXT:    s_mov_b32 s12, s43
-; GFX1164-DPP-NEXT:    s_mov_b32 s13, s42
+; GFX1164-DPP-NEXT:    s_mov_b32 s12, s51
+; GFX1164-DPP-NEXT:    s_mov_b32 s13, s50
 ; GFX1164-DPP-NEXT:    s_mov_b32 s14, s33
 ; GFX1164-DPP-NEXT:    scratch_store_b64 off, v[3:4], off offset:8
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v3, s45
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v3, s53
 ; GFX1164-DPP-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX1164-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1164-DPP-NEXT:    s_swappc_b64 s[30:31], s[0:1]
@@ -12639,8 +12639,8 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_defau
 ; GFX1164-DPP-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
 ; GFX1164-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX1164-DPP-NEXT:    s_or_b64 s[46:47], vcc, s[46:47]
-; GFX1164-DPP-NEXT:    s_and_not1_b64 exec, exec, s[46:47]
+; GFX1164-DPP-NEXT:    s_or_b64 s[54:55], vcc, s[54:55]
+; GFX1164-DPP-NEXT:    s_and_not1_b64 exec, exec, s[54:55]
 ; GFX1164-DPP-NEXT:    s_cbranch_execnz .LBB16_2
 ; GFX1164-DPP-NEXT:  .LBB16_3:
 ; GFX1164-DPP-NEXT:    s_set_inst_prefetch_distance 0x2
@@ -12648,12 +12648,12 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_defau
 ;
 ; GFX1132-DPP-LABEL: global_atomic_fadd_double_uni_address_uni_value_default_scope_strictfp:
 ; GFX1132-DPP:       ; %bb.0:
-; GFX1132-DPP-NEXT:    s_mov_b64 s[40:41], s[0:1]
+; GFX1132-DPP-NEXT:    s_mov_b64 s[48:49], s[0:1]
 ; GFX1132-DPP-NEXT:    s_bcnt1_i32_b32 s0, exec_lo
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX1132-DPP-NEXT:    v_dual_mov_b32 v40, v0 :: v_dual_mov_b32 v1, s0
 ; GFX1132-DPP-NEXT:    v_mov_b32_e32 v0, 0x43300000
-; GFX1132-DPP-NEXT:    s_mov_b32 s46, 0
+; GFX1132-DPP-NEXT:    s_mov_b32 s54, 0
 ; GFX1132-DPP-NEXT:    s_clause 0x1
 ; GFX1132-DPP-NEXT:    scratch_store_b32 off, v0, off offset:20
 ; GFX1132-DPP-NEXT:    scratch_store_b32 off, v1, off offset:16
@@ -12668,15 +12668,15 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_defau
 ; GFX1132-DPP-NEXT:    v_cmpx_eq_u32_e32 0, v0
 ; GFX1132-DPP-NEXT:    s_cbranch_execz .LBB16_3
 ; GFX1132-DPP-NEXT:  ; %bb.1:
-; GFX1132-DPP-NEXT:    s_load_b64 s[44:45], s[4:5], 0x24
+; GFX1132-DPP-NEXT:    s_load_b64 s[52:53], s[4:5], 0x24
 ; GFX1132-DPP-NEXT:    s_mov_b32 s33, s15
-; GFX1132-DPP-NEXT:    s_mov_b32 s42, s14
-; GFX1132-DPP-NEXT:    s_mov_b32 s43, s13
+; GFX1132-DPP-NEXT:    s_mov_b32 s50, s14
+; GFX1132-DPP-NEXT:    s_mov_b32 s51, s13
 ; GFX1132-DPP-NEXT:    s_mov_b64 s[34:35], s[6:7]
 ; GFX1132-DPP-NEXT:    s_mov_b64 s[36:37], s[4:5]
 ; GFX1132-DPP-NEXT:    s_mov_b64 s[38:39], s[2:3]
 ; GFX1132-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT:    s_load_b64 s[0:1], s[44:45], 0x0
+; GFX1132-DPP-NEXT:    s_load_b64 s[0:1], s[52:53], 0x0
 ; GFX1132-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132-DPP-NEXT:    v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v1, s0
 ; GFX1132-DPP-NEXT:    s_set_inst_prefetch_distance 0x1
@@ -12694,25 +12694,25 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_defau
 ; GFX1132-DPP-NEXT:    scratch_store_b64 off, v[1:2], off
 ; GFX1132-DPP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
 ; GFX1132-DPP-NEXT:    v_dual_mov_b32 v31, v40 :: v_dual_mov_b32 v0, 8
-; GFX1132-DPP-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s44
+; GFX1132-DPP-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s52
 ; GFX1132-DPP-NEXT:    v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0
 ; GFX1132-DPP-NEXT:    v_mov_b32_e32 v7, 0
-; GFX1132-DPP-NEXT:    s_mov_b64 s[4:5], s[40:41]
+; GFX1132-DPP-NEXT:    s_mov_b64 s[4:5], s[48:49]
 ; GFX1132-DPP-NEXT:    s_mov_b64 s[6:7], s[38:39]
 ; GFX1132-DPP-NEXT:    s_mov_b64 s[10:11], s[34:35]
-; GFX1132-DPP-NEXT:    s_mov_b32 s12, s43
-; GFX1132-DPP-NEXT:    s_mov_b32 s13, s42
+; GFX1132-DPP-NEXT:    s_mov_b32 s12, s51
+; GFX1132-DPP-NEXT:    s_mov_b32 s13, s50
 ; GFX1132-DPP-NEXT:    s_mov_b32 s14, s33
 ; GFX1132-DPP-NEXT:    scratch_store_b64 off, v[3:4], off offset:8
-; GFX1132-DPP-NEXT:    v_dual_mov_b32 v3, s45 :: v_dual_mov_b32 v4, 0
+; GFX1132-DPP-NEXT:    v_dual_mov_b32 v3, s53 :: v_dual_mov_b32 v4, 0
 ; GFX1132-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132-DPP-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX1132-DPP-NEXT:    scratch_load_b64 v[1:2], off, off
 ; GFX1132-DPP-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
 ; GFX1132-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX1132-DPP-NEXT:    s_or_b32 s46, vcc_lo, s46
-; GFX1132-DPP-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s46
+; GFX1132-DPP-NEXT:    s_or_b32 s54, vcc_lo, s54
+; GFX1132-DPP-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s54
 ; GFX1132-DPP-NEXT:    s_cbranch_execnz .LBB16_2
 ; GFX1132-DPP-NEXT:  .LBB16_3:
 ; GFX1132-DPP-NEXT:    s_set_inst_prefetch_distance 0x2
@@ -12725,19 +12725,19 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau
 ; GFX7LESS-LABEL: global_atomic_fadd_double_uni_address_div_value_default_scope_strictfp:
 ; GFX7LESS:       ; %bb.0:
 ; GFX7LESS-NEXT:    s_movk_i32 s32, 0x800
-; GFX7LESS-NEXT:    s_mov_b32 s48, SCRATCH_RSRC_DWORD0
-; GFX7LESS-NEXT:    s_mov_b32 s49, SCRATCH_RSRC_DWORD1
-; GFX7LESS-NEXT:    s_mov_b32 s50, -1
-; GFX7LESS-NEXT:    s_mov_b32 s51, 0xe8f000
-; GFX7LESS-NEXT:    s_add_u32 s48, s48, s11
-; GFX7LESS-NEXT:    s_addc_u32 s49, s49, 0
+; GFX7LESS-NEXT:    s_mov_b32 s64, SCRATCH_RSRC_DWORD0
+; GFX7LESS-NEXT:    s_mov_b32 s65, SCRATCH_RSRC_DWORD1
+; GFX7LESS-NEXT:    s_mov_b32 s66, -1
+; GFX7LESS-NEXT:    s_mov_b32 s67, 0xe8f000
+; GFX7LESS-NEXT:    s_add_u32 s64, s64, s11
+; GFX7LESS-NEXT:    s_addc_u32 s65, s65, 0
 ; GFX7LESS-NEXT:    s_mov_b32 s33, s10
-; GFX7LESS-NEXT:    s_mov_b32 s42, s9
-; GFX7LESS-NEXT:    s_mov_b32 s43, s8
+; GFX7LESS-NEXT:    s_mov_b32 s50, s9
+; GFX7LESS-NEXT:    s_mov_b32 s51, s8
 ; GFX7LESS-NEXT:    s_mov_b64 s[34:35], s[6:7]
 ; GFX7LESS-NEXT:    s_mov_b64 s[36:37], s[4:5]
 ; GFX7LESS-NEXT:    s_mov_b64 s[38:39], s[2:3]
-; GFX7LESS-NEXT:    s_mov_b64 s[40:41], s[0:1]
+; GFX7LESS-NEXT:    s_mov_b64 s[48:49], s[0:1]
 ; GFX7LESS-NEXT:    s_add_u32 s8, s36, 44
 ; GFX7LESS-NEXT:    s_addc_u32 s9, s37, 0
 ; GFX7LESS-NEXT:    s_getpc_b64 s[0:1]
@@ -12748,15 +12748,15 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau
 ; GFX7LESS-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
 ; GFX7LESS-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GFX7LESS-NEXT:    v_or_b32_e32 v40, v0, v2
-; GFX7LESS-NEXT:    s_mov_b64 s[4:5], s[40:41]
+; GFX7LESS-NEXT:    s_mov_b64 s[4:5], s[48:49]
 ; GFX7LESS-NEXT:    s_mov_b64 s[6:7], s[2:3]
 ; GFX7LESS-NEXT:    s_mov_b64 s[10:11], s[34:35]
-; GFX7LESS-NEXT:    s_mov_b32 s12, s43
-; GFX7LESS-NEXT:    s_mov_b32 s13, s42
+; GFX7LESS-NEXT:    s_mov_b32 s12, s51
+; GFX7LESS-NEXT:    s_mov_b32 s13, s50
 ; GFX7LESS-NEXT:    s_mov_b32 s14, s33
 ; GFX7LESS-NEXT:    v_mov_b32_e32 v31, v40
-; GFX7LESS-NEXT:    s_mov_b64 s[0:1], s[48:49]
-; GFX7LESS-NEXT:    s_mov_b64 s[2:3], s[50:51]
+; GFX7LESS-NEXT:    s_mov_b64 s[0:1], s[64:65]
+; GFX7LESS-NEXT:    s_mov_b64 s[2:3], s[66:67]
 ; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7LESS-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GFX7LESS-NEXT:    s_mov_b64 s[0:1], exec
@@ -12781,21 +12781,21 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau
 ; GFX7LESS-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
 ; GFX7LESS-NEXT:    s_cbranch_execz .LBB17_5
 ; GFX7LESS-NEXT:  ; %bb.3:
-; GFX7LESS-NEXT:    s_load_dwordx2 s[44:45], s[36:37], 0x9
-; GFX7LESS-NEXT:    s_mov_b32 s47, 0xf000
-; GFX7LESS-NEXT:    s_mov_b32 s46, -1
+; GFX7LESS-NEXT:    s_load_dwordx2 s[52:53], s[36:37], 0x9
+; GFX7LESS-NEXT:    s_mov_b32 s55, 0xf000
+; GFX7LESS-NEXT:    s_mov_b32 s54, -1
 ; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7LESS-NEXT:    buffer_load_dwordx2 v[0:1], off, s[44:47], 0
-; GFX7LESS-NEXT:    s_mov_b64 s[46:47], 0
+; GFX7LESS-NEXT:    buffer_load_dwordx2 v[0:1], off, s[52:55], 0
+; GFX7LESS-NEXT:    s_mov_b64 s[54:55], 0
 ; GFX7LESS-NEXT:  .LBB17_4: ; %atomicrmw.start
 ; GFX7LESS-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX7LESS-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7LESS-NEXT:    v_add_f64 v[2:3], v[0:1], v[41:42]
-; GFX7LESS-NEXT:    buffer_store_dword v1, off, s[48:51], 0 offset:4
-; GFX7LESS-NEXT:    buffer_store_dword v0, off, s[48:51], 0
+; GFX7LESS-NEXT:    buffer_store_dword v1, off, s[64:67], 0 offset:4
+; GFX7LESS-NEXT:    buffer_store_dword v0, off, s[64:67], 0
 ; GFX7LESS-NEXT:    s_add_u32 s8, s36, 44
-; GFX7LESS-NEXT:    buffer_store_dword v3, off, s[48:51], 0 offset:12
-; GFX7LESS-NEXT:    buffer_store_dword v2, off, s[48:51], 0 offset:8
+; GFX7LESS-NEXT:    buffer_store_dword v3, off, s[64:67], 0 offset:12
+; GFX7LESS-NEXT:    buffer_store_dword v2, off, s[64:67], 0 offset:8
 ; GFX7LESS-NEXT:    s_addc_u32 s9, s37, 0
 ; GFX7LESS-NEXT:    s_getpc_b64 s[0:1]
 ; GFX7LESS-NEXT:    s_add_u32 s0, s0, __atomic_compare_exchange at gotpcrel32@lo+4
@@ -12808,44 +12808,44 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau
 ; GFX7LESS-NEXT:    v_mov_b32_e32 v5, 8
 ; GFX7LESS-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX7LESS-NEXT:    v_mov_b32_e32 v7, 0
-; GFX7LESS-NEXT:    s_mov_b64 s[4:5], s[40:41]
+; GFX7LESS-NEXT:    s_mov_b64 s[4:5], s[48:49]
 ; GFX7LESS-NEXT:    s_mov_b64 s[6:7], s[38:39]
 ; GFX7LESS-NEXT:    s_mov_b64 s[10:11], s[34:35]
-; GFX7LESS-NEXT:    s_mov_b32 s12, s43
-; GFX7LESS-NEXT:    s_mov_b32 s13, s42
+; GFX7LESS-NEXT:    s_mov_b32 s12, s51
+; GFX7LESS-NEXT:    s_mov_b32 s13, s50
 ; GFX7LESS-NEXT:    s_mov_b32 s14, s33
 ; GFX7LESS-NEXT:    v_mov_b32_e32 v31, v40
-; GFX7LESS-NEXT:    s_mov_b64 s[0:1], s[48:49]
-; GFX7LESS-NEXT:    s_mov_b64 s[2:3], s[50:51]
+; GFX7LESS-NEXT:    s_mov_b64 s[0:1], s[64:65]
+; GFX7LESS-NEXT:    s_mov_b64 s[2:3], s[66:67]
 ; GFX7LESS-NEXT:    s_waitcnt expcnt(0)
-; GFX7LESS-NEXT:    v_mov_b32_e32 v2, s44
-; GFX7LESS-NEXT:    v_mov_b32_e32 v3, s45
+; GFX7LESS-NEXT:    v_mov_b32_e32 v2, s52
+; GFX7LESS-NEXT:    v_mov_b32_e32 v3, s53
 ; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7LESS-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GFX7LESS-NEXT:    v_and_b32_e32 v2, 1, v0
-; GFX7LESS-NEXT:    buffer_load_dword v0, off, s[48:51], 0
-; GFX7LESS-NEXT:    buffer_load_dword v1, off, s[48:51], 0 offset:4
+; GFX7LESS-NEXT:    buffer_load_dword v0, off, s[64:67], 0
+; GFX7LESS-NEXT:    buffer_load_dword v1, off, s[64:67], 0 offset:4
 ; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v2
-; GFX7LESS-NEXT:    s_or_b64 s[46:47], vcc, s[46:47]
-; GFX7LESS-NEXT:    s_andn2_b64 exec, exec, s[46:47]
+; GFX7LESS-NEXT:    s_or_b64 s[54:55], vcc, s[54:55]
+; GFX7LESS-NEXT:    s_andn2_b64 exec, exec, s[54:55]
 ; GFX7LESS-NEXT:    s_cbranch_execnz .LBB17_4
 ; GFX7LESS-NEXT:  .LBB17_5:
 ; GFX7LESS-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: global_atomic_fadd_double_uni_address_div_value_default_scope_strictfp:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_mov_b32 s48, SCRATCH_RSRC_DWORD0
-; GFX9-NEXT:    s_mov_b32 s49, SCRATCH_RSRC_DWORD1
-; GFX9-NEXT:    s_mov_b32 s50, -1
-; GFX9-NEXT:    s_mov_b32 s51, 0xe00000
-; GFX9-NEXT:    s_add_u32 s48, s48, s11
-; GFX9-NEXT:    s_addc_u32 s49, s49, 0
+; GFX9-NEXT:    s_mov_b32 s64, SCRATCH_RSRC_DWORD0
+; GFX9-NEXT:    s_mov_b32 s65, SCRATCH_RSRC_DWORD1
+; GFX9-NEXT:    s_mov_b32 s66, -1
+; GFX9-NEXT:    s_mov_b32 s67, 0xe00000
+; GFX9-NEXT:    s_add_u32 s64, s64, s11
+; GFX9-NEXT:    s_addc_u32 s65, s65, 0
 ; GFX9-NEXT:    s_mov_b64 s[36:37], s[4:5]
-; GFX9-NEXT:    s_mov_b32 s43, s8
+; GFX9-NEXT:    s_mov_b32 s51, s8
 ; GFX9-NEXT:    s_add_u32 s8, s36, 44
-; GFX9-NEXT:    s_mov_b32 s42, s9
+; GFX9-NEXT:    s_mov_b32 s50, s9
 ; GFX9-NEXT:    s_addc_u32 s9, s37, 0
-; GFX9-NEXT:    s_mov_b64 s[40:41], s[0:1]
+; GFX9-NEXT:    s_mov_b64 s[48:49], s[0:1]
 ; GFX9-NEXT:    s_getpc_b64 s[0:1]
 ; GFX9-NEXT:    s_add_u32 s0, s0, div.float.value at gotpcrel32@lo+4
 ; GFX9-NEXT:    s_addc_u32 s1, s1, div.float.value at gotpcrel32@hi+12
@@ -12857,14 +12857,14 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau
 ; GFX9-NEXT:    s_mov_b64 s[38:39], s[2:3]
 ; GFX9-NEXT:    v_or3_b32 v40, v0, v1, v2
 ; GFX9-NEXT:    s_mov_b64 s[6:7], s[2:3]
-; GFX9-NEXT:    s_mov_b64 s[0:1], s[48:49]
-; GFX9-NEXT:    s_mov_b64 s[4:5], s[40:41]
+; GFX9-NEXT:    s_mov_b64 s[0:1], s[64:65]
+; GFX9-NEXT:    s_mov_b64 s[4:5], s[48:49]
 ; GFX9-NEXT:    s_mov_b64 s[10:11], s[34:35]
-; GFX9-NEXT:    s_mov_b32 s12, s43
-; GFX9-NEXT:    s_mov_b32 s13, s42
+; GFX9-NEXT:    s_mov_b32 s12, s51
+; GFX9-NEXT:    s_mov_b32 s13, s50
 ; GFX9-NEXT:    s_mov_b32 s14, s33
 ; GFX9-NEXT:    v_mov_b32_e32 v31, v40
-; GFX9-NEXT:    s_mov_b64 s[2:3], s[50:51]
+; GFX9-NEXT:    s_mov_b64 s[2:3], s[66:67]
 ; GFX9-NEXT:    s_movk_i32 s32, 0x800
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[16:17]
@@ -12889,11 +12889,11 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau
 ; GFX9-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
 ; GFX9-NEXT:    s_cbranch_execz .LBB17_5
 ; GFX9-NEXT:  ; %bb.3:
-; GFX9-NEXT:    s_load_dwordx2 s[44:45], s[36:37], 0x24
+; GFX9-NEXT:    s_load_dwordx2 s[52:53], s[36:37], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
-; GFX9-NEXT:    s_mov_b64 s[46:47], 0
+; GFX9-NEXT:    s_mov_b64 s[54:55], 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dwordx2 v[1:2], v0, s[44:45]
+; GFX9-NEXT:    global_load_dwordx2 v[1:2], v0, s[52:53]
 ; GFX9-NEXT:  .LBB17_4: ; %atomicrmw.start
 ; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
@@ -12904,53 +12904,53 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau
 ; GFX9-NEXT:    s_add_u32 s0, s0, __atomic_compare_exchange at gotpcrel32@lo+4
 ; GFX9-NEXT:    s_addc_u32 s1, s1, __atomic_compare_exchange at gotpcrel32@hi+12
 ; GFX9-NEXT:    s_load_dwordx2 s[16:17], s[0:1], 0x0
-; GFX9-NEXT:    s_mov_b64 s[0:1], s[48:49]
-; GFX9-NEXT:    buffer_store_dword v2, off, s[48:51], 0 offset:4
-; GFX9-NEXT:    buffer_store_dword v1, off, s[48:51], 0
-; GFX9-NEXT:    s_mov_b64 s[4:5], s[40:41]
-; GFX9-NEXT:    buffer_store_dword v4, off, s[48:51], 0 offset:12
-; GFX9-NEXT:    buffer_store_dword v3, off, s[48:51], 0 offset:8
+; GFX9-NEXT:    s_mov_b64 s[0:1], s[64:65]
+; GFX9-NEXT:    buffer_store_dword v2, off, s[64:67], 0 offset:4
+; GFX9-NEXT:    buffer_store_dword v1, off, s[64:67], 0
+; GFX9-NEXT:    s_mov_b64 s[4:5], s[48:49]
+; GFX9-NEXT:    buffer_store_dword v4, off, s[64:67], 0 offset:12
+; GFX9-NEXT:    buffer_store_dword v3, off, s[64:67], 0 offset:8
 ; GFX9-NEXT:    s_mov_b64 s[6:7], s[38:39]
 ; GFX9-NEXT:    s_mov_b64 s[10:11], s[34:35]
-; GFX9-NEXT:    s_mov_b32 s12, s43
-; GFX9-NEXT:    s_mov_b32 s13, s42
+; GFX9-NEXT:    s_mov_b32 s12, s51
+; GFX9-NEXT:    s_mov_b32 s13, s50
 ; GFX9-NEXT:    s_mov_b32 s14, s33
 ; GFX9-NEXT:    v_mov_b32_e32 v31, v40
-; GFX9-NEXT:    s_mov_b64 s[2:3], s[50:51]
+; GFX9-NEXT:    s_mov_b64 s[2:3], s[66:67]
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 8
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0
-; GFX9-NEXT:    v_mov_b32_e32 v2, s44
-; GFX9-NEXT:    v_mov_b32_e32 v3, s45
+; GFX9-NEXT:    v_mov_b32_e32 v2, s52
+; GFX9-NEXT:    v_mov_b32_e32 v3, s53
 ; GFX9-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v5, 8
 ; GFX9-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v7, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[16:17]
-; GFX9-NEXT:    buffer_load_dword v1, off, s[48:51], 0
-; GFX9-NEXT:    buffer_load_dword v2, off, s[48:51], 0 offset:4
+; GFX9-NEXT:    buffer_load_dword v1, off, s[64:67], 0
+; GFX9-NEXT:    buffer_load_dword v2, off, s[64:67], 0 offset:4
 ; GFX9-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX9-NEXT:    s_or_b64 s[46:47], vcc, s[46:47]
-; GFX9-NEXT:    s_andn2_b64 exec, exec, s[46:47]
+; GFX9-NEXT:    s_or_b64 s[54:55], vcc, s[54:55]
+; GFX9-NEXT:    s_andn2_b64 exec, exec, s[54:55]
 ; GFX9-NEXT:    s_cbranch_execnz .LBB17_4
 ; GFX9-NEXT:  .LBB17_5:
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX1064-LABEL: global_atomic_fadd_double_uni_address_div_value_default_scope_strictfp:
 ; GFX1064:       ; %bb.0:
-; GFX1064-NEXT:    s_mov_b32 s48, SCRATCH_RSRC_DWORD0
-; GFX1064-NEXT:    s_mov_b32 s49, SCRATCH_RSRC_DWORD1
-; GFX1064-NEXT:    s_mov_b32 s50, -1
-; GFX1064-NEXT:    s_mov_b32 s51, 0x31e16000
-; GFX1064-NEXT:    s_add_u32 s48, s48, s11
+; GFX1064-NEXT:    s_mov_b32 s64, SCRATCH_RSRC_DWORD0
+; GFX1064-NEXT:    s_mov_b32 s65, SCRATCH_RSRC_DWORD1
+; GFX1064-NEXT:    s_mov_b32 s66, -1
+; GFX1064-NEXT:    s_mov_b32 s67, 0x31e16000
+; GFX1064-NEXT:    s_add_u32 s64, s64, s11
 ; GFX1064-NEXT:    s_mov_b64 s[34:35], s[4:5]
-; GFX1064-NEXT:    s_addc_u32 s49, s49, 0
-; GFX1064-NEXT:    s_mov_b32 s43, s8
+; GFX1064-NEXT:    s_addc_u32 s65, s65, 0
+; GFX1064-NEXT:    s_mov_b32 s51, s8
 ; GFX1064-NEXT:    s_add_u32 s8, s34, 44
-; GFX1064-NEXT:    s_mov_b32 s42, s9
+; GFX1064-NEXT:    s_mov_b32 s50, s9
 ; GFX1064-NEXT:    s_addc_u32 s9, s35, 0
-; GFX1064-NEXT:    s_mov_b64 s[40:41], s[0:1]
+; GFX1064-NEXT:    s_mov_b64 s[48:49], s[0:1]
 ; GFX1064-NEXT:    s_getpc_b64 s[0:1]
 ; GFX1064-NEXT:    s_add_u32 s0, s0, div.float.value at gotpcrel32@lo+4
 ; GFX1064-NEXT:    s_addc_u32 s1, s1, div.float.value at gotpcrel32@hi+12
@@ -12962,14 +12962,14 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau
 ; GFX1064-NEXT:    s_mov_b64 s[38:39], s[2:3]
 ; GFX1064-NEXT:    s_mov_b64 s[6:7], s[2:3]
 ; GFX1064-NEXT:    v_or3_b32 v40, v0, v1, v2
-; GFX1064-NEXT:    s_mov_b64 s[0:1], s[48:49]
-; GFX1064-NEXT:    s_mov_b64 s[4:5], s[40:41]
+; GFX1064-NEXT:    s_mov_b64 s[0:1], s[64:65]
+; GFX1064-NEXT:    s_mov_b64 s[4:5], s[48:49]
 ; GFX1064-NEXT:    s_mov_b64 s[10:11], s[36:37]
-; GFX1064-NEXT:    s_mov_b32 s12, s43
+; GFX1064-NEXT:    s_mov_b32 s12, s51
 ; GFX1064-NEXT:    v_mov_b32_e32 v31, v40
-; GFX1064-NEXT:    s_mov_b32 s13, s42
+; GFX1064-NEXT:    s_mov_b32 s13, s50
 ; GFX1064-NEXT:    s_mov_b32 s14, s33
-; GFX1064-NEXT:    s_mov_b64 s[2:3], s[50:51]
+; GFX1064-NEXT:    s_mov_b64 s[2:3], s[66:67]
 ; GFX1064-NEXT:    s_movk_i32 s32, 0x800
 ; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1064-NEXT:    s_swappc_b64 s[30:31], s[16:17]
@@ -12994,11 +12994,11 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau
 ; GFX1064-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
 ; GFX1064-NEXT:    s_cbranch_execz .LBB17_5
 ; GFX1064-NEXT:  ; %bb.3:
-; GFX1064-NEXT:    s_load_dwordx2 s[44:45], s[34:35], 0x24
+; GFX1064-NEXT:    s_load_dwordx2 s[52:53], s[34:35], 0x24
 ; GFX1064-NEXT:    v_mov_b32_e32 v0, 0
-; GFX1064-NEXT:    s_mov_b64 s[46:47], 0
+; GFX1064-NEXT:    s_mov_b64 s[54:55], 0
 ; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT:    global_load_dwordx2 v[1:2], v0, s[44:45]
+; GFX1064-NEXT:    global_load_dwordx2 v[1:2], v0, s[52:53]
 ; GFX1064-NEXT:  .LBB17_4: ; %atomicrmw.start
 ; GFX1064-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1064-NEXT:    s_waitcnt vmcnt(0)
@@ -13008,55 +13008,55 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau
 ; GFX1064-NEXT:    s_getpc_b64 s[0:1]
 ; GFX1064-NEXT:    s_add_u32 s0, s0, __atomic_compare_exchange at gotpcrel32@lo+4
 ; GFX1064-NEXT:    s_addc_u32 s1, s1, __atomic_compare_exchange at gotpcrel32@hi+12
-; GFX1064-NEXT:    buffer_store_dword v2, off, s[48:51], 0 offset:4
-; GFX1064-NEXT:    buffer_store_dword v1, off, s[48:51], 0
+; GFX1064-NEXT:    buffer_store_dword v2, off, s[64:67], 0 offset:4
+; GFX1064-NEXT:    buffer_store_dword v1, off, s[64:67], 0
 ; GFX1064-NEXT:    s_load_dwordx2 s[16:17], s[0:1], 0x0
 ; GFX1064-NEXT:    v_mov_b32_e32 v31, v40
 ; GFX1064-NEXT:    v_mov_b32_e32 v0, 8
 ; GFX1064-NEXT:    v_mov_b32_e32 v1, 0
-; GFX1064-NEXT:    v_mov_b32_e32 v2, s44
+; GFX1064-NEXT:    v_mov_b32_e32 v2, s52
 ; GFX1064-NEXT:    v_mov_b32_e32 v5, 8
 ; GFX1064-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX1064-NEXT:    v_mov_b32_e32 v7, 0
-; GFX1064-NEXT:    s_mov_b64 s[0:1], s[48:49]
-; GFX1064-NEXT:    s_mov_b64 s[4:5], s[40:41]
+; GFX1064-NEXT:    s_mov_b64 s[0:1], s[64:65]
+; GFX1064-NEXT:    s_mov_b64 s[4:5], s[48:49]
 ; GFX1064-NEXT:    s_mov_b64 s[6:7], s[38:39]
 ; GFX1064-NEXT:    s_mov_b64 s[10:11], s[36:37]
-; GFX1064-NEXT:    s_mov_b32 s12, s43
-; GFX1064-NEXT:    s_mov_b32 s13, s42
+; GFX1064-NEXT:    s_mov_b32 s12, s51
+; GFX1064-NEXT:    s_mov_b32 s13, s50
 ; GFX1064-NEXT:    s_mov_b32 s14, s33
-; GFX1064-NEXT:    s_mov_b64 s[2:3], s[50:51]
-; GFX1064-NEXT:    buffer_store_dword v4, off, s[48:51], 0 offset:12
-; GFX1064-NEXT:    buffer_store_dword v3, off, s[48:51], 0 offset:8
-; GFX1064-NEXT:    v_mov_b32_e32 v3, s45
+; GFX1064-NEXT:    s_mov_b64 s[2:3], s[66:67]
+; GFX1064-NEXT:    buffer_store_dword v4, off, s[64:67], 0 offset:12
+; GFX1064-NEXT:    buffer_store_dword v3, off, s[64:67], 0 offset:8
+; GFX1064-NEXT:    v_mov_b32_e32 v3, s53
 ; GFX1064-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1064-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GFX1064-NEXT:    s_clause 0x1
-; GFX1064-NEXT:    buffer_load_dword v1, off, s[48:51], 0
-; GFX1064-NEXT:    buffer_load_dword v2, off, s[48:51], 0 offset:4
+; GFX1064-NEXT:    buffer_load_dword v1, off, s[64:67], 0
+; GFX1064-NEXT:    buffer_load_dword v2, off, s[64:67], 0 offset:4
 ; GFX1064-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX1064-NEXT:    s_or_b64 s[46:47], vcc, s[46:47]
-; GFX1064-NEXT:    s_andn2_b64 exec, exec, s[46:47]
+; GFX1064-NEXT:    s_or_b64 s[54:55], vcc, s[54:55]
+; GFX1064-NEXT:    s_andn2_b64 exec, exec, s[54:55]
 ; GFX1064-NEXT:    s_cbranch_execnz .LBB17_4
 ; GFX1064-NEXT:  .LBB17_5:
 ; GFX1064-NEXT:    s_endpgm
 ;
 ; GFX1032-LABEL: global_atomic_fadd_double_uni_address_div_value_default_scope_strictfp:
 ; GFX1032:       ; %bb.0:
-; GFX1032-NEXT:    s_mov_b32 s48, SCRATCH_RSRC_DWORD0
-; GFX1032-NEXT:    s_mov_b32 s49, SCRATCH_RSRC_DWORD1
-; GFX1032-NEXT:    s_mov_b32 s50, -1
-; GFX1032-NEXT:    s_mov_b32 s51, 0x31c16000
-; GFX1032-NEXT:    s_add_u32 s48, s48, s11
+; GFX1032-NEXT:    s_mov_b32 s64, SCRATCH_RSRC_DWORD0
+; GFX1032-NEXT:    s_mov_b32 s65, SCRATCH_RSRC_DWORD1
+; GFX1032-NEXT:    s_mov_b32 s66, -1
+; GFX1032-NEXT:    s_mov_b32 s67, 0x31c16000
+; GFX1032-NEXT:    s_add_u32 s64, s64, s11
 ; GFX1032-NEXT:    s_mov_b64 s[34:35], s[4:5]
-; GFX1032-NEXT:    s_addc_u32 s49, s49, 0
-; GFX1032-NEXT:    s_mov_b32 s43, s8
+; GFX1032-NEXT:    s_addc_u32 s65, s65, 0
+; GFX1032-NEXT:    s_mov_b32 s51, s8
 ; GFX1032-NEXT:    s_add_u32 s8, s34, 44
-; GFX1032-NEXT:    s_mov_b32 s42, s9
+; GFX1032-NEXT:    s_mov_b32 s50, s9
 ; GFX1032-NEXT:    s_addc_u32 s9, s35, 0
-; GFX1032-NEXT:    s_mov_b64 s[40:41], s[0:1]
+; GFX1032-NEXT:    s_mov_b64 s[48:49], s[0:1]
 ; GFX1032-NEXT:    s_getpc_b64 s[0:1]
 ; GFX1032-NEXT:    s_add_u32 s0, s0, div.float.value at gotpcrel32@lo+4
 ; GFX1032-NEXT:    s_addc_u32 s1, s1, div.float.value at gotpcrel32@hi+12
@@ -13068,14 +13068,14 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau
 ; GFX1032-NEXT:    s_mov_b64 s[38:39], s[2:3]
 ; GFX1032-NEXT:    s_mov_b64 s[6:7], s[2:3]
 ; GFX1032-NEXT:    v_or3_b32 v40, v0, v1, v2
-; GFX1032-NEXT:    s_mov_b64 s[0:1], s[48:49]
-; GFX1032-NEXT:    s_mov_b64 s[4:5], s[40:41]
+; GFX1032-NEXT:    s_mov_b64 s[0:1], s[64:65]
+; GFX1032-NEXT:    s_mov_b64 s[4:5], s[48:49]
 ; GFX1032-NEXT:    s_mov_b64 s[10:11], s[36:37]
-; GFX1032-NEXT:    s_mov_b32 s12, s43
+; GFX1032-NEXT:    s_mov_b32 s12, s51
 ; GFX1032-NEXT:    v_mov_b32_e32 v31, v40
-; GFX1032-NEXT:    s_mov_b32 s13, s42
+; GFX1032-NEXT:    s_mov_b32 s13, s50
 ; GFX1032-NEXT:    s_mov_b32 s14, s33
-; GFX1032-NEXT:    s_mov_b64 s[2:3], s[50:51]
+; GFX1032-NEXT:    s_mov_b64 s[2:3], s[66:67]
 ; GFX1032-NEXT:    s_movk_i32 s32, 0x400
 ; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1032-NEXT:    s_swappc_b64 s[30:31], s[16:17]
@@ -13094,16 +13094,16 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau
 ; GFX1032-NEXT:    s_cbranch_scc1 .LBB17_1
 ; GFX1032-NEXT:  ; %bb.2: ; %ComputeEnd
 ; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1032-NEXT:    s_mov_b32 s46, 0
+; GFX1032-NEXT:    s_mov_b32 s54, 0
 ; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX1032-NEXT:    s_and_saveexec_b32 s0, vcc_lo
 ; GFX1032-NEXT:    s_xor_b32 s0, exec_lo, s0
 ; GFX1032-NEXT:    s_cbranch_execz .LBB17_5
 ; GFX1032-NEXT:  ; %bb.3:
-; GFX1032-NEXT:    s_load_dwordx2 s[44:45], s[34:35], 0x24
+; GFX1032-NEXT:    s_load_dwordx2 s[52:53], s[34:35], 0x24
 ; GFX1032-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT:    global_load_dwordx2 v[1:2], v0, s[44:45]
+; GFX1032-NEXT:    global_load_dwordx2 v[1:2], v0, s[52:53]
 ; GFX1032-NEXT:  .LBB17_4: ; %atomicrmw.start
 ; GFX1032-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1032-NEXT:    s_waitcnt vmcnt(0)
@@ -13113,37 +13113,37 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau
 ; GFX1032-NEXT:    s_getpc_b64 s[0:1]
 ; GFX1032-NEXT:    s_add_u32 s0, s0, __atomic_compare_exchange at gotpcrel32@lo+4
 ; GFX1032-NEXT:    s_addc_u32 s1, s1, __atomic_compare_exchange at gotpcrel32@hi+12
-; GFX1032-NEXT:    buffer_store_dword v2, off, s[48:51], 0 offset:4
-; GFX1032-NEXT:    buffer_store_dword v1, off, s[48:51], 0
+; GFX1032-NEXT:    buffer_store_dword v2, off, s[64:67], 0 offset:4
+; GFX1032-NEXT:    buffer_store_dword v1, off, s[64:67], 0
 ; GFX1032-NEXT:    s_load_dwordx2 s[16:17], s[0:1], 0x0
 ; GFX1032-NEXT:    v_mov_b32_e32 v31, v40
 ; GFX1032-NEXT:    v_mov_b32_e32 v0, 8
 ; GFX1032-NEXT:    v_mov_b32_e32 v1, 0
-; GFX1032-NEXT:    v_mov_b32_e32 v2, s44
+; GFX1032-NEXT:    v_mov_b32_e32 v2, s52
 ; GFX1032-NEXT:    v_mov_b32_e32 v5, 8
 ; GFX1032-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX1032-NEXT:    v_mov_b32_e32 v7, 0
-; GFX1032-NEXT:    s_mov_b64 s[0:1], s[48:49]
-; GFX1032-NEXT:    s_mov_b64 s[4:5], s[40:41]
+; GFX1032-NEXT:    s_mov_b64 s[0:1], s[64:65]
+; GFX1032-NEXT:    s_mov_b64 s[4:5], s[48:49]
 ; GFX1032-NEXT:    s_mov_b64 s[6:7], s[38:39]
 ; GFX1032-NEXT:    s_mov_b64 s[10:11], s[36:37]
-; GFX1032-NEXT:    s_mov_b32 s12, s43
-; GFX1032-NEXT:    s_mov_b32 s13, s42
+; GFX1032-NEXT:    s_mov_b32 s12, s51
+; GFX1032-NEXT:    s_mov_b32 s13, s50
 ; GFX1032-NEXT:    s_mov_b32 s14, s33
-; GFX1032-NEXT:    s_mov_b64 s[2:3], s[50:51]
-; GFX1032-NEXT:    buffer_store_dword v4, off, s[48:51], 0 offset:12
-; GFX1032-NEXT:    buffer_store_dword v3, off, s[48:51], 0 offset:8
-; GFX1032-NEXT:    v_mov_b32_e32 v3, s45
+; GFX1032-NEXT:    s_mov_b64 s[2:3], s[66:67]
+; GFX1032-NEXT:    buffer_store_dword v4, off, s[64:67], 0 offset:12
+; GFX1032-NEXT:    buffer_store_dword v3, off, s[64:67], 0 offset:8
+; GFX1032-NEXT:    v_mov_b32_e32 v3, s53
 ; GFX1032-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1032-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GFX1032-NEXT:    s_clause 0x1
-; GFX1032-NEXT:    buffer_load_dword v1, off, s[48:51], 0
-; GFX1032-NEXT:    buffer_load_dword v2, off, s[48:51], 0 offset:4
+; GFX1032-NEXT:    buffer_load_dword v1, off, s[64:67], 0
+; GFX1032-NEXT:    buffer_load_dword v2, off, s[64:67], 0 offset:4
 ; GFX1032-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX1032-NEXT:    s_or_b32 s46, vcc_lo, s46
-; GFX1032-NEXT:    s_andn2_b32 exec_lo, exec_lo, s46
+; GFX1032-NEXT:    s_or_b32 s54, vcc_lo, s54
+; GFX1032-NEXT:    s_andn2_b32 exec_lo, exec_lo, s54
 ; GFX1032-NEXT:    s_cbranch_execnz .LBB17_4
 ; GFX1032-NEXT:  .LBB17_5:
 ; GFX1032-NEXT:    s_endpgm
@@ -13151,11 +13151,11 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau
 ; GFX1164-LABEL: global_atomic_fadd_double_uni_address_div_value_default_scope_strictfp:
 ; GFX1164:       ; %bb.0:
 ; GFX1164-NEXT:    s_mov_b64 s[34:35], s[4:5]
-; GFX1164-NEXT:    s_mov_b32 s43, s8
+; GFX1164-NEXT:    s_mov_b32 s51, s8
 ; GFX1164-NEXT:    s_add_u32 s8, s34, 44
-; GFX1164-NEXT:    s_mov_b32 s42, s9
+; GFX1164-NEXT:    s_mov_b32 s50, s9
 ; GFX1164-NEXT:    s_addc_u32 s9, s35, 0
-; GFX1164-NEXT:    s_mov_b64 s[40:41], s[0:1]
+; GFX1164-NEXT:    s_mov_b64 s[48:49], s[0:1]
 ; GFX1164-NEXT:    s_getpc_b64 s[0:1]
 ; GFX1164-NEXT:    s_add_u32 s0, s0, div.float.value at gotpcrel32@lo+4
 ; GFX1164-NEXT:    s_addc_u32 s1, s1, div.float.value at gotpcrel32@hi+12
@@ -13163,11 +13163,11 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau
 ; GFX1164-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
 ; GFX1164-NEXT:    s_mov_b32 s33, s10
 ; GFX1164-NEXT:    s_mov_b64 s[36:37], s[6:7]
-; GFX1164-NEXT:    s_mov_b64 s[4:5], s[40:41]
+; GFX1164-NEXT:    s_mov_b64 s[4:5], s[48:49]
 ; GFX1164-NEXT:    s_mov_b64 s[6:7], s[2:3]
 ; GFX1164-NEXT:    s_mov_b64 s[10:11], s[36:37]
-; GFX1164-NEXT:    s_mov_b32 s12, s43
-; GFX1164-NEXT:    s_mov_b32 s13, s42
+; GFX1164-NEXT:    s_mov_b32 s12, s51
+; GFX1164-NEXT:    s_mov_b32 s13, s50
 ; GFX1164-NEXT:    s_mov_b32 s14, s33
 ; GFX1164-NEXT:    s_mov_b32 s32, 32
 ; GFX1164-NEXT:    v_mov_b32_e32 v40, v0
@@ -13199,11 +13199,11 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau
 ; GFX1164-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
 ; GFX1164-NEXT:    s_cbranch_execz .LBB17_5
 ; GFX1164-NEXT:  ; %bb.3:
-; GFX1164-NEXT:    s_load_b64 s[44:45], s[34:35], 0x24
+; GFX1164-NEXT:    s_load_b64 s[52:53], s[34:35], 0x24
 ; GFX1164-NEXT:    v_mov_b32_e32 v0, 0
-; GFX1164-NEXT:    s_mov_b64 s[46:47], 0
+; GFX1164-NEXT:    s_mov_b64 s[54:55], 0
 ; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT:    global_load_b64 v[1:2], v0, s[44:45]
+; GFX1164-NEXT:    global_load_b64 v[1:2], v0, s[52:53]
 ; GFX1164-NEXT:    s_set_inst_prefetch_distance 0x1
 ; GFX1164-NEXT:    .p2align 6
 ; GFX1164-NEXT:  .LBB17_4: ; %atomicrmw.start
@@ -13220,18 +13220,18 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau
 ; GFX1164-NEXT:    v_mov_b32_e32 v31, v40
 ; GFX1164-NEXT:    v_mov_b32_e32 v0, 8
 ; GFX1164-NEXT:    v_mov_b32_e32 v1, 0
-; GFX1164-NEXT:    v_mov_b32_e32 v2, s44
+; GFX1164-NEXT:    v_mov_b32_e32 v2, s52
 ; GFX1164-NEXT:    v_mov_b32_e32 v5, 8
 ; GFX1164-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX1164-NEXT:    v_mov_b32_e32 v7, 0
-; GFX1164-NEXT:    s_mov_b64 s[4:5], s[40:41]
+; GFX1164-NEXT:    s_mov_b64 s[4:5], s[48:49]
 ; GFX1164-NEXT:    s_mov_b64 s[6:7], s[38:39]
 ; GFX1164-NEXT:    s_mov_b64 s[10:11], s[36:37]
-; GFX1164-NEXT:    s_mov_b32 s12, s43
-; GFX1164-NEXT:    s_mov_b32 s13, s42
+; GFX1164-NEXT:    s_mov_b32 s12, s51
+; GFX1164-NEXT:    s_mov_b32 s13, s50
 ; GFX1164-NEXT:    s_mov_b32 s14, s33
 ; GFX1164-NEXT:    scratch_store_b64 off, v[3:4], off offset:8
-; GFX1164-NEXT:    v_mov_b32_e32 v3, s45
+; GFX1164-NEXT:    v_mov_b32_e32 v3, s53
 ; GFX1164-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1164-NEXT:    s_swappc_b64 s[30:31], s[0:1]
@@ -13239,8 +13239,8 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau
 ; GFX1164-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
 ; GFX1164-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX1164-NEXT:    s_or_b64 s[46:47], vcc, s[46:47]
-; GFX1164-NEXT:    s_and_not1_b64 exec, exec, s[46:47]
+; GFX1164-NEXT:    s_or_b64 s[54:55], vcc, s[54:55]
+; GFX1164-NEXT:    s_and_not1_b64 exec, exec, s[54:55]
 ; GFX1164-NEXT:    s_cbranch_execnz .LBB17_4
 ; GFX1164-NEXT:  .LBB17_5:
 ; GFX1164-NEXT:    s_set_inst_prefetch_distance 0x2
@@ -13249,7 +13249,7 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau
 ; GFX1132-LABEL: global_atomic_fadd_double_uni_address_div_value_default_scope_strictfp:
 ; GFX1132:       ; %bb.0:
 ; GFX1132-NEXT:    s_mov_b64 s[34:35], s[4:5]
-; GFX1132-NEXT:    s_mov_b64 s[40:41], s[0:1]
+; GFX1132-NEXT:    s_mov_b64 s[48:49], s[0:1]
 ; GFX1132-NEXT:    s_add_u32 s8, s34, 44
 ; GFX1132-NEXT:    s_addc_u32 s9, s35, 0
 ; GFX1132-NEXT:    s_getpc_b64 s[0:1]
@@ -13258,9 +13258,9 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau
 ; GFX1132-NEXT:    v_mov_b32_e32 v31, v0
 ; GFX1132-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
 ; GFX1132-NEXT:    s_mov_b64 s[36:37], s[6:7]
-; GFX1132-NEXT:    s_mov_b32 s42, s14
-; GFX1132-NEXT:    s_mov_b32 s43, s13
-; GFX1132-NEXT:    s_mov_b64 s[4:5], s[40:41]
+; GFX1132-NEXT:    s_mov_b32 s50, s14
+; GFX1132-NEXT:    s_mov_b32 s51, s13
+; GFX1132-NEXT:    s_mov_b64 s[4:5], s[48:49]
 ; GFX1132-NEXT:    s_mov_b64 s[6:7], s[2:3]
 ; GFX1132-NEXT:    s_mov_b64 s[10:11], s[36:37]
 ; GFX1132-NEXT:    s_mov_b32 s12, s13
@@ -13289,17 +13289,17 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau
 ; GFX1132-NEXT:    s_cbranch_scc1 .LBB17_1
 ; GFX1132-NEXT:  ; %bb.2: ; %ComputeEnd
 ; GFX1132-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1132-NEXT:    s_mov_b32 s46, 0
+; GFX1132-NEXT:    s_mov_b32 s54, 0
 ; GFX1132-NEXT:    s_mov_b32 s0, exec_lo
 ; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1132-NEXT:    v_cmpx_eq_u32_e32 0, v0
 ; GFX1132-NEXT:    s_xor_b32 s0, exec_lo, s0
 ; GFX1132-NEXT:    s_cbranch_execz .LBB17_5
 ; GFX1132-NEXT:  ; %bb.3:
-; GFX1132-NEXT:    s_load_b64 s[44:45], s[34:35], 0x24
+; GFX1132-NEXT:    s_load_b64 s[52:53], s[34:35], 0x24
 ; GFX1132-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT:    global_load_b64 v[1:2], v0, s[44:45]
+; GFX1132-NEXT:    global_load_b64 v[1:2], v0, s[52:53]
 ; GFX1132-NEXT:    s_set_inst_prefetch_distance 0x1
 ; GFX1132-NEXT:    .p2align 6
 ; GFX1132-NEXT:  .LBB17_4: ; %atomicrmw.start
@@ -13314,25 +13314,25 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau
 ; GFX1132-NEXT:    scratch_store_b64 off, v[1:2], off
 ; GFX1132-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
 ; GFX1132-NEXT:    v_dual_mov_b32 v31, v40 :: v_dual_mov_b32 v0, 8
-; GFX1132-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s44
+; GFX1132-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s52
 ; GFX1132-NEXT:    v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0
 ; GFX1132-NEXT:    v_mov_b32_e32 v7, 0
-; GFX1132-NEXT:    s_mov_b64 s[4:5], s[40:41]
+; GFX1132-NEXT:    s_mov_b64 s[4:5], s[48:49]
 ; GFX1132-NEXT:    s_mov_b64 s[6:7], s[38:39]
 ; GFX1132-NEXT:    s_mov_b64 s[10:11], s[36:37]
-; GFX1132-NEXT:    s_mov_b32 s12, s43
-; GFX1132-NEXT:    s_mov_b32 s13, s42
+; GFX1132-NEXT:    s_mov_b32 s12, s51
+; GFX1132-NEXT:    s_mov_b32 s13, s50
 ; GFX1132-NEXT:    s_mov_b32 s14, s33
 ; GFX1132-NEXT:    scratch_store_b64 off, v[3:4], off offset:8
-; GFX1132-NEXT:    v_dual_mov_b32 v3, s45 :: v_dual_mov_b32 v4, 0
+; GFX1132-NEXT:    v_dual_mov_b32 v3, s53 :: v_dual_mov_b32 v4, 0
 ; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX1132-NEXT:    scratch_load_b64 v[1:2], off, off
 ; GFX1132-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
 ; GFX1132-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX1132-NEXT:    s_or_b32 s46, vcc_lo, s46
-; GFX1132-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s46
+; GFX1132-NEXT:    s_or_b32 s54, vcc_lo, s54
+; GFX1132-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s54
 ; GFX1132-NEXT:    s_cbranch_execnz .LBB17_4
 ; GFX1132-NEXT:  .LBB17_5:
 ; GFX1132-NEXT:    s_set_inst_prefetch_distance 0x2
@@ -13341,22 +13341,22 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau
 ; GFX7LESS-DPP-LABEL: global_atomic_fadd_double_uni_address_div_value_default_scope_strictfp:
 ; GFX7LESS-DPP:       ; %bb.0:
 ; GFX7LESS-DPP-NEXT:    s_movk_i32 s32, 0x800
-; GFX7LESS-DPP-NEXT:    s_mov_b32 s48, SCRATCH_RSRC_DWORD0
-; GFX7LESS-DPP-NEXT:    s_mov_b32 s49, SCRATCH_RSRC_DWORD1
-; GFX7LESS-DPP-NEXT:    s_mov_b32 s50, -1
-; GFX7LESS-DPP-NEXT:    s_mov_b32 s51, 0xe8f000
-; GFX7LESS-DPP-NEXT:    s_add_u32 s48, s48, s11
-; GFX7LESS-DPP-NEXT:    s_addc_u32 s49, s49, 0
+; GFX7LESS-DPP-NEXT:    s_mov_b32 s64, SCRATCH_RSRC_DWORD0
+; GFX7LESS-DPP-NEXT:    s_mov_b32 s65, SCRATCH_RSRC_DWORD1
+; GFX7LESS-DPP-NEXT:    s_mov_b32 s66, -1
+; GFX7LESS-DPP-NEXT:    s_mov_b32 s67, 0xe8f000
+; GFX7LESS-DPP-NEXT:    s_add_u32 s64, s64, s11
+; GFX7LESS-DPP-NEXT:    s_addc_u32 s65, s65, 0
 ; GFX7LESS-DPP-NEXT:    s_mov_b32 s33, s10
-; GFX7LESS-DPP-NEXT:    s_mov_b32 s42, s9
-; GFX7LESS-DPP-NEXT:    s_mov_b32 s43, s8
+; GFX7LESS-DPP-NEXT:    s_mov_b32 s50, s9
+; GFX7LESS-DPP-NEXT:    s_mov_b32 s51, s8
 ; GFX7LESS-DPP-NEXT:    s_mov_b64 s[34:35], s[6:7]
 ; GFX7LESS-DPP-NEXT:    s_mov_b64 s[36:37], s[4:5]
 ; GFX7LESS-DPP-NEXT:    s_mov_b64 s[38:39], s[2:3]
-; GFX7LESS-DPP-NEXT:    s_mov_b64 s[40:41], s[0:1]
-; GFX7LESS-DPP-NEXT:    s_load_dwordx2 s[44:45], s[4:5], 0x9
-; GFX7LESS-DPP-NEXT:    s_mov_b32 s47, 0xf000
-; GFX7LESS-DPP-NEXT:    s_mov_b32 s46, -1
+; GFX7LESS-DPP-NEXT:    s_mov_b64 s[48:49], s[0:1]
+; GFX7LESS-DPP-NEXT:    s_load_dwordx2 s[52:53], s[4:5], 0x9
+; GFX7LESS-DPP-NEXT:    s_mov_b32 s55, 0xf000
+; GFX7LESS-DPP-NEXT:    s_mov_b32 s54, -1
 ; GFX7LESS-DPP-NEXT:    s_add_u32 s8, s36, 44
 ; GFX7LESS-DPP-NEXT:    s_addc_u32 s9, s37, 0
 ; GFX7LESS-DPP-NEXT:    s_getpc_b64 s[0:1]
@@ -13367,30 +13367,30 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau
 ; GFX7LESS-DPP-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
 ; GFX7LESS-DPP-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GFX7LESS-DPP-NEXT:    v_or_b32_e32 v42, v0, v2
-; GFX7LESS-DPP-NEXT:    s_mov_b64 s[4:5], s[40:41]
+; GFX7LESS-DPP-NEXT:    s_mov_b64 s[4:5], s[48:49]
 ; GFX7LESS-DPP-NEXT:    s_mov_b64 s[6:7], s[2:3]
 ; GFX7LESS-DPP-NEXT:    s_mov_b64 s[10:11], s[34:35]
-; GFX7LESS-DPP-NEXT:    s_mov_b32 s12, s43
-; GFX7LESS-DPP-NEXT:    s_mov_b32 s13, s42
+; GFX7LESS-DPP-NEXT:    s_mov_b32 s12, s51
+; GFX7LESS-DPP-NEXT:    s_mov_b32 s13, s50
 ; GFX7LESS-DPP-NEXT:    s_mov_b32 s14, s33
 ; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v31, v42
-; GFX7LESS-DPP-NEXT:    s_mov_b64 s[0:1], s[48:49]
-; GFX7LESS-DPP-NEXT:    s_mov_b64 s[2:3], s[50:51]
+; GFX7LESS-DPP-NEXT:    s_mov_b64 s[0:1], s[64:65]
+; GFX7LESS-DPP-NEXT:    s_mov_b64 s[2:3], s[66:67]
 ; GFX7LESS-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7LESS-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v40, v0
 ; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v41, v1
-; GFX7LESS-DPP-NEXT:    buffer_load_dwordx2 v[0:1], off, s[44:47], 0
-; GFX7LESS-DPP-NEXT:    s_mov_b64 s[46:47], 0
+; GFX7LESS-DPP-NEXT:    buffer_load_dwordx2 v[0:1], off, s[52:55], 0
+; GFX7LESS-DPP-NEXT:    s_mov_b64 s[54:55], 0
 ; GFX7LESS-DPP-NEXT:  .LBB17_1: ; %atomicrmw.start
 ; GFX7LESS-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX7LESS-DPP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7LESS-DPP-NEXT:    v_add_f64 v[2:3], v[0:1], v[40:41]
-; GFX7LESS-DPP-NEXT:    buffer_store_dword v1, off, s[48:51], 0 offset:4
-; GFX7LESS-DPP-NEXT:    buffer_store_dword v0, off, s[48:51], 0
+; GFX7LESS-DPP-NEXT:    buffer_store_dword v1, off, s[64:67], 0 offset:4
+; GFX7LESS-DPP-NEXT:    buffer_store_dword v0, off, s[64:67], 0
 ; GFX7LESS-DPP-NEXT:    s_add_u32 s8, s36, 44
-; GFX7LESS-DPP-NEXT:    buffer_store_dword v3, off, s[48:51], 0 offset:12
-; GFX7LESS-DPP-NEXT:    buffer_store_dword v2, off, s[48:51], 0 offset:8
+; GFX7LESS-DPP-NEXT:    buffer_store_dword v3, off, s[64:67], 0 offset:12
+; GFX7LESS-DPP-NEXT:    buffer_store_dword v2, off, s[64:67], 0 offset:8
 ; GFX7LESS-DPP-NEXT:    s_addc_u32 s9, s37, 0
 ; GFX7LESS-DPP-NEXT:    s_getpc_b64 s[0:1]
 ; GFX7LESS-DPP-NEXT:    s_add_u32 s0, s0, __atomic_compare_exchange at gotpcrel32@lo+4
@@ -13403,44 +13403,44 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau
 ; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v5, 8
 ; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v7, 0
-; GFX7LESS-DPP-NEXT:    s_mov_b64 s[4:5], s[40:41]
+; GFX7LESS-DPP-NEXT:    s_mov_b64 s[4:5], s[48:49]
 ; GFX7LESS-DPP-NEXT:    s_mov_b64 s[6:7], s[38:39]
 ; GFX7LESS-DPP-NEXT:    s_mov_b64 s[10:11], s[34:35]
-; GFX7LESS-DPP-NEXT:    s_mov_b32 s12, s43
-; GFX7LESS-DPP-NEXT:    s_mov_b32 s13, s42
+; GFX7LESS-DPP-NEXT:    s_mov_b32 s12, s51
+; GFX7LESS-DPP-NEXT:    s_mov_b32 s13, s50
 ; GFX7LESS-DPP-NEXT:    s_mov_b32 s14, s33
 ; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v31, v42
-; GFX7LESS-DPP-NEXT:    s_mov_b64 s[0:1], s[48:49]
-; GFX7LESS-DPP-NEXT:    s_mov_b64 s[2:3], s[50:51]
+; GFX7LESS-DPP-NEXT:    s_mov_b64 s[0:1], s[64:65]
+; GFX7LESS-DPP-NEXT:    s_mov_b64 s[2:3], s[66:67]
 ; GFX7LESS-DPP-NEXT:    s_waitcnt expcnt(0)
-; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v2, s44
-; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v3, s45
+; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v2, s52
+; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v3, s53
 ; GFX7LESS-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7LESS-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GFX7LESS-DPP-NEXT:    v_and_b32_e32 v2, 1, v0
-; GFX7LESS-DPP-NEXT:    buffer_load_dword v0, off, s[48:51], 0
-; GFX7LESS-DPP-NEXT:    buffer_load_dword v1, off, s[48:51], 0 offset:4
+; GFX7LESS-DPP-NEXT:    buffer_load_dword v0, off, s[64:67], 0
+; GFX7LESS-DPP-NEXT:    buffer_load_dword v1, off, s[64:67], 0 offset:4
 ; GFX7LESS-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v2
-; GFX7LESS-DPP-NEXT:    s_or_b64 s[46:47], vcc, s[46:47]
-; GFX7LESS-DPP-NEXT:    s_andn2_b64 exec, exec, s[46:47]
+; GFX7LESS-DPP-NEXT:    s_or_b64 s[54:55], vcc, s[54:55]
+; GFX7LESS-DPP-NEXT:    s_andn2_b64 exec, exec, s[54:55]
 ; GFX7LESS-DPP-NEXT:    s_cbranch_execnz .LBB17_1
 ; GFX7LESS-DPP-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX7LESS-DPP-NEXT:    s_endpgm
 ;
 ; GFX9-DPP-LABEL: global_atomic_fadd_double_uni_address_div_value_default_scope_strictfp:
 ; GFX9-DPP:       ; %bb.0:
-; GFX9-DPP-NEXT:    s_mov_b32 s52, SCRATCH_RSRC_DWORD0
-; GFX9-DPP-NEXT:    s_mov_b32 s53, SCRATCH_RSRC_DWORD1
-; GFX9-DPP-NEXT:    s_mov_b32 s54, -1
-; GFX9-DPP-NEXT:    s_mov_b32 s55, 0xe00000
-; GFX9-DPP-NEXT:    s_add_u32 s52, s52, s11
-; GFX9-DPP-NEXT:    s_addc_u32 s53, s53, 0
+; GFX9-DPP-NEXT:    s_mov_b32 s68, SCRATCH_RSRC_DWORD0
+; GFX9-DPP-NEXT:    s_mov_b32 s69, SCRATCH_RSRC_DWORD1
+; GFX9-DPP-NEXT:    s_mov_b32 s70, -1
+; GFX9-DPP-NEXT:    s_mov_b32 s71, 0xe00000
+; GFX9-DPP-NEXT:    s_add_u32 s68, s68, s11
+; GFX9-DPP-NEXT:    s_addc_u32 s69, s69, 0
 ; GFX9-DPP-NEXT:    s_mov_b64 s[36:37], s[4:5]
-; GFX9-DPP-NEXT:    s_mov_b32 s43, s8
+; GFX9-DPP-NEXT:    s_mov_b32 s51, s8
 ; GFX9-DPP-NEXT:    s_add_u32 s8, s36, 44
-; GFX9-DPP-NEXT:    s_mov_b32 s42, s9
+; GFX9-DPP-NEXT:    s_mov_b32 s50, s9
 ; GFX9-DPP-NEXT:    s_addc_u32 s9, s37, 0
-; GFX9-DPP-NEXT:    s_mov_b64 s[40:41], s[0:1]
+; GFX9-DPP-NEXT:    s_mov_b64 s[48:49], s[0:1]
 ; GFX9-DPP-NEXT:    s_getpc_b64 s[0:1]
 ; GFX9-DPP-NEXT:    s_add_u32 s0, s0, div.float.value at gotpcrel32@lo+4
 ; GFX9-DPP-NEXT:    s_addc_u32 s1, s1, div.float.value at gotpcrel32@hi+12
@@ -13452,14 +13452,14 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau
 ; GFX9-DPP-NEXT:    s_mov_b64 s[38:39], s[2:3]
 ; GFX9-DPP-NEXT:    v_or3_b32 v40, v0, v1, v2
 ; GFX9-DPP-NEXT:    s_mov_b64 s[6:7], s[2:3]
-; GFX9-DPP-NEXT:    s_mov_b64 s[0:1], s[52:53]
-; GFX9-DPP-NEXT:    s_mov_b64 s[4:5], s[40:41]
+; GFX9-DPP-NEXT:    s_mov_b64 s[0:1], s[68:69]
+; GFX9-DPP-NEXT:    s_mov_b64 s[4:5], s[48:49]
 ; GFX9-DPP-NEXT:    s_mov_b64 s[10:11], s[34:35]
-; GFX9-DPP-NEXT:    s_mov_b32 s12, s43
-; GFX9-DPP-NEXT:    s_mov_b32 s13, s42
+; GFX9-DPP-NEXT:    s_mov_b32 s12, s51
+; GFX9-DPP-NEXT:    s_mov_b32 s13, s50
 ; GFX9-DPP-NEXT:    s_mov_b32 s14, s33
 ; GFX9-DPP-NEXT:    v_mov_b32_e32 v31, v40
-; GFX9-DPP-NEXT:    s_mov_b64 s[2:3], s[54:55]
+; GFX9-DPP-NEXT:    s_mov_b64 s[2:3], s[70:71]
 ; GFX9-DPP-NEXT:    s_movk_i32 s32, 0x800
 ; GFX9-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
@@ -13506,74 +13506,74 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau
 ; GFX9-DPP-NEXT:    v_mbcnt_lo_u32_b32 v1, exec_lo, 0
 ; GFX9-DPP-NEXT:    v_mbcnt_hi_u32_b32 v1, exec_hi, v1
 ; GFX9-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
-; GFX9-DPP-NEXT:    v_readlane_b32 s45, v9, 63
-; GFX9-DPP-NEXT:    v_readlane_b32 s44, v8, 63
+; GFX9-DPP-NEXT:    v_readlane_b32 s53, v9, 63
+; GFX9-DPP-NEXT:    v_readlane_b32 s52, v8, 63
 ; GFX9-DPP-NEXT:    s_mov_b64 exec, s[0:1]
 ; GFX9-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
 ; GFX9-DPP-NEXT:    s_and_saveexec_b64 s[0:1], vcc
 ; GFX9-DPP-NEXT:    s_cbranch_execz .LBB17_3
 ; GFX9-DPP-NEXT:  ; %bb.1:
-; GFX9-DPP-NEXT:    s_load_dwordx2 s[46:47], s[36:37], 0x24
-; GFX9-DPP-NEXT:    s_mov_b64 s[48:49], 0
+; GFX9-DPP-NEXT:    s_load_dwordx2 s[54:55], s[36:37], 0x24
+; GFX9-DPP-NEXT:    s_mov_b64 s[64:65], 0
 ; GFX9-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-DPP-NEXT:    global_load_dwordx2 v[1:2], v0, s[46:47]
+; GFX9-DPP-NEXT:    global_load_dwordx2 v[1:2], v0, s[54:55]
 ; GFX9-DPP-NEXT:  .LBB17_2: ; %atomicrmw.start
 ; GFX9-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX9-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-DPP-NEXT:    v_add_f64 v[3:4], v[1:2], s[44:45]
+; GFX9-DPP-NEXT:    v_add_f64 v[3:4], v[1:2], s[52:53]
 ; GFX9-DPP-NEXT:    s_add_u32 s8, s36, 44
 ; GFX9-DPP-NEXT:    s_addc_u32 s9, s37, 0
 ; GFX9-DPP-NEXT:    s_getpc_b64 s[0:1]
 ; GFX9-DPP-NEXT:    s_add_u32 s0, s0, __atomic_compare_exchange at gotpcrel32@lo+4
 ; GFX9-DPP-NEXT:    s_addc_u32 s1, s1, __atomic_compare_exchange at gotpcrel32@hi+12
 ; GFX9-DPP-NEXT:    s_load_dwordx2 s[16:17], s[0:1], 0x0
-; GFX9-DPP-NEXT:    s_mov_b64 s[0:1], s[52:53]
-; GFX9-DPP-NEXT:    buffer_store_dword v2, off, s[52:55], 0 offset:4
-; GFX9-DPP-NEXT:    buffer_store_dword v1, off, s[52:55], 0
-; GFX9-DPP-NEXT:    s_mov_b64 s[4:5], s[40:41]
-; GFX9-DPP-NEXT:    buffer_store_dword v4, off, s[52:55], 0 offset:12
-; GFX9-DPP-NEXT:    buffer_store_dword v3, off, s[52:55], 0 offset:8
+; GFX9-DPP-NEXT:    s_mov_b64 s[0:1], s[68:69]
+; GFX9-DPP-NEXT:    buffer_store_dword v2, off, s[68:71], 0 offset:4
+; GFX9-DPP-NEXT:    buffer_store_dword v1, off, s[68:71], 0
+; GFX9-DPP-NEXT:    s_mov_b64 s[4:5], s[48:49]
+; GFX9-DPP-NEXT:    buffer_store_dword v4, off, s[68:71], 0 offset:12
+; GFX9-DPP-NEXT:    buffer_store_dword v3, off, s[68:71], 0 offset:8
 ; GFX9-DPP-NEXT:    s_mov_b64 s[6:7], s[38:39]
 ; GFX9-DPP-NEXT:    s_mov_b64 s[10:11], s[34:35]
-; GFX9-DPP-NEXT:    s_mov_b32 s12, s43
-; GFX9-DPP-NEXT:    s_mov_b32 s13, s42
+; GFX9-DPP-NEXT:    s_mov_b32 s12, s51
+; GFX9-DPP-NEXT:    s_mov_b32 s13, s50
 ; GFX9-DPP-NEXT:    s_mov_b32 s14, s33
 ; GFX9-DPP-NEXT:    v_mov_b32_e32 v31, v40
-; GFX9-DPP-NEXT:    s_mov_b64 s[2:3], s[54:55]
+; GFX9-DPP-NEXT:    s_mov_b64 s[2:3], s[70:71]
 ; GFX9-DPP-NEXT:    v_mov_b32_e32 v0, 8
 ; GFX9-DPP-NEXT:    v_mov_b32_e32 v1, 0
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v2, s46
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v3, s47
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v2, s54
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v3, s55
 ; GFX9-DPP-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX9-DPP-NEXT:    v_mov_b32_e32 v5, 8
 ; GFX9-DPP-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX9-DPP-NEXT:    v_mov_b32_e32 v7, 0
 ; GFX9-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
-; GFX9-DPP-NEXT:    buffer_load_dword v1, off, s[52:55], 0
-; GFX9-DPP-NEXT:    buffer_load_dword v2, off, s[52:55], 0 offset:4
+; GFX9-DPP-NEXT:    buffer_load_dword v1, off, s[68:71], 0
+; GFX9-DPP-NEXT:    buffer_load_dword v2, off, s[68:71], 0 offset:4
 ; GFX9-DPP-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX9-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX9-DPP-NEXT:    s_or_b64 s[48:49], vcc, s[48:49]
-; GFX9-DPP-NEXT:    s_andn2_b64 exec, exec, s[48:49]
+; GFX9-DPP-NEXT:    s_or_b64 s[64:65], vcc, s[64:65]
+; GFX9-DPP-NEXT:    s_andn2_b64 exec, exec, s[64:65]
 ; GFX9-DPP-NEXT:    s_cbranch_execnz .LBB17_2
 ; GFX9-DPP-NEXT:  .LBB17_3:
 ; GFX9-DPP-NEXT:    s_endpgm
 ;
 ; GFX1064-DPP-LABEL: global_atomic_fadd_double_uni_address_div_value_default_scope_strictfp:
 ; GFX1064-DPP:       ; %bb.0:
-; GFX1064-DPP-NEXT:    s_mov_b32 s48, SCRATCH_RSRC_DWORD0
-; GFX1064-DPP-NEXT:    s_mov_b32 s49, SCRATCH_RSRC_DWORD1
-; GFX1064-DPP-NEXT:    s_mov_b32 s50, -1
-; GFX1064-DPP-NEXT:    s_mov_b32 s51, 0x31e16000
-; GFX1064-DPP-NEXT:    s_add_u32 s48, s48, s11
+; GFX1064-DPP-NEXT:    s_mov_b32 s64, SCRATCH_RSRC_DWORD0
+; GFX1064-DPP-NEXT:    s_mov_b32 s65, SCRATCH_RSRC_DWORD1
+; GFX1064-DPP-NEXT:    s_mov_b32 s66, -1
+; GFX1064-DPP-NEXT:    s_mov_b32 s67, 0x31e16000
+; GFX1064-DPP-NEXT:    s_add_u32 s64, s64, s11
 ; GFX1064-DPP-NEXT:    s_mov_b64 s[34:35], s[4:5]
-; GFX1064-DPP-NEXT:    s_addc_u32 s49, s49, 0
-; GFX1064-DPP-NEXT:    s_mov_b32 s43, s8
+; GFX1064-DPP-NEXT:    s_addc_u32 s65, s65, 0
+; GFX1064-DPP-NEXT:    s_mov_b32 s51, s8
 ; GFX1064-DPP-NEXT:    s_add_u32 s8, s34, 44
-; GFX1064-DPP-NEXT:    s_mov_b32 s42, s9
+; GFX1064-DPP-NEXT:    s_mov_b32 s50, s9
 ; GFX1064-DPP-NEXT:    s_addc_u32 s9, s35, 0
-; GFX1064-DPP-NEXT:    s_mov_b64 s[40:41], s[0:1]
+; GFX1064-DPP-NEXT:    s_mov_b64 s[48:49], s[0:1]
 ; GFX1064-DPP-NEXT:    s_getpc_b64 s[0:1]
 ; GFX1064-DPP-NEXT:    s_add_u32 s0, s0, div.float.value at gotpcrel32@lo+4
 ; GFX1064-DPP-NEXT:    s_addc_u32 s1, s1, div.float.value at gotpcrel32@hi+12
@@ -13585,14 +13585,14 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau
 ; GFX1064-DPP-NEXT:    s_mov_b64 s[38:39], s[2:3]
 ; GFX1064-DPP-NEXT:    s_mov_b64 s[6:7], s[2:3]
 ; GFX1064-DPP-NEXT:    v_or3_b32 v40, v0, v1, v2
-; GFX1064-DPP-NEXT:    s_mov_b64 s[0:1], s[48:49]
-; GFX1064-DPP-NEXT:    s_mov_b64 s[4:5], s[40:41]
+; GFX1064-DPP-NEXT:    s_mov_b64 s[0:1], s[64:65]
+; GFX1064-DPP-NEXT:    s_mov_b64 s[4:5], s[48:49]
 ; GFX1064-DPP-NEXT:    s_mov_b64 s[10:11], s[36:37]
-; GFX1064-DPP-NEXT:    s_mov_b32 s12, s43
+; GFX1064-DPP-NEXT:    s_mov_b32 s12, s51
 ; GFX1064-DPP-NEXT:    v_mov_b32_e32 v31, v40
-; GFX1064-DPP-NEXT:    s_mov_b32 s13, s42
+; GFX1064-DPP-NEXT:    s_mov_b32 s13, s50
 ; GFX1064-DPP-NEXT:    s_mov_b32 s14, s33
-; GFX1064-DPP-NEXT:    s_mov_b64 s[2:3], s[50:51]
+; GFX1064-DPP-NEXT:    s_mov_b64 s[2:3], s[66:67]
 ; GFX1064-DPP-NEXT:    s_movk_i32 s32, 0x800
 ; GFX1064-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1064-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
@@ -13637,10 +13637,10 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau
 ; GFX1064-DPP-NEXT:    s_and_saveexec_b64 s[0:1], vcc
 ; GFX1064-DPP-NEXT:    s_cbranch_execz .LBB17_3
 ; GFX1064-DPP-NEXT:  ; %bb.1:
-; GFX1064-DPP-NEXT:    s_load_dwordx2 s[44:45], s[34:35], 0x24
-; GFX1064-DPP-NEXT:    s_mov_b64 s[46:47], 0
+; GFX1064-DPP-NEXT:    s_load_dwordx2 s[52:53], s[34:35], 0x24
+; GFX1064-DPP-NEXT:    s_mov_b64 s[54:55], 0
 ; GFX1064-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1064-DPP-NEXT:    global_load_dwordx2 v[1:2], v0, s[44:45]
+; GFX1064-DPP-NEXT:    global_load_dwordx2 v[1:2], v0, s[52:53]
 ; GFX1064-DPP-NEXT:  .LBB17_2: ; %atomicrmw.start
 ; GFX1064-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1064-DPP-NEXT:    s_waitcnt vmcnt(0)
@@ -13650,55 +13650,55 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau
 ; GFX1064-DPP-NEXT:    s_getpc_b64 s[0:1]
 ; GFX1064-DPP-NEXT:    s_add_u32 s0, s0, __atomic_compare_exchange at gotpcrel32@lo+4
 ; GFX1064-DPP-NEXT:    s_addc_u32 s1, s1, __atomic_compare_exchange at gotpcrel32@hi+12
-; GFX1064-DPP-NEXT:    buffer_store_dword v2, off, s[48:51], 0 offset:4
-; GFX1064-DPP-NEXT:    buffer_store_dword v1, off, s[48:51], 0
+; GFX1064-DPP-NEXT:    buffer_store_dword v2, off, s[64:67], 0 offset:4
+; GFX1064-DPP-NEXT:    buffer_store_dword v1, off, s[64:67], 0
 ; GFX1064-DPP-NEXT:    s_load_dwordx2 s[16:17], s[0:1], 0x0
 ; GFX1064-DPP-NEXT:    v_mov_b32_e32 v31, v40
 ; GFX1064-DPP-NEXT:    v_mov_b32_e32 v0, 8
 ; GFX1064-DPP-NEXT:    v_mov_b32_e32 v1, 0
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v2, s44
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v2, s52
 ; GFX1064-DPP-NEXT:    v_mov_b32_e32 v5, 8
 ; GFX1064-DPP-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX1064-DPP-NEXT:    v_mov_b32_e32 v7, 0
-; GFX1064-DPP-NEXT:    s_mov_b64 s[0:1], s[48:49]
-; GFX1064-DPP-NEXT:    s_mov_b64 s[4:5], s[40:41]
+; GFX1064-DPP-NEXT:    s_mov_b64 s[0:1], s[64:65]
+; GFX1064-DPP-NEXT:    s_mov_b64 s[4:5], s[48:49]
 ; GFX1064-DPP-NEXT:    s_mov_b64 s[6:7], s[38:39]
 ; GFX1064-DPP-NEXT:    s_mov_b64 s[10:11], s[36:37]
-; GFX1064-DPP-NEXT:    s_mov_b32 s12, s43
-; GFX1064-DPP-NEXT:    s_mov_b32 s13, s42
+; GFX1064-DPP-NEXT:    s_mov_b32 s12, s51
+; GFX1064-DPP-NEXT:    s_mov_b32 s13, s50
 ; GFX1064-DPP-NEXT:    s_mov_b32 s14, s33
-; GFX1064-DPP-NEXT:    s_mov_b64 s[2:3], s[50:51]
-; GFX1064-DPP-NEXT:    buffer_store_dword v4, off, s[48:51], 0 offset:12
-; GFX1064-DPP-NEXT:    buffer_store_dword v3, off, s[48:51], 0 offset:8
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v3, s45
+; GFX1064-DPP-NEXT:    s_mov_b64 s[2:3], s[66:67]
+; GFX1064-DPP-NEXT:    buffer_store_dword v4, off, s[64:67], 0 offset:12
+; GFX1064-DPP-NEXT:    buffer_store_dword v3, off, s[64:67], 0 offset:8
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v3, s53
 ; GFX1064-DPP-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX1064-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1064-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GFX1064-DPP-NEXT:    s_clause 0x1
-; GFX1064-DPP-NEXT:    buffer_load_dword v1, off, s[48:51], 0
-; GFX1064-DPP-NEXT:    buffer_load_dword v2, off, s[48:51], 0 offset:4
+; GFX1064-DPP-NEXT:    buffer_load_dword v1, off, s[64:67], 0
+; GFX1064-DPP-NEXT:    buffer_load_dword v2, off, s[64:67], 0 offset:4
 ; GFX1064-DPP-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX1064-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX1064-DPP-NEXT:    s_or_b64 s[46:47], vcc, s[46:47]
-; GFX1064-DPP-NEXT:    s_andn2_b64 exec, exec, s[46:47]
+; GFX1064-DPP-NEXT:    s_or_b64 s[54:55], vcc, s[54:55]
+; GFX1064-DPP-NEXT:    s_andn2_b64 exec, exec, s[54:55]
 ; GFX1064-DPP-NEXT:    s_cbranch_execnz .LBB17_2
 ; GFX1064-DPP-NEXT:  .LBB17_3:
 ; GFX1064-DPP-NEXT:    s_endpgm
 ;
 ; GFX1032-DPP-LABEL: global_atomic_fadd_double_uni_address_div_value_default_scope_strictfp:
 ; GFX1032-DPP:       ; %bb.0:
-; GFX1032-DPP-NEXT:    s_mov_b32 s48, SCRATCH_RSRC_DWORD0
-; GFX1032-DPP-NEXT:    s_mov_b32 s49, SCRATCH_RSRC_DWORD1
-; GFX1032-DPP-NEXT:    s_mov_b32 s50, -1
-; GFX1032-DPP-NEXT:    s_mov_b32 s51, 0x31c16000
-; GFX1032-DPP-NEXT:    s_add_u32 s48, s48, s11
+; GFX1032-DPP-NEXT:    s_mov_b32 s64, SCRATCH_RSRC_DWORD0
+; GFX1032-DPP-NEXT:    s_mov_b32 s65, SCRATCH_RSRC_DWORD1
+; GFX1032-DPP-NEXT:    s_mov_b32 s66, -1
+; GFX1032-DPP-NEXT:    s_mov_b32 s67, 0x31c16000
+; GFX1032-DPP-NEXT:    s_add_u32 s64, s64, s11
 ; GFX1032-DPP-NEXT:    s_mov_b64 s[34:35], s[4:5]
-; GFX1032-DPP-NEXT:    s_addc_u32 s49, s49, 0
-; GFX1032-DPP-NEXT:    s_mov_b32 s43, s8
+; GFX1032-DPP-NEXT:    s_addc_u32 s65, s65, 0
+; GFX1032-DPP-NEXT:    s_mov_b32 s51, s8
 ; GFX1032-DPP-NEXT:    s_add_u32 s8, s34, 44
-; GFX1032-DPP-NEXT:    s_mov_b32 s42, s9
+; GFX1032-DPP-NEXT:    s_mov_b32 s50, s9
 ; GFX1032-DPP-NEXT:    s_addc_u32 s9, s35, 0
-; GFX1032-DPP-NEXT:    s_mov_b64 s[40:41], s[0:1]
+; GFX1032-DPP-NEXT:    s_mov_b64 s[48:49], s[0:1]
 ; GFX1032-DPP-NEXT:    s_getpc_b64 s[0:1]
 ; GFX1032-DPP-NEXT:    s_add_u32 s0, s0, div.float.value at gotpcrel32@lo+4
 ; GFX1032-DPP-NEXT:    s_addc_u32 s1, s1, div.float.value at gotpcrel32@hi+12
@@ -13710,14 +13710,14 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau
 ; GFX1032-DPP-NEXT:    s_mov_b64 s[38:39], s[2:3]
 ; GFX1032-DPP-NEXT:    s_mov_b64 s[6:7], s[2:3]
 ; GFX1032-DPP-NEXT:    v_or3_b32 v40, v0, v1, v2
-; GFX1032-DPP-NEXT:    s_mov_b64 s[0:1], s[48:49]
-; GFX1032-DPP-NEXT:    s_mov_b64 s[4:5], s[40:41]
+; GFX1032-DPP-NEXT:    s_mov_b64 s[0:1], s[64:65]
+; GFX1032-DPP-NEXT:    s_mov_b64 s[4:5], s[48:49]
 ; GFX1032-DPP-NEXT:    s_mov_b64 s[10:11], s[36:37]
-; GFX1032-DPP-NEXT:    s_mov_b32 s12, s43
+; GFX1032-DPP-NEXT:    s_mov_b32 s12, s51
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v31, v40
-; GFX1032-DPP-NEXT:    s_mov_b32 s13, s42
+; GFX1032-DPP-NEXT:    s_mov_b32 s13, s50
 ; GFX1032-DPP-NEXT:    s_mov_b32 s14, s33
-; GFX1032-DPP-NEXT:    s_mov_b64 s[2:3], s[50:51]
+; GFX1032-DPP-NEXT:    s_mov_b64 s[2:3], s[66:67]
 ; GFX1032-DPP-NEXT:    s_movk_i32 s32, 0x400
 ; GFX1032-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1032-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
@@ -13752,14 +13752,14 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v41, v8
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v42, v9
-; GFX1032-DPP-NEXT:    s_mov_b32 s46, 0
+; GFX1032-DPP-NEXT:    s_mov_b32 s54, 0
 ; GFX1032-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
 ; GFX1032-DPP-NEXT:    s_and_saveexec_b32 s0, vcc_lo
 ; GFX1032-DPP-NEXT:    s_cbranch_execz .LBB17_3
 ; GFX1032-DPP-NEXT:  ; %bb.1:
-; GFX1032-DPP-NEXT:    s_load_dwordx2 s[44:45], s[34:35], 0x24
+; GFX1032-DPP-NEXT:    s_load_dwordx2 s[52:53], s[34:35], 0x24
 ; GFX1032-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT:    global_load_dwordx2 v[1:2], v0, s[44:45]
+; GFX1032-DPP-NEXT:    global_load_dwordx2 v[1:2], v0, s[52:53]
 ; GFX1032-DPP-NEXT:  .LBB17_2: ; %atomicrmw.start
 ; GFX1032-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1032-DPP-NEXT:    s_waitcnt vmcnt(0)
@@ -13769,37 +13769,37 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau
 ; GFX1032-DPP-NEXT:    s_getpc_b64 s[0:1]
 ; GFX1032-DPP-NEXT:    s_add_u32 s0, s0, __atomic_compare_exchange at gotpcrel32@lo+4
 ; GFX1032-DPP-NEXT:    s_addc_u32 s1, s1, __atomic_compare_exchange at gotpcrel32@hi+12
-; GFX1032-DPP-NEXT:    buffer_store_dword v2, off, s[48:51], 0 offset:4
-; GFX1032-DPP-NEXT:    buffer_store_dword v1, off, s[48:51], 0
+; GFX1032-DPP-NEXT:    buffer_store_dword v2, off, s[64:67], 0 offset:4
+; GFX1032-DPP-NEXT:    buffer_store_dword v1, off, s[64:67], 0
 ; GFX1032-DPP-NEXT:    s_load_dwordx2 s[16:17], s[0:1], 0x0
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v31, v40
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v0, 8
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v1, 0
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v2, s44
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v2, s52
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v5, 8
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v7, 0
-; GFX1032-DPP-NEXT:    s_mov_b64 s[0:1], s[48:49]
-; GFX1032-DPP-NEXT:    s_mov_b64 s[4:5], s[40:41]
+; GFX1032-DPP-NEXT:    s_mov_b64 s[0:1], s[64:65]
+; GFX1032-DPP-NEXT:    s_mov_b64 s[4:5], s[48:49]
 ; GFX1032-DPP-NEXT:    s_mov_b64 s[6:7], s[38:39]
 ; GFX1032-DPP-NEXT:    s_mov_b64 s[10:11], s[36:37]
-; GFX1032-DPP-NEXT:    s_mov_b32 s12, s43
-; GFX1032-DPP-NEXT:    s_mov_b32 s13, s42
+; GFX1032-DPP-NEXT:    s_mov_b32 s12, s51
+; GFX1032-DPP-NEXT:    s_mov_b32 s13, s50
 ; GFX1032-DPP-NEXT:    s_mov_b32 s14, s33
-; GFX1032-DPP-NEXT:    s_mov_b64 s[2:3], s[50:51]
-; GFX1032-DPP-NEXT:    buffer_store_dword v4, off, s[48:51], 0 offset:12
-; GFX1032-DPP-NEXT:    buffer_store_dword v3, off, s[48:51], 0 offset:8
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v3, s45
+; GFX1032-DPP-NEXT:    s_mov_b64 s[2:3], s[66:67]
+; GFX1032-DPP-NEXT:    buffer_store_dword v4, off, s[64:67], 0 offset:12
+; GFX1032-DPP-NEXT:    buffer_store_dword v3, off, s[64:67], 0 offset:8
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v3, s53
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX1032-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1032-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GFX1032-DPP-NEXT:    s_clause 0x1
-; GFX1032-DPP-NEXT:    buffer_load_dword v1, off, s[48:51], 0
-; GFX1032-DPP-NEXT:    buffer_load_dword v2, off, s[48:51], 0 offset:4
+; GFX1032-DPP-NEXT:    buffer_load_dword v1, off, s[64:67], 0
+; GFX1032-DPP-NEXT:    buffer_load_dword v2, off, s[64:67], 0 offset:4
 ; GFX1032-DPP-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX1032-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX1032-DPP-NEXT:    s_or_b32 s46, vcc_lo, s46
-; GFX1032-DPP-NEXT:    s_andn2_b32 exec_lo, exec_lo, s46
+; GFX1032-DPP-NEXT:    s_or_b32 s54, vcc_lo, s54
+; GFX1032-DPP-NEXT:    s_andn2_b32 exec_lo, exec_lo, s54
 ; GFX1032-DPP-NEXT:    s_cbranch_execnz .LBB17_2
 ; GFX1032-DPP-NEXT:  .LBB17_3:
 ; GFX1032-DPP-NEXT:    s_endpgm
@@ -13807,11 +13807,11 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau
 ; GFX1164-DPP-LABEL: global_atomic_fadd_double_uni_address_div_value_default_scope_strictfp:
 ; GFX1164-DPP:       ; %bb.0:
 ; GFX1164-DPP-NEXT:    s_mov_b64 s[34:35], s[4:5]
-; GFX1164-DPP-NEXT:    s_mov_b32 s43, s8
+; GFX1164-DPP-NEXT:    s_mov_b32 s51, s8
 ; GFX1164-DPP-NEXT:    s_add_u32 s8, s34, 44
-; GFX1164-DPP-NEXT:    s_mov_b32 s42, s9
+; GFX1164-DPP-NEXT:    s_mov_b32 s50, s9
 ; GFX1164-DPP-NEXT:    s_addc_u32 s9, s35, 0
-; GFX1164-DPP-NEXT:    s_mov_b64 s[40:41], s[0:1]
+; GFX1164-DPP-NEXT:    s_mov_b64 s[48:49], s[0:1]
 ; GFX1164-DPP-NEXT:    s_getpc_b64 s[0:1]
 ; GFX1164-DPP-NEXT:    s_add_u32 s0, s0, div.float.value at gotpcrel32@lo+4
 ; GFX1164-DPP-NEXT:    s_addc_u32 s1, s1, div.float.value at gotpcrel32@hi+12
@@ -13819,11 +13819,11 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau
 ; GFX1164-DPP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
 ; GFX1164-DPP-NEXT:    s_mov_b32 s33, s10
 ; GFX1164-DPP-NEXT:    s_mov_b64 s[36:37], s[6:7]
-; GFX1164-DPP-NEXT:    s_mov_b64 s[4:5], s[40:41]
+; GFX1164-DPP-NEXT:    s_mov_b64 s[4:5], s[48:49]
 ; GFX1164-DPP-NEXT:    s_mov_b64 s[6:7], s[2:3]
 ; GFX1164-DPP-NEXT:    s_mov_b64 s[10:11], s[36:37]
-; GFX1164-DPP-NEXT:    s_mov_b32 s12, s43
-; GFX1164-DPP-NEXT:    s_mov_b32 s13, s42
+; GFX1164-DPP-NEXT:    s_mov_b32 s12, s51
+; GFX1164-DPP-NEXT:    s_mov_b32 s13, s50
 ; GFX1164-DPP-NEXT:    s_mov_b32 s14, s33
 ; GFX1164-DPP-NEXT:    s_mov_b32 s32, 32
 ; GFX1164-DPP-NEXT:    v_mov_b32_e32 v40, v0
@@ -13881,10 +13881,10 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau
 ; GFX1164-DPP-NEXT:    v_cmpx_eq_u32_e32 0, v1
 ; GFX1164-DPP-NEXT:    s_cbranch_execz .LBB17_3
 ; GFX1164-DPP-NEXT:  ; %bb.1:
-; GFX1164-DPP-NEXT:    s_load_b64 s[44:45], s[34:35], 0x24
-; GFX1164-DPP-NEXT:    s_mov_b64 s[46:47], 0
+; GFX1164-DPP-NEXT:    s_load_b64 s[52:53], s[34:35], 0x24
+; GFX1164-DPP-NEXT:    s_mov_b64 s[54:55], 0
 ; GFX1164-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT:    global_load_b64 v[1:2], v0, s[44:45]
+; GFX1164-DPP-NEXT:    global_load_b64 v[1:2], v0, s[52:53]
 ; GFX1164-DPP-NEXT:    s_set_inst_prefetch_distance 0x1
 ; GFX1164-DPP-NEXT:    .p2align 6
 ; GFX1164-DPP-NEXT:  .LBB17_2: ; %atomicrmw.start
@@ -13901,18 +13901,18 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau
 ; GFX1164-DPP-NEXT:    v_mov_b32_e32 v31, v40
 ; GFX1164-DPP-NEXT:    v_mov_b32_e32 v0, 8
 ; GFX1164-DPP-NEXT:    v_mov_b32_e32 v1, 0
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v2, s44
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v2, s52
 ; GFX1164-DPP-NEXT:    v_mov_b32_e32 v5, 8
 ; GFX1164-DPP-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX1164-DPP-NEXT:    v_mov_b32_e32 v7, 0
-; GFX1164-DPP-NEXT:    s_mov_b64 s[4:5], s[40:41]
+; GFX1164-DPP-NEXT:    s_mov_b64 s[4:5], s[48:49]
 ; GFX1164-DPP-NEXT:    s_mov_b64 s[6:7], s[38:39]
 ; GFX1164-DPP-NEXT:    s_mov_b64 s[10:11], s[36:37]
-; GFX1164-DPP-NEXT:    s_mov_b32 s12, s43
-; GFX1164-DPP-NEXT:    s_mov_b32 s13, s42
+; GFX1164-DPP-NEXT:    s_mov_b32 s12, s51
+; GFX1164-DPP-NEXT:    s_mov_b32 s13, s50
 ; GFX1164-DPP-NEXT:    s_mov_b32 s14, s33
 ; GFX1164-DPP-NEXT:    scratch_store_b64 off, v[3:4], off offset:8
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v3, s45
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v3, s53
 ; GFX1164-DPP-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX1164-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1164-DPP-NEXT:    s_swappc_b64 s[30:31], s[0:1]
@@ -13920,8 +13920,8 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau
 ; GFX1164-DPP-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
 ; GFX1164-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX1164-DPP-NEXT:    s_or_b64 s[46:47], vcc, s[46:47]
-; GFX1164-DPP-NEXT:    s_and_not1_b64 exec, exec, s[46:47]
+; GFX1164-DPP-NEXT:    s_or_b64 s[54:55], vcc, s[54:55]
+; GFX1164-DPP-NEXT:    s_and_not1_b64 exec, exec, s[54:55]
 ; GFX1164-DPP-NEXT:    s_cbranch_execnz .LBB17_2
 ; GFX1164-DPP-NEXT:  .LBB17_3:
 ; GFX1164-DPP-NEXT:    s_set_inst_prefetch_distance 0x2
@@ -13930,7 +13930,7 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau
 ; GFX1132-DPP-LABEL: global_atomic_fadd_double_uni_address_div_value_default_scope_strictfp:
 ; GFX1132-DPP:       ; %bb.0:
 ; GFX1132-DPP-NEXT:    s_mov_b64 s[34:35], s[4:5]
-; GFX1132-DPP-NEXT:    s_mov_b64 s[40:41], s[0:1]
+; GFX1132-DPP-NEXT:    s_mov_b64 s[48:49], s[0:1]
 ; GFX1132-DPP-NEXT:    s_add_u32 s8, s34, 44
 ; GFX1132-DPP-NEXT:    s_addc_u32 s9, s35, 0
 ; GFX1132-DPP-NEXT:    s_getpc_b64 s[0:1]
@@ -13939,9 +13939,9 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau
 ; GFX1132-DPP-NEXT:    v_mov_b32_e32 v31, v0
 ; GFX1132-DPP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
 ; GFX1132-DPP-NEXT:    s_mov_b64 s[36:37], s[6:7]
-; GFX1132-DPP-NEXT:    s_mov_b32 s42, s14
-; GFX1132-DPP-NEXT:    s_mov_b32 s43, s13
-; GFX1132-DPP-NEXT:    s_mov_b64 s[4:5], s[40:41]
+; GFX1132-DPP-NEXT:    s_mov_b32 s50, s14
+; GFX1132-DPP-NEXT:    s_mov_b32 s51, s13
+; GFX1132-DPP-NEXT:    s_mov_b64 s[4:5], s[48:49]
 ; GFX1132-DPP-NEXT:    s_mov_b64 s[6:7], s[2:3]
 ; GFX1132-DPP-NEXT:    s_mov_b64 s[10:11], s[36:37]
 ; GFX1132-DPP-NEXT:    s_mov_b32 s12, s13
@@ -13992,14 +13992,14 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau
 ; GFX1132-DPP-NEXT:    v_mbcnt_lo_u32_b32 v1, exec_lo, 0
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2)
 ; GFX1132-DPP-NEXT:    v_mov_b32_e32 v42, v9
-; GFX1132-DPP-NEXT:    s_mov_b32 s46, 0
+; GFX1132-DPP-NEXT:    s_mov_b32 s54, 0
 ; GFX1132-DPP-NEXT:    s_mov_b32 s0, exec_lo
 ; GFX1132-DPP-NEXT:    v_cmpx_eq_u32_e32 0, v1
 ; GFX1132-DPP-NEXT:    s_cbranch_execz .LBB17_3
 ; GFX1132-DPP-NEXT:  ; %bb.1:
-; GFX1132-DPP-NEXT:    s_load_b64 s[44:45], s[34:35], 0x24
+; GFX1132-DPP-NEXT:    s_load_b64 s[52:53], s[34:35], 0x24
 ; GFX1132-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT:    global_load_b64 v[1:2], v0, s[44:45]
+; GFX1132-DPP-NEXT:    global_load_b64 v[1:2], v0, s[52:53]
 ; GFX1132-DPP-NEXT:    s_set_inst_prefetch_distance 0x1
 ; GFX1132-DPP-NEXT:    .p2align 6
 ; GFX1132-DPP-NEXT:  .LBB17_2: ; %atomicrmw.start
@@ -14014,25 +14014,25 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau
 ; GFX1132-DPP-NEXT:    scratch_store_b64 off, v[1:2], off
 ; GFX1132-DPP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
 ; GFX1132-DPP-NEXT:    v_dual_mov_b32 v31, v40 :: v_dual_mov_b32 v0, 8
-; GFX1132-DPP-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s44
+; GFX1132-DPP-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s52
 ; GFX1132-DPP-NEXT:    v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0
 ; GFX1132-DPP-NEXT:    v_mov_b32_e32 v7, 0
-; GFX1132-DPP-NEXT:    s_mov_b64 s[4:5], s[40:41]
+; GFX1132-DPP-NEXT:    s_mov_b64 s[4:5], s[48:49]
 ; GFX1132-DPP-NEXT:    s_mov_b64 s[6:7], s[38:39]
 ; GFX1132-DPP-NEXT:    s_mov_b64 s[10:11], s[36:37]
-; GFX1132-DPP-NEXT:    s_mov_b32 s12, s43
-; GFX1132-DPP-NEXT:    s_mov_b32 s13, s42
+; GFX1132-DPP-NEXT:    s_mov_b32 s12, s51
+; GFX1132-DPP-NEXT:    s_mov_b32 s13, s50
 ; GFX1132-DPP-NEXT:    s_mov_b32 s14, s33
 ; GFX1132-DPP-NEXT:    scratch_store_b64 off, v[3:4], off offset:8
-; GFX1132-DPP-NEXT:    v_dual_mov_b32 v3, s45 :: v_dual_mov_b32 v4, 0
+; GFX1132-DPP-NEXT:    v_dual_mov_b32 v3, s53 :: v_dual_mov_b32 v4, 0
 ; GFX1132-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132-DPP-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX1132-DPP-NEXT:    scratch_load_b64 v[1:2], off, off
 ; GFX1132-DPP-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
 ; GFX1132-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX1132-DPP-NEXT:    s_or_b32 s46, vcc_lo, s46
-; GFX1132-DPP-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s46
+; GFX1132-DPP-NEXT:    s_or_b32 s54, vcc_lo, s54
+; GFX1132-DPP-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s54
 ; GFX1132-DPP-NEXT:    s_cbranch_execnz .LBB17_2
 ; GFX1132-DPP-NEXT:  .LBB17_3:
 ; GFX1132-DPP-NEXT:    s_set_inst_prefetch_distance 0x2

diff  --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll
index a4410bb9ed2d0..e1ba4a2b0bf2a 100644
--- a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll
+++ b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll
@@ -635,12 +635,12 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_agent_scope_
 ; GFX7LESS-DPP-LABEL: global_atomic_fmax_uni_address_div_value_agent_scope_unsafe:
 ; GFX7LESS-DPP:       ; %bb.0:
 ; GFX7LESS-DPP-NEXT:    s_mov_b32 s32, 0
-; GFX7LESS-DPP-NEXT:    s_mov_b32 s40, SCRATCH_RSRC_DWORD0
-; GFX7LESS-DPP-NEXT:    s_mov_b32 s41, SCRATCH_RSRC_DWORD1
-; GFX7LESS-DPP-NEXT:    s_mov_b32 s42, -1
-; GFX7LESS-DPP-NEXT:    s_mov_b32 s43, 0xe8f000
-; GFX7LESS-DPP-NEXT:    s_add_u32 s40, s40, s11
-; GFX7LESS-DPP-NEXT:    s_addc_u32 s41, s41, 0
+; GFX7LESS-DPP-NEXT:    s_mov_b32 s48, SCRATCH_RSRC_DWORD0
+; GFX7LESS-DPP-NEXT:    s_mov_b32 s49, SCRATCH_RSRC_DWORD1
+; GFX7LESS-DPP-NEXT:    s_mov_b32 s50, -1
+; GFX7LESS-DPP-NEXT:    s_mov_b32 s51, 0xe8f000
+; GFX7LESS-DPP-NEXT:    s_add_u32 s48, s48, s11
+; GFX7LESS-DPP-NEXT:    s_addc_u32 s49, s49, 0
 ; GFX7LESS-DPP-NEXT:    s_mov_b32 s14, s10
 ; GFX7LESS-DPP-NEXT:    s_mov_b32 s13, s9
 ; GFX7LESS-DPP-NEXT:    s_mov_b32 s12, s8
@@ -660,8 +660,8 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_agent_scope_
 ; GFX7LESS-DPP-NEXT:    v_or_b32_e32 v31, v0, v2
 ; GFX7LESS-DPP-NEXT:    s_mov_b64 s[4:5], s[0:1]
 ; GFX7LESS-DPP-NEXT:    s_mov_b64 s[6:7], s[2:3]
-; GFX7LESS-DPP-NEXT:    s_mov_b64 s[0:1], s[40:41]
-; GFX7LESS-DPP-NEXT:    s_mov_b64 s[2:3], s[42:43]
+; GFX7LESS-DPP-NEXT:    s_mov_b64 s[0:1], s[48:49]
+; GFX7LESS-DPP-NEXT:    s_mov_b64 s[2:3], s[50:51]
 ; GFX7LESS-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7LESS-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GFX7LESS-DPP-NEXT:    buffer_load_dword v1, off, s[36:39], 0
@@ -1674,12 +1674,12 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_one_as_scope
 ; GFX7LESS-DPP-LABEL: global_atomic_fmax_uni_address_div_value_one_as_scope_unsafe:
 ; GFX7LESS-DPP:       ; %bb.0:
 ; GFX7LESS-DPP-NEXT:    s_mov_b32 s32, 0
-; GFX7LESS-DPP-NEXT:    s_mov_b32 s40, SCRATCH_RSRC_DWORD0
-; GFX7LESS-DPP-NEXT:    s_mov_b32 s41, SCRATCH_RSRC_DWORD1
-; GFX7LESS-DPP-NEXT:    s_mov_b32 s42, -1
-; GFX7LESS-DPP-NEXT:    s_mov_b32 s43, 0xe8f000
-; GFX7LESS-DPP-NEXT:    s_add_u32 s40, s40, s11
-; GFX7LESS-DPP-NEXT:    s_addc_u32 s41, s41, 0
+; GFX7LESS-DPP-NEXT:    s_mov_b32 s48, SCRATCH_RSRC_DWORD0
+; GFX7LESS-DPP-NEXT:    s_mov_b32 s49, SCRATCH_RSRC_DWORD1
+; GFX7LESS-DPP-NEXT:    s_mov_b32 s50, -1
+; GFX7LESS-DPP-NEXT:    s_mov_b32 s51, 0xe8f000
+; GFX7LESS-DPP-NEXT:    s_add_u32 s48, s48, s11
+; GFX7LESS-DPP-NEXT:    s_addc_u32 s49, s49, 0
 ; GFX7LESS-DPP-NEXT:    s_mov_b32 s14, s10
 ; GFX7LESS-DPP-NEXT:    s_mov_b32 s13, s9
 ; GFX7LESS-DPP-NEXT:    s_mov_b32 s12, s8
@@ -1699,8 +1699,8 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_one_as_scope
 ; GFX7LESS-DPP-NEXT:    v_or_b32_e32 v31, v0, v2
 ; GFX7LESS-DPP-NEXT:    s_mov_b64 s[4:5], s[0:1]
 ; GFX7LESS-DPP-NEXT:    s_mov_b64 s[6:7], s[2:3]
-; GFX7LESS-DPP-NEXT:    s_mov_b64 s[0:1], s[40:41]
-; GFX7LESS-DPP-NEXT:    s_mov_b64 s[2:3], s[42:43]
+; GFX7LESS-DPP-NEXT:    s_mov_b64 s[0:1], s[48:49]
+; GFX7LESS-DPP-NEXT:    s_mov_b64 s[2:3], s[50:51]
 ; GFX7LESS-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7LESS-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GFX7LESS-DPP-NEXT:    buffer_load_dword v1, off, s[36:39], 0
@@ -2713,12 +2713,12 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_default_scop
 ; GFX7LESS-DPP-LABEL: global_atomic_fmax_uni_address_div_value_default_scope_unsafe:
 ; GFX7LESS-DPP:       ; %bb.0:
 ; GFX7LESS-DPP-NEXT:    s_mov_b32 s32, 0
-; GFX7LESS-DPP-NEXT:    s_mov_b32 s40, SCRATCH_RSRC_DWORD0
-; GFX7LESS-DPP-NEXT:    s_mov_b32 s41, SCRATCH_RSRC_DWORD1
-; GFX7LESS-DPP-NEXT:    s_mov_b32 s42, -1
-; GFX7LESS-DPP-NEXT:    s_mov_b32 s43, 0xe8f000
-; GFX7LESS-DPP-NEXT:    s_add_u32 s40, s40, s11
-; GFX7LESS-DPP-NEXT:    s_addc_u32 s41, s41, 0
+; GFX7LESS-DPP-NEXT:    s_mov_b32 s48, SCRATCH_RSRC_DWORD0
+; GFX7LESS-DPP-NEXT:    s_mov_b32 s49, SCRATCH_RSRC_DWORD1
+; GFX7LESS-DPP-NEXT:    s_mov_b32 s50, -1
+; GFX7LESS-DPP-NEXT:    s_mov_b32 s51, 0xe8f000
+; GFX7LESS-DPP-NEXT:    s_add_u32 s48, s48, s11
+; GFX7LESS-DPP-NEXT:    s_addc_u32 s49, s49, 0
 ; GFX7LESS-DPP-NEXT:    s_mov_b32 s14, s10
 ; GFX7LESS-DPP-NEXT:    s_mov_b32 s13, s9
 ; GFX7LESS-DPP-NEXT:    s_mov_b32 s12, s8
@@ -2738,8 +2738,8 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_default_scop
 ; GFX7LESS-DPP-NEXT:    v_or_b32_e32 v31, v0, v2
 ; GFX7LESS-DPP-NEXT:    s_mov_b64 s[4:5], s[0:1]
 ; GFX7LESS-DPP-NEXT:    s_mov_b64 s[6:7], s[2:3]
-; GFX7LESS-DPP-NEXT:    s_mov_b64 s[0:1], s[40:41]
-; GFX7LESS-DPP-NEXT:    s_mov_b64 s[2:3], s[42:43]
+; GFX7LESS-DPP-NEXT:    s_mov_b64 s[0:1], s[48:49]
+; GFX7LESS-DPP-NEXT:    s_mov_b64 s[2:3], s[50:51]
 ; GFX7LESS-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7LESS-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GFX7LESS-DPP-NEXT:    buffer_load_dword v1, off, s[36:39], 0
@@ -3135,13 +3135,13 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_agent
 ; GFX7LESS-LABEL: global_atomic_fmax_double_uni_address_uni_value_agent_scope_unsafe:
 ; GFX7LESS:       ; %bb.0:
 ; GFX7LESS-NEXT:    s_movk_i32 s32, 0x800
-; GFX7LESS-NEXT:    s_mov_b32 s48, SCRATCH_RSRC_DWORD0
-; GFX7LESS-NEXT:    s_mov_b32 s49, SCRATCH_RSRC_DWORD1
-; GFX7LESS-NEXT:    s_mov_b32 s50, -1
-; GFX7LESS-NEXT:    s_mov_b32 s51, 0xe8f000
-; GFX7LESS-NEXT:    s_add_u32 s48, s48, s11
-; GFX7LESS-NEXT:    s_addc_u32 s49, s49, 0
-; GFX7LESS-NEXT:    s_mov_b64 s[40:41], s[0:1]
+; GFX7LESS-NEXT:    s_mov_b32 s64, SCRATCH_RSRC_DWORD0
+; GFX7LESS-NEXT:    s_mov_b32 s65, SCRATCH_RSRC_DWORD1
+; GFX7LESS-NEXT:    s_mov_b32 s66, -1
+; GFX7LESS-NEXT:    s_mov_b32 s67, 0xe8f000
+; GFX7LESS-NEXT:    s_add_u32 s64, s64, s11
+; GFX7LESS-NEXT:    s_addc_u32 s65, s65, 0
+; GFX7LESS-NEXT:    s_mov_b64 s[48:49], s[0:1]
 ; GFX7LESS-NEXT:    v_mbcnt_lo_u32_b32_e64 v3, exec_lo, 0
 ; GFX7LESS-NEXT:    v_mbcnt_hi_u32_b32_e32 v3, exec_hi, v3
 ; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
@@ -3149,15 +3149,15 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_agent
 ; GFX7LESS-NEXT:    s_cbranch_execz .LBB6_3
 ; GFX7LESS-NEXT:  ; %bb.1:
 ; GFX7LESS-NEXT:    s_mov_b32 s33, s10
-; GFX7LESS-NEXT:    s_mov_b32 s42, s9
-; GFX7LESS-NEXT:    s_mov_b32 s43, s8
+; GFX7LESS-NEXT:    s_mov_b32 s50, s9
+; GFX7LESS-NEXT:    s_mov_b32 s51, s8
 ; GFX7LESS-NEXT:    s_mov_b64 s[34:35], s[6:7]
 ; GFX7LESS-NEXT:    s_mov_b64 s[36:37], s[4:5]
 ; GFX7LESS-NEXT:    s_mov_b64 s[38:39], s[2:3]
-; GFX7LESS-NEXT:    s_load_dwordx2 s[44:45], s[4:5], 0x9
+; GFX7LESS-NEXT:    s_load_dwordx2 s[52:53], s[4:5], 0x9
 ; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7LESS-NEXT:    s_load_dwordx2 s[0:1], s[44:45], 0x0
-; GFX7LESS-NEXT:    s_mov_b64 s[46:47], 0
+; GFX7LESS-NEXT:    s_load_dwordx2 s[0:1], s[52:53], 0x0
+; GFX7LESS-NEXT:    s_mov_b64 s[54:55], 0
 ; GFX7LESS-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
 ; GFX7LESS-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
 ; GFX7LESS-NEXT:    v_or_b32_e32 v3, v0, v1
@@ -3169,8 +3169,8 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_agent
 ; GFX7LESS-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX7LESS-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7LESS-NEXT:    v_max_f64 v[2:3], v[0:1], v[0:1]
-; GFX7LESS-NEXT:    buffer_store_dword v1, off, s[48:51], 0 offset:4
-; GFX7LESS-NEXT:    buffer_store_dword v0, off, s[48:51], 0
+; GFX7LESS-NEXT:    buffer_store_dword v1, off, s[64:67], 0 offset:4
+; GFX7LESS-NEXT:    buffer_store_dword v0, off, s[64:67], 0
 ; GFX7LESS-NEXT:    s_add_u32 s8, s36, 44
 ; GFX7LESS-NEXT:    s_waitcnt expcnt(0)
 ; GFX7LESS-NEXT:    v_max_f64 v[0:1], v[2:3], 4.0
@@ -3178,8 +3178,8 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_agent
 ; GFX7LESS-NEXT:    s_getpc_b64 s[0:1]
 ; GFX7LESS-NEXT:    s_add_u32 s0, s0, __atomic_compare_exchange at gotpcrel32@lo+4
 ; GFX7LESS-NEXT:    s_addc_u32 s1, s1, __atomic_compare_exchange at gotpcrel32@hi+12
-; GFX7LESS-NEXT:    buffer_store_dword v1, off, s[48:51], 0 offset:12
-; GFX7LESS-NEXT:    buffer_store_dword v0, off, s[48:51], 0 offset:8
+; GFX7LESS-NEXT:    buffer_store_dword v1, off, s[64:67], 0 offset:12
+; GFX7LESS-NEXT:    buffer_store_dword v0, off, s[64:67], 0 offset:8
 ; GFX7LESS-NEXT:    s_load_dwordx2 s[16:17], s[0:1], 0x0
 ; GFX7LESS-NEXT:    s_waitcnt expcnt(0)
 ; GFX7LESS-NEXT:    v_mov_b32_e32 v0, 8
@@ -3188,59 +3188,59 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_agent
 ; GFX7LESS-NEXT:    v_mov_b32_e32 v5, 8
 ; GFX7LESS-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX7LESS-NEXT:    v_mov_b32_e32 v7, 0
-; GFX7LESS-NEXT:    s_mov_b64 s[4:5], s[40:41]
+; GFX7LESS-NEXT:    s_mov_b64 s[4:5], s[48:49]
 ; GFX7LESS-NEXT:    s_mov_b64 s[6:7], s[38:39]
 ; GFX7LESS-NEXT:    s_mov_b64 s[10:11], s[34:35]
-; GFX7LESS-NEXT:    s_mov_b32 s12, s43
-; GFX7LESS-NEXT:    s_mov_b32 s13, s42
+; GFX7LESS-NEXT:    s_mov_b32 s12, s51
+; GFX7LESS-NEXT:    s_mov_b32 s13, s50
 ; GFX7LESS-NEXT:    s_mov_b32 s14, s33
 ; GFX7LESS-NEXT:    v_mov_b32_e32 v31, v40
-; GFX7LESS-NEXT:    s_mov_b64 s[0:1], s[48:49]
-; GFX7LESS-NEXT:    s_mov_b64 s[2:3], s[50:51]
-; GFX7LESS-NEXT:    v_mov_b32_e32 v2, s44
-; GFX7LESS-NEXT:    v_mov_b32_e32 v3, s45
+; GFX7LESS-NEXT:    s_mov_b64 s[0:1], s[64:65]
+; GFX7LESS-NEXT:    s_mov_b64 s[2:3], s[66:67]
+; GFX7LESS-NEXT:    v_mov_b32_e32 v2, s52
+; GFX7LESS-NEXT:    v_mov_b32_e32 v3, s53
 ; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7LESS-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GFX7LESS-NEXT:    v_and_b32_e32 v2, 1, v0
-; GFX7LESS-NEXT:    buffer_load_dword v0, off, s[48:51], 0
-; GFX7LESS-NEXT:    buffer_load_dword v1, off, s[48:51], 0 offset:4
+; GFX7LESS-NEXT:    buffer_load_dword v0, off, s[64:67], 0
+; GFX7LESS-NEXT:    buffer_load_dword v1, off, s[64:67], 0 offset:4
 ; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v2
-; GFX7LESS-NEXT:    s_or_b64 s[46:47], vcc, s[46:47]
-; GFX7LESS-NEXT:    s_andn2_b64 exec, exec, s[46:47]
+; GFX7LESS-NEXT:    s_or_b64 s[54:55], vcc, s[54:55]
+; GFX7LESS-NEXT:    s_andn2_b64 exec, exec, s[54:55]
 ; GFX7LESS-NEXT:    s_cbranch_execnz .LBB6_2
 ; GFX7LESS-NEXT:  .LBB6_3:
 ; GFX7LESS-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: global_atomic_fmax_double_uni_address_uni_value_agent_scope_unsafe:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_mov_b32 s48, SCRATCH_RSRC_DWORD0
-; GFX9-NEXT:    s_mov_b32 s49, SCRATCH_RSRC_DWORD1
-; GFX9-NEXT:    s_mov_b32 s50, -1
-; GFX9-NEXT:    s_mov_b32 s51, 0xe00000
+; GFX9-NEXT:    s_mov_b32 s64, SCRATCH_RSRC_DWORD0
+; GFX9-NEXT:    s_mov_b32 s65, SCRATCH_RSRC_DWORD1
+; GFX9-NEXT:    s_mov_b32 s66, -1
+; GFX9-NEXT:    s_mov_b32 s67, 0xe00000
 ; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v3, exec_lo, 0
-; GFX9-NEXT:    s_add_u32 s48, s48, s11
+; GFX9-NEXT:    s_add_u32 s64, s64, s11
 ; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v3, exec_hi, v3
-; GFX9-NEXT:    s_addc_u32 s49, s49, 0
-; GFX9-NEXT:    s_mov_b64 s[40:41], s[0:1]
+; GFX9-NEXT:    s_addc_u32 s65, s65, 0
+; GFX9-NEXT:    s_mov_b64 s[48:49], s[0:1]
 ; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
 ; GFX9-NEXT:    s_movk_i32 s32, 0x800
 ; GFX9-NEXT:    s_and_saveexec_b64 s[0:1], vcc
 ; GFX9-NEXT:    s_cbranch_execz .LBB6_3
 ; GFX9-NEXT:  ; %bb.1:
-; GFX9-NEXT:    s_load_dwordx2 s[44:45], s[4:5], 0x24
+; GFX9-NEXT:    s_load_dwordx2 s[52:53], s[4:5], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v3, 20, v2
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 10, v1
 ; GFX9-NEXT:    s_mov_b32 s33, s10
-; GFX9-NEXT:    s_mov_b32 s42, s9
+; GFX9-NEXT:    s_mov_b32 s50, s9
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[44:45], 0x0
-; GFX9-NEXT:    s_mov_b32 s43, s8
+; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[52:53], 0x0
+; GFX9-NEXT:    s_mov_b32 s51, s8
 ; GFX9-NEXT:    s_mov_b64 s[34:35], s[6:7]
 ; GFX9-NEXT:    s_mov_b64 s[36:37], s[4:5]
 ; GFX9-NEXT:    s_mov_b64 s[38:39], s[2:3]
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v2, s1
-; GFX9-NEXT:    s_mov_b64 s[46:47], 0
+; GFX9-NEXT:    s_mov_b64 s[54:55], 0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX9-NEXT:    v_or3_b32 v40, v0, v4, v3
 ; GFX9-NEXT:  .LBB6_2: ; %atomicrmw.start
@@ -3253,36 +3253,36 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_agent
 ; GFX9-NEXT:    s_add_u32 s0, s0, __atomic_compare_exchange at gotpcrel32@lo+4
 ; GFX9-NEXT:    s_addc_u32 s1, s1, __atomic_compare_exchange at gotpcrel32@hi+12
 ; GFX9-NEXT:    s_load_dwordx2 s[16:17], s[0:1], 0x0
-; GFX9-NEXT:    s_mov_b64 s[0:1], s[48:49]
-; GFX9-NEXT:    buffer_store_dword v2, off, s[48:51], 0 offset:4
-; GFX9-NEXT:    buffer_store_dword v1, off, s[48:51], 0
-; GFX9-NEXT:    s_mov_b64 s[4:5], s[40:41]
+; GFX9-NEXT:    s_mov_b64 s[0:1], s[64:65]
+; GFX9-NEXT:    buffer_store_dword v2, off, s[64:67], 0 offset:4
+; GFX9-NEXT:    buffer_store_dword v1, off, s[64:67], 0
+; GFX9-NEXT:    s_mov_b64 s[4:5], s[48:49]
 ; GFX9-NEXT:    v_max_f64 v[3:4], v[3:4], 4.0
 ; GFX9-NEXT:    s_mov_b64 s[6:7], s[38:39]
 ; GFX9-NEXT:    s_mov_b64 s[10:11], s[34:35]
-; GFX9-NEXT:    s_mov_b32 s12, s43
-; GFX9-NEXT:    s_mov_b32 s13, s42
+; GFX9-NEXT:    s_mov_b32 s12, s51
+; GFX9-NEXT:    s_mov_b32 s13, s50
 ; GFX9-NEXT:    s_mov_b32 s14, s33
 ; GFX9-NEXT:    v_mov_b32_e32 v31, v40
-; GFX9-NEXT:    s_mov_b64 s[2:3], s[50:51]
-; GFX9-NEXT:    buffer_store_dword v4, off, s[48:51], 0 offset:12
-; GFX9-NEXT:    buffer_store_dword v3, off, s[48:51], 0 offset:8
+; GFX9-NEXT:    s_mov_b64 s[2:3], s[66:67]
+; GFX9-NEXT:    buffer_store_dword v4, off, s[64:67], 0 offset:12
+; GFX9-NEXT:    buffer_store_dword v3, off, s[64:67], 0 offset:8
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 8
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0
-; GFX9-NEXT:    v_mov_b32_e32 v2, s44
-; GFX9-NEXT:    v_mov_b32_e32 v3, s45
+; GFX9-NEXT:    v_mov_b32_e32 v2, s52
+; GFX9-NEXT:    v_mov_b32_e32 v3, s53
 ; GFX9-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v5, 8
 ; GFX9-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v7, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[16:17]
-; GFX9-NEXT:    buffer_load_dword v1, off, s[48:51], 0
-; GFX9-NEXT:    buffer_load_dword v2, off, s[48:51], 0 offset:4
+; GFX9-NEXT:    buffer_load_dword v1, off, s[64:67], 0
+; GFX9-NEXT:    buffer_load_dword v2, off, s[64:67], 0 offset:4
 ; GFX9-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX9-NEXT:    s_or_b64 s[46:47], vcc, s[46:47]
-; GFX9-NEXT:    s_andn2_b64 exec, exec, s[46:47]
+; GFX9-NEXT:    s_or_b64 s[54:55], vcc, s[54:55]
+; GFX9-NEXT:    s_andn2_b64 exec, exec, s[54:55]
 ; GFX9-NEXT:    s_cbranch_execnz .LBB6_2
 ; GFX9-NEXT:  .LBB6_3:
 ; GFX9-NEXT:    s_endpgm
@@ -3290,32 +3290,32 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_agent
 ; GFX1064-LABEL: global_atomic_fmax_double_uni_address_uni_value_agent_scope_unsafe:
 ; GFX1064:       ; %bb.0:
 ; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v3, exec_lo, 0
-; GFX1064-NEXT:    s_mov_b32 s48, SCRATCH_RSRC_DWORD0
-; GFX1064-NEXT:    s_mov_b32 s49, SCRATCH_RSRC_DWORD1
-; GFX1064-NEXT:    s_mov_b32 s50, -1
-; GFX1064-NEXT:    s_mov_b32 s51, 0x31e16000
+; GFX1064-NEXT:    s_mov_b32 s64, SCRATCH_RSRC_DWORD0
+; GFX1064-NEXT:    s_mov_b32 s65, SCRATCH_RSRC_DWORD1
+; GFX1064-NEXT:    s_mov_b32 s66, -1
+; GFX1064-NEXT:    s_mov_b32 s67, 0x31e16000
 ; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v3, exec_hi, v3
-; GFX1064-NEXT:    s_add_u32 s48, s48, s11
-; GFX1064-NEXT:    s_addc_u32 s49, s49, 0
-; GFX1064-NEXT:    s_mov_b64 s[40:41], s[0:1]
+; GFX1064-NEXT:    s_add_u32 s64, s64, s11
+; GFX1064-NEXT:    s_addc_u32 s65, s65, 0
+; GFX1064-NEXT:    s_mov_b64 s[48:49], s[0:1]
 ; GFX1064-NEXT:    s_movk_i32 s32, 0x800
 ; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
 ; GFX1064-NEXT:    s_and_saveexec_b64 s[0:1], vcc
 ; GFX1064-NEXT:    s_cbranch_execz .LBB6_3
 ; GFX1064-NEXT:  ; %bb.1:
-; GFX1064-NEXT:    s_load_dwordx2 s[44:45], s[4:5], 0x24
+; GFX1064-NEXT:    s_load_dwordx2 s[52:53], s[4:5], 0x24
 ; GFX1064-NEXT:    v_lshlrev_b32_e32 v3, 20, v2
 ; GFX1064-NEXT:    v_lshlrev_b32_e32 v4, 10, v1
 ; GFX1064-NEXT:    s_mov_b32 s33, s10
-; GFX1064-NEXT:    s_mov_b32 s42, s9
-; GFX1064-NEXT:    s_mov_b32 s43, s8
+; GFX1064-NEXT:    s_mov_b32 s50, s9
+; GFX1064-NEXT:    s_mov_b32 s51, s8
 ; GFX1064-NEXT:    s_mov_b64 s[34:35], s[6:7]
 ; GFX1064-NEXT:    v_or3_b32 v40, v0, v4, v3
 ; GFX1064-NEXT:    s_mov_b64 s[36:37], s[4:5]
 ; GFX1064-NEXT:    s_mov_b64 s[38:39], s[2:3]
-; GFX1064-NEXT:    s_mov_b64 s[46:47], 0
+; GFX1064-NEXT:    s_mov_b64 s[54:55], 0
 ; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[44:45], 0x0
+; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[52:53], 0x0
 ; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1064-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX1064-NEXT:    v_mov_b32_e32 v1, s0
@@ -3328,38 +3328,38 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_agent
 ; GFX1064-NEXT:    s_getpc_b64 s[0:1]
 ; GFX1064-NEXT:    s_add_u32 s0, s0, __atomic_compare_exchange at gotpcrel32@lo+4
 ; GFX1064-NEXT:    s_addc_u32 s1, s1, __atomic_compare_exchange at gotpcrel32@hi+12
-; GFX1064-NEXT:    buffer_store_dword v2, off, s[48:51], 0 offset:4
-; GFX1064-NEXT:    buffer_store_dword v1, off, s[48:51], 0
+; GFX1064-NEXT:    buffer_store_dword v2, off, s[64:67], 0 offset:4
+; GFX1064-NEXT:    buffer_store_dword v1, off, s[64:67], 0
 ; GFX1064-NEXT:    s_load_dwordx2 s[16:17], s[0:1], 0x0
 ; GFX1064-NEXT:    v_mov_b32_e32 v31, v40
 ; GFX1064-NEXT:    v_mov_b32_e32 v0, 8
 ; GFX1064-NEXT:    v_mov_b32_e32 v1, 0
-; GFX1064-NEXT:    v_mov_b32_e32 v2, s44
+; GFX1064-NEXT:    v_mov_b32_e32 v2, s52
 ; GFX1064-NEXT:    v_mov_b32_e32 v5, 8
 ; GFX1064-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX1064-NEXT:    v_mov_b32_e32 v7, 0
-; GFX1064-NEXT:    s_mov_b64 s[0:1], s[48:49]
-; GFX1064-NEXT:    s_mov_b64 s[4:5], s[40:41]
+; GFX1064-NEXT:    s_mov_b64 s[0:1], s[64:65]
+; GFX1064-NEXT:    s_mov_b64 s[4:5], s[48:49]
 ; GFX1064-NEXT:    s_mov_b64 s[6:7], s[38:39]
 ; GFX1064-NEXT:    s_mov_b64 s[10:11], s[34:35]
-; GFX1064-NEXT:    s_mov_b32 s12, s43
-; GFX1064-NEXT:    s_mov_b32 s13, s42
+; GFX1064-NEXT:    s_mov_b32 s12, s51
+; GFX1064-NEXT:    s_mov_b32 s13, s50
 ; GFX1064-NEXT:    s_mov_b32 s14, s33
-; GFX1064-NEXT:    s_mov_b64 s[2:3], s[50:51]
+; GFX1064-NEXT:    s_mov_b64 s[2:3], s[66:67]
 ; GFX1064-NEXT:    v_max_f64 v[3:4], v[3:4], 4.0
-; GFX1064-NEXT:    buffer_store_dword v4, off, s[48:51], 0 offset:12
-; GFX1064-NEXT:    buffer_store_dword v3, off, s[48:51], 0 offset:8
-; GFX1064-NEXT:    v_mov_b32_e32 v3, s45
+; GFX1064-NEXT:    buffer_store_dword v4, off, s[64:67], 0 offset:12
+; GFX1064-NEXT:    buffer_store_dword v3, off, s[64:67], 0 offset:8
+; GFX1064-NEXT:    v_mov_b32_e32 v3, s53
 ; GFX1064-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1064-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GFX1064-NEXT:    s_clause 0x1
-; GFX1064-NEXT:    buffer_load_dword v1, off, s[48:51], 0
-; GFX1064-NEXT:    buffer_load_dword v2, off, s[48:51], 0 offset:4
+; GFX1064-NEXT:    buffer_load_dword v1, off, s[64:67], 0
+; GFX1064-NEXT:    buffer_load_dword v2, off, s[64:67], 0 offset:4
 ; GFX1064-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX1064-NEXT:    s_or_b64 s[46:47], vcc, s[46:47]
-; GFX1064-NEXT:    s_andn2_b64 exec, exec, s[46:47]
+; GFX1064-NEXT:    s_or_b64 s[54:55], vcc, s[54:55]
+; GFX1064-NEXT:    s_andn2_b64 exec, exec, s[54:55]
 ; GFX1064-NEXT:    s_cbranch_execnz .LBB6_2
 ; GFX1064-NEXT:  .LBB6_3:
 ; GFX1064-NEXT:    s_endpgm
@@ -3367,31 +3367,31 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_agent
 ; GFX1032-LABEL: global_atomic_fmax_double_uni_address_uni_value_agent_scope_unsafe:
 ; GFX1032:       ; %bb.0:
 ; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v3, exec_lo, 0
-; GFX1032-NEXT:    s_mov_b32 s48, SCRATCH_RSRC_DWORD0
-; GFX1032-NEXT:    s_mov_b32 s49, SCRATCH_RSRC_DWORD1
-; GFX1032-NEXT:    s_mov_b32 s50, -1
-; GFX1032-NEXT:    s_mov_b32 s51, 0x31c16000
+; GFX1032-NEXT:    s_mov_b32 s64, SCRATCH_RSRC_DWORD0
+; GFX1032-NEXT:    s_mov_b32 s65, SCRATCH_RSRC_DWORD1
+; GFX1032-NEXT:    s_mov_b32 s66, -1
+; GFX1032-NEXT:    s_mov_b32 s67, 0x31c16000
 ; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v3
-; GFX1032-NEXT:    s_add_u32 s48, s48, s11
-; GFX1032-NEXT:    s_addc_u32 s49, s49, 0
-; GFX1032-NEXT:    s_mov_b64 s[40:41], s[0:1]
-; GFX1032-NEXT:    s_mov_b32 s46, 0
+; GFX1032-NEXT:    s_add_u32 s64, s64, s11
+; GFX1032-NEXT:    s_addc_u32 s65, s65, 0
+; GFX1032-NEXT:    s_mov_b64 s[48:49], s[0:1]
+; GFX1032-NEXT:    s_mov_b32 s54, 0
 ; GFX1032-NEXT:    s_movk_i32 s32, 0x400
 ; GFX1032-NEXT:    s_and_saveexec_b32 s0, vcc_lo
 ; GFX1032-NEXT:    s_cbranch_execz .LBB6_3
 ; GFX1032-NEXT:  ; %bb.1:
-; GFX1032-NEXT:    s_load_dwordx2 s[44:45], s[4:5], 0x24
+; GFX1032-NEXT:    s_load_dwordx2 s[52:53], s[4:5], 0x24
 ; GFX1032-NEXT:    v_lshlrev_b32_e32 v3, 20, v2
 ; GFX1032-NEXT:    v_lshlrev_b32_e32 v4, 10, v1
 ; GFX1032-NEXT:    s_mov_b32 s33, s10
-; GFX1032-NEXT:    s_mov_b32 s42, s9
-; GFX1032-NEXT:    s_mov_b32 s43, s8
+; GFX1032-NEXT:    s_mov_b32 s50, s9
+; GFX1032-NEXT:    s_mov_b32 s51, s8
 ; GFX1032-NEXT:    s_mov_b64 s[34:35], s[6:7]
 ; GFX1032-NEXT:    v_or3_b32 v40, v0, v4, v3
 ; GFX1032-NEXT:    s_mov_b64 s[36:37], s[4:5]
 ; GFX1032-NEXT:    s_mov_b64 s[38:39], s[2:3]
 ; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[44:45], 0x0
+; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[52:53], 0x0
 ; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1032-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX1032-NEXT:    v_mov_b32_e32 v1, s0
@@ -3404,38 +3404,38 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_agent
 ; GFX1032-NEXT:    s_getpc_b64 s[0:1]
 ; GFX1032-NEXT:    s_add_u32 s0, s0, __atomic_compare_exchange at gotpcrel32@lo+4
 ; GFX1032-NEXT:    s_addc_u32 s1, s1, __atomic_compare_exchange at gotpcrel32@hi+12
-; GFX1032-NEXT:    buffer_store_dword v2, off, s[48:51], 0 offset:4
-; GFX1032-NEXT:    buffer_store_dword v1, off, s[48:51], 0
+; GFX1032-NEXT:    buffer_store_dword v2, off, s[64:67], 0 offset:4
+; GFX1032-NEXT:    buffer_store_dword v1, off, s[64:67], 0
 ; GFX1032-NEXT:    s_load_dwordx2 s[16:17], s[0:1], 0x0
 ; GFX1032-NEXT:    v_mov_b32_e32 v31, v40
 ; GFX1032-NEXT:    v_mov_b32_e32 v0, 8
 ; GFX1032-NEXT:    v_mov_b32_e32 v1, 0
-; GFX1032-NEXT:    v_mov_b32_e32 v2, s44
+; GFX1032-NEXT:    v_mov_b32_e32 v2, s52
 ; GFX1032-NEXT:    v_mov_b32_e32 v5, 8
 ; GFX1032-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX1032-NEXT:    v_mov_b32_e32 v7, 0
-; GFX1032-NEXT:    s_mov_b64 s[0:1], s[48:49]
-; GFX1032-NEXT:    s_mov_b64 s[4:5], s[40:41]
+; GFX1032-NEXT:    s_mov_b64 s[0:1], s[64:65]
+; GFX1032-NEXT:    s_mov_b64 s[4:5], s[48:49]
 ; GFX1032-NEXT:    s_mov_b64 s[6:7], s[38:39]
 ; GFX1032-NEXT:    s_mov_b64 s[10:11], s[34:35]
-; GFX1032-NEXT:    s_mov_b32 s12, s43
-; GFX1032-NEXT:    s_mov_b32 s13, s42
+; GFX1032-NEXT:    s_mov_b32 s12, s51
+; GFX1032-NEXT:    s_mov_b32 s13, s50
 ; GFX1032-NEXT:    s_mov_b32 s14, s33
-; GFX1032-NEXT:    s_mov_b64 s[2:3], s[50:51]
+; GFX1032-NEXT:    s_mov_b64 s[2:3], s[66:67]
 ; GFX1032-NEXT:    v_max_f64 v[3:4], v[3:4], 4.0
-; GFX1032-NEXT:    buffer_store_dword v4, off, s[48:51], 0 offset:12
-; GFX1032-NEXT:    buffer_store_dword v3, off, s[48:51], 0 offset:8
-; GFX1032-NEXT:    v_mov_b32_e32 v3, s45
+; GFX1032-NEXT:    buffer_store_dword v4, off, s[64:67], 0 offset:12
+; GFX1032-NEXT:    buffer_store_dword v3, off, s[64:67], 0 offset:8
+; GFX1032-NEXT:    v_mov_b32_e32 v3, s53
 ; GFX1032-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1032-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GFX1032-NEXT:    s_clause 0x1
-; GFX1032-NEXT:    buffer_load_dword v1, off, s[48:51], 0
-; GFX1032-NEXT:    buffer_load_dword v2, off, s[48:51], 0 offset:4
+; GFX1032-NEXT:    buffer_load_dword v1, off, s[64:67], 0
+; GFX1032-NEXT:    buffer_load_dword v2, off, s[64:67], 0 offset:4
 ; GFX1032-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX1032-NEXT:    s_or_b32 s46, vcc_lo, s46
-; GFX1032-NEXT:    s_andn2_b32 exec_lo, exec_lo, s46
+; GFX1032-NEXT:    s_or_b32 s54, vcc_lo, s54
+; GFX1032-NEXT:    s_andn2_b32 exec_lo, exec_lo, s54
 ; GFX1032-NEXT:    s_cbranch_execnz .LBB6_2
 ; GFX1032-NEXT:  .LBB6_3:
 ; GFX1032-NEXT:    s_endpgm
@@ -3444,7 +3444,7 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_agent
 ; GFX1164:       ; %bb.0:
 ; GFX1164-NEXT:    v_mov_b32_e32 v40, v0
 ; GFX1164-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1164-NEXT:    s_mov_b64 s[40:41], s[0:1]
+; GFX1164-NEXT:    s_mov_b64 s[48:49], s[0:1]
 ; GFX1164-NEXT:    s_mov_b32 s32, 32
 ; GFX1164-NEXT:    s_mov_b64 s[0:1], exec
 ; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
@@ -3452,16 +3452,16 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_agent
 ; GFX1164-NEXT:    v_cmpx_eq_u32_e32 0, v0
 ; GFX1164-NEXT:    s_cbranch_execz .LBB6_3
 ; GFX1164-NEXT:  ; %bb.1:
-; GFX1164-NEXT:    s_load_b64 s[44:45], s[4:5], 0x24
+; GFX1164-NEXT:    s_load_b64 s[52:53], s[4:5], 0x24
 ; GFX1164-NEXT:    s_mov_b32 s33, s10
-; GFX1164-NEXT:    s_mov_b32 s42, s9
-; GFX1164-NEXT:    s_mov_b32 s43, s8
+; GFX1164-NEXT:    s_mov_b32 s50, s9
+; GFX1164-NEXT:    s_mov_b32 s51, s8
 ; GFX1164-NEXT:    s_mov_b64 s[34:35], s[6:7]
 ; GFX1164-NEXT:    s_mov_b64 s[36:37], s[4:5]
 ; GFX1164-NEXT:    s_mov_b64 s[38:39], s[2:3]
-; GFX1164-NEXT:    s_mov_b64 s[46:47], 0
+; GFX1164-NEXT:    s_mov_b64 s[54:55], 0
 ; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT:    s_load_b64 s[0:1], s[44:45], 0x0
+; GFX1164-NEXT:    s_load_b64 s[0:1], s[52:53], 0x0
 ; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1164-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX1164-NEXT:    v_mov_b32_e32 v1, s0
@@ -3483,18 +3483,18 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_agent
 ; GFX1164-NEXT:    v_mov_b32_e32 v5, 8
 ; GFX1164-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX1164-NEXT:    v_mov_b32_e32 v7, 0
-; GFX1164-NEXT:    s_mov_b64 s[4:5], s[40:41]
+; GFX1164-NEXT:    s_mov_b64 s[4:5], s[48:49]
 ; GFX1164-NEXT:    s_mov_b64 s[6:7], s[38:39]
 ; GFX1164-NEXT:    s_mov_b64 s[10:11], s[34:35]
-; GFX1164-NEXT:    s_mov_b32 s12, s43
-; GFX1164-NEXT:    s_mov_b32 s13, s42
+; GFX1164-NEXT:    s_mov_b32 s12, s51
+; GFX1164-NEXT:    s_mov_b32 s13, s50
 ; GFX1164-NEXT:    s_mov_b32 s14, s33
 ; GFX1164-NEXT:    v_max_f64 v[3:4], v[3:4], 4.0
 ; GFX1164-NEXT:    scratch_store_b64 off, v[1:2], off
 ; GFX1164-NEXT:    v_mov_b32_e32 v1, 0
-; GFX1164-NEXT:    v_mov_b32_e32 v2, s44
+; GFX1164-NEXT:    v_mov_b32_e32 v2, s52
 ; GFX1164-NEXT:    scratch_store_b64 off, v[3:4], off offset:8
-; GFX1164-NEXT:    v_mov_b32_e32 v3, s45
+; GFX1164-NEXT:    v_mov_b32_e32 v3, s53
 ; GFX1164-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1164-NEXT:    s_swappc_b64 s[30:31], s[0:1]
@@ -3502,8 +3502,8 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_agent
 ; GFX1164-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
 ; GFX1164-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX1164-NEXT:    s_or_b64 s[46:47], vcc, s[46:47]
-; GFX1164-NEXT:    s_and_not1_b64 exec, exec, s[46:47]
+; GFX1164-NEXT:    s_or_b64 s[54:55], vcc, s[54:55]
+; GFX1164-NEXT:    s_and_not1_b64 exec, exec, s[54:55]
 ; GFX1164-NEXT:    s_cbranch_execnz .LBB6_2
 ; GFX1164-NEXT:  .LBB6_3:
 ; GFX1164-NEXT:    s_set_inst_prefetch_distance 0x2
@@ -3513,22 +3513,22 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_agent
 ; GFX1132:       ; %bb.0:
 ; GFX1132-NEXT:    v_mov_b32_e32 v40, v0
 ; GFX1132-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1132-NEXT:    s_mov_b64 s[40:41], s[0:1]
-; GFX1132-NEXT:    s_mov_b32 s46, 0
+; GFX1132-NEXT:    s_mov_b64 s[48:49], s[0:1]
+; GFX1132-NEXT:    s_mov_b32 s54, 0
 ; GFX1132-NEXT:    s_mov_b32 s32, 32
 ; GFX1132-NEXT:    s_mov_b32 s0, exec_lo
 ; GFX1132-NEXT:    v_cmpx_eq_u32_e32 0, v0
 ; GFX1132-NEXT:    s_cbranch_execz .LBB6_3
 ; GFX1132-NEXT:  ; %bb.1:
-; GFX1132-NEXT:    s_load_b64 s[44:45], s[4:5], 0x24
+; GFX1132-NEXT:    s_load_b64 s[52:53], s[4:5], 0x24
 ; GFX1132-NEXT:    s_mov_b32 s33, s15
-; GFX1132-NEXT:    s_mov_b32 s42, s14
-; GFX1132-NEXT:    s_mov_b32 s43, s13
+; GFX1132-NEXT:    s_mov_b32 s50, s14
+; GFX1132-NEXT:    s_mov_b32 s51, s13
 ; GFX1132-NEXT:    s_mov_b64 s[34:35], s[6:7]
 ; GFX1132-NEXT:    s_mov_b64 s[36:37], s[4:5]
 ; GFX1132-NEXT:    s_mov_b64 s[38:39], s[2:3]
 ; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT:    s_load_b64 s[0:1], s[44:45], 0x0
+; GFX1132-NEXT:    s_load_b64 s[0:1], s[52:53], 0x0
 ; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132-NEXT:    v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v1, s0
 ; GFX1132-NEXT:    s_set_inst_prefetch_distance 0x1
@@ -3547,26 +3547,26 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_agent
 ; GFX1132-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
 ; GFX1132-NEXT:    v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0
 ; GFX1132-NEXT:    v_mov_b32_e32 v7, 0
-; GFX1132-NEXT:    s_mov_b64 s[4:5], s[40:41]
+; GFX1132-NEXT:    s_mov_b64 s[4:5], s[48:49]
 ; GFX1132-NEXT:    s_mov_b64 s[6:7], s[38:39]
 ; GFX1132-NEXT:    s_mov_b64 s[10:11], s[34:35]
-; GFX1132-NEXT:    s_mov_b32 s12, s43
-; GFX1132-NEXT:    s_mov_b32 s13, s42
+; GFX1132-NEXT:    s_mov_b32 s12, s51
+; GFX1132-NEXT:    s_mov_b32 s13, s50
 ; GFX1132-NEXT:    s_mov_b32 s14, s33
 ; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_4)
 ; GFX1132-NEXT:    v_max_f64 v[3:4], v[3:4], 4.0
 ; GFX1132-NEXT:    scratch_store_b64 off, v[1:2], off
-; GFX1132-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s44
+; GFX1132-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s52
 ; GFX1132-NEXT:    scratch_store_b64 off, v[3:4], off offset:8
-; GFX1132-NEXT:    v_dual_mov_b32 v3, s45 :: v_dual_mov_b32 v4, 0
+; GFX1132-NEXT:    v_dual_mov_b32 v3, s53 :: v_dual_mov_b32 v4, 0
 ; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX1132-NEXT:    scratch_load_b64 v[1:2], off, off
 ; GFX1132-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
 ; GFX1132-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX1132-NEXT:    s_or_b32 s46, vcc_lo, s46
-; GFX1132-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s46
+; GFX1132-NEXT:    s_or_b32 s54, vcc_lo, s54
+; GFX1132-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s54
 ; GFX1132-NEXT:    s_cbranch_execnz .LBB6_2
 ; GFX1132-NEXT:  .LBB6_3:
 ; GFX1132-NEXT:    s_set_inst_prefetch_distance 0x2
@@ -3575,13 +3575,13 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_agent
 ; GFX7LESS-DPP-LABEL: global_atomic_fmax_double_uni_address_uni_value_agent_scope_unsafe:
 ; GFX7LESS-DPP:       ; %bb.0:
 ; GFX7LESS-DPP-NEXT:    s_movk_i32 s32, 0x800
-; GFX7LESS-DPP-NEXT:    s_mov_b32 s48, SCRATCH_RSRC_DWORD0
-; GFX7LESS-DPP-NEXT:    s_mov_b32 s49, SCRATCH_RSRC_DWORD1
-; GFX7LESS-DPP-NEXT:    s_mov_b32 s50, -1
-; GFX7LESS-DPP-NEXT:    s_mov_b32 s51, 0xe8f000
-; GFX7LESS-DPP-NEXT:    s_add_u32 s48, s48, s11
-; GFX7LESS-DPP-NEXT:    s_addc_u32 s49, s49, 0
-; GFX7LESS-DPP-NEXT:    s_mov_b64 s[40:41], s[0:1]
+; GFX7LESS-DPP-NEXT:    s_mov_b32 s64, SCRATCH_RSRC_DWORD0
+; GFX7LESS-DPP-NEXT:    s_mov_b32 s65, SCRATCH_RSRC_DWORD1
+; GFX7LESS-DPP-NEXT:    s_mov_b32 s66, -1
+; GFX7LESS-DPP-NEXT:    s_mov_b32 s67, 0xe8f000
+; GFX7LESS-DPP-NEXT:    s_add_u32 s64, s64, s11
+; GFX7LESS-DPP-NEXT:    s_addc_u32 s65, s65, 0
+; GFX7LESS-DPP-NEXT:    s_mov_b64 s[48:49], s[0:1]
 ; GFX7LESS-DPP-NEXT:    v_mbcnt_lo_u32_b32_e64 v3, exec_lo, 0
 ; GFX7LESS-DPP-NEXT:    v_mbcnt_hi_u32_b32_e32 v3, exec_hi, v3
 ; GFX7LESS-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
@@ -3589,15 +3589,15 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_agent
 ; GFX7LESS-DPP-NEXT:    s_cbranch_execz .LBB6_3
 ; GFX7LESS-DPP-NEXT:  ; %bb.1:
 ; GFX7LESS-DPP-NEXT:    s_mov_b32 s33, s10
-; GFX7LESS-DPP-NEXT:    s_mov_b32 s42, s9
-; GFX7LESS-DPP-NEXT:    s_mov_b32 s43, s8
+; GFX7LESS-DPP-NEXT:    s_mov_b32 s50, s9
+; GFX7LESS-DPP-NEXT:    s_mov_b32 s51, s8
 ; GFX7LESS-DPP-NEXT:    s_mov_b64 s[34:35], s[6:7]
 ; GFX7LESS-DPP-NEXT:    s_mov_b64 s[36:37], s[4:5]
 ; GFX7LESS-DPP-NEXT:    s_mov_b64 s[38:39], s[2:3]
-; GFX7LESS-DPP-NEXT:    s_load_dwordx2 s[44:45], s[4:5], 0x9
+; GFX7LESS-DPP-NEXT:    s_load_dwordx2 s[52:53], s[4:5], 0x9
 ; GFX7LESS-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7LESS-DPP-NEXT:    s_load_dwordx2 s[0:1], s[44:45], 0x0
-; GFX7LESS-DPP-NEXT:    s_mov_b64 s[46:47], 0
+; GFX7LESS-DPP-NEXT:    s_load_dwordx2 s[0:1], s[52:53], 0x0
+; GFX7LESS-DPP-NEXT:    s_mov_b64 s[54:55], 0
 ; GFX7LESS-DPP-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
 ; GFX7LESS-DPP-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
 ; GFX7LESS-DPP-NEXT:    v_or_b32_e32 v3, v0, v1
@@ -3609,8 +3609,8 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_agent
 ; GFX7LESS-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX7LESS-DPP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7LESS-DPP-NEXT:    v_max_f64 v[2:3], v[0:1], v[0:1]
-; GFX7LESS-DPP-NEXT:    buffer_store_dword v1, off, s[48:51], 0 offset:4
-; GFX7LESS-DPP-NEXT:    buffer_store_dword v0, off, s[48:51], 0
+; GFX7LESS-DPP-NEXT:    buffer_store_dword v1, off, s[64:67], 0 offset:4
+; GFX7LESS-DPP-NEXT:    buffer_store_dword v0, off, s[64:67], 0
 ; GFX7LESS-DPP-NEXT:    s_add_u32 s8, s36, 44
 ; GFX7LESS-DPP-NEXT:    s_waitcnt expcnt(0)
 ; GFX7LESS-DPP-NEXT:    v_max_f64 v[0:1], v[2:3], 4.0
@@ -3618,8 +3618,8 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_agent
 ; GFX7LESS-DPP-NEXT:    s_getpc_b64 s[0:1]
 ; GFX7LESS-DPP-NEXT:    s_add_u32 s0, s0, __atomic_compare_exchange at gotpcrel32@lo+4
 ; GFX7LESS-DPP-NEXT:    s_addc_u32 s1, s1, __atomic_compare_exchange at gotpcrel32@hi+12
-; GFX7LESS-DPP-NEXT:    buffer_store_dword v1, off, s[48:51], 0 offset:12
-; GFX7LESS-DPP-NEXT:    buffer_store_dword v0, off, s[48:51], 0 offset:8
+; GFX7LESS-DPP-NEXT:    buffer_store_dword v1, off, s[64:67], 0 offset:12
+; GFX7LESS-DPP-NEXT:    buffer_store_dword v0, off, s[64:67], 0 offset:8
 ; GFX7LESS-DPP-NEXT:    s_load_dwordx2 s[16:17], s[0:1], 0x0
 ; GFX7LESS-DPP-NEXT:    s_waitcnt expcnt(0)
 ; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v0, 8
@@ -3628,59 +3628,59 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_agent
 ; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v5, 8
 ; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v7, 0
-; GFX7LESS-DPP-NEXT:    s_mov_b64 s[4:5], s[40:41]
+; GFX7LESS-DPP-NEXT:    s_mov_b64 s[4:5], s[48:49]
 ; GFX7LESS-DPP-NEXT:    s_mov_b64 s[6:7], s[38:39]
 ; GFX7LESS-DPP-NEXT:    s_mov_b64 s[10:11], s[34:35]
-; GFX7LESS-DPP-NEXT:    s_mov_b32 s12, s43
-; GFX7LESS-DPP-NEXT:    s_mov_b32 s13, s42
+; GFX7LESS-DPP-NEXT:    s_mov_b32 s12, s51
+; GFX7LESS-DPP-NEXT:    s_mov_b32 s13, s50
 ; GFX7LESS-DPP-NEXT:    s_mov_b32 s14, s33
 ; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v31, v40
-; GFX7LESS-DPP-NEXT:    s_mov_b64 s[0:1], s[48:49]
-; GFX7LESS-DPP-NEXT:    s_mov_b64 s[2:3], s[50:51]
-; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v2, s44
-; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v3, s45
+; GFX7LESS-DPP-NEXT:    s_mov_b64 s[0:1], s[64:65]
+; GFX7LESS-DPP-NEXT:    s_mov_b64 s[2:3], s[66:67]
+; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v2, s52
+; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v3, s53
 ; GFX7LESS-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7LESS-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GFX7LESS-DPP-NEXT:    v_and_b32_e32 v2, 1, v0
-; GFX7LESS-DPP-NEXT:    buffer_load_dword v0, off, s[48:51], 0
-; GFX7LESS-DPP-NEXT:    buffer_load_dword v1, off, s[48:51], 0 offset:4
+; GFX7LESS-DPP-NEXT:    buffer_load_dword v0, off, s[64:67], 0
+; GFX7LESS-DPP-NEXT:    buffer_load_dword v1, off, s[64:67], 0 offset:4
 ; GFX7LESS-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v2
-; GFX7LESS-DPP-NEXT:    s_or_b64 s[46:47], vcc, s[46:47]
-; GFX7LESS-DPP-NEXT:    s_andn2_b64 exec, exec, s[46:47]
+; GFX7LESS-DPP-NEXT:    s_or_b64 s[54:55], vcc, s[54:55]
+; GFX7LESS-DPP-NEXT:    s_andn2_b64 exec, exec, s[54:55]
 ; GFX7LESS-DPP-NEXT:    s_cbranch_execnz .LBB6_2
 ; GFX7LESS-DPP-NEXT:  .LBB6_3:
 ; GFX7LESS-DPP-NEXT:    s_endpgm
 ;
 ; GFX9-DPP-LABEL: global_atomic_fmax_double_uni_address_uni_value_agent_scope_unsafe:
 ; GFX9-DPP:       ; %bb.0:
-; GFX9-DPP-NEXT:    s_mov_b32 s48, SCRATCH_RSRC_DWORD0
-; GFX9-DPP-NEXT:    s_mov_b32 s49, SCRATCH_RSRC_DWORD1
-; GFX9-DPP-NEXT:    s_mov_b32 s50, -1
-; GFX9-DPP-NEXT:    s_mov_b32 s51, 0xe00000
+; GFX9-DPP-NEXT:    s_mov_b32 s64, SCRATCH_RSRC_DWORD0
+; GFX9-DPP-NEXT:    s_mov_b32 s65, SCRATCH_RSRC_DWORD1
+; GFX9-DPP-NEXT:    s_mov_b32 s66, -1
+; GFX9-DPP-NEXT:    s_mov_b32 s67, 0xe00000
 ; GFX9-DPP-NEXT:    v_mbcnt_lo_u32_b32 v3, exec_lo, 0
-; GFX9-DPP-NEXT:    s_add_u32 s48, s48, s11
+; GFX9-DPP-NEXT:    s_add_u32 s64, s64, s11
 ; GFX9-DPP-NEXT:    v_mbcnt_hi_u32_b32 v3, exec_hi, v3
-; GFX9-DPP-NEXT:    s_addc_u32 s49, s49, 0
-; GFX9-DPP-NEXT:    s_mov_b64 s[40:41], s[0:1]
+; GFX9-DPP-NEXT:    s_addc_u32 s65, s65, 0
+; GFX9-DPP-NEXT:    s_mov_b64 s[48:49], s[0:1]
 ; GFX9-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
 ; GFX9-DPP-NEXT:    s_movk_i32 s32, 0x800
 ; GFX9-DPP-NEXT:    s_and_saveexec_b64 s[0:1], vcc
 ; GFX9-DPP-NEXT:    s_cbranch_execz .LBB6_3
 ; GFX9-DPP-NEXT:  ; %bb.1:
-; GFX9-DPP-NEXT:    s_load_dwordx2 s[44:45], s[4:5], 0x24
+; GFX9-DPP-NEXT:    s_load_dwordx2 s[52:53], s[4:5], 0x24
 ; GFX9-DPP-NEXT:    v_lshlrev_b32_e32 v3, 20, v2
 ; GFX9-DPP-NEXT:    v_lshlrev_b32_e32 v4, 10, v1
 ; GFX9-DPP-NEXT:    s_mov_b32 s33, s10
-; GFX9-DPP-NEXT:    s_mov_b32 s42, s9
+; GFX9-DPP-NEXT:    s_mov_b32 s50, s9
 ; GFX9-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-DPP-NEXT:    s_load_dwordx2 s[0:1], s[44:45], 0x0
-; GFX9-DPP-NEXT:    s_mov_b32 s43, s8
+; GFX9-DPP-NEXT:    s_load_dwordx2 s[0:1], s[52:53], 0x0
+; GFX9-DPP-NEXT:    s_mov_b32 s51, s8
 ; GFX9-DPP-NEXT:    s_mov_b64 s[34:35], s[6:7]
 ; GFX9-DPP-NEXT:    s_mov_b64 s[36:37], s[4:5]
 ; GFX9-DPP-NEXT:    s_mov_b64 s[38:39], s[2:3]
 ; GFX9-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-DPP-NEXT:    v_mov_b32_e32 v2, s1
-; GFX9-DPP-NEXT:    s_mov_b64 s[46:47], 0
+; GFX9-DPP-NEXT:    s_mov_b64 s[54:55], 0
 ; GFX9-DPP-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX9-DPP-NEXT:    v_or3_b32 v40, v0, v4, v3
 ; GFX9-DPP-NEXT:  .LBB6_2: ; %atomicrmw.start
@@ -3693,36 +3693,36 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_agent
 ; GFX9-DPP-NEXT:    s_add_u32 s0, s0, __atomic_compare_exchange at gotpcrel32@lo+4
 ; GFX9-DPP-NEXT:    s_addc_u32 s1, s1, __atomic_compare_exchange at gotpcrel32@hi+12
 ; GFX9-DPP-NEXT:    s_load_dwordx2 s[16:17], s[0:1], 0x0
-; GFX9-DPP-NEXT:    s_mov_b64 s[0:1], s[48:49]
-; GFX9-DPP-NEXT:    buffer_store_dword v2, off, s[48:51], 0 offset:4
-; GFX9-DPP-NEXT:    buffer_store_dword v1, off, s[48:51], 0
-; GFX9-DPP-NEXT:    s_mov_b64 s[4:5], s[40:41]
+; GFX9-DPP-NEXT:    s_mov_b64 s[0:1], s[64:65]
+; GFX9-DPP-NEXT:    buffer_store_dword v2, off, s[64:67], 0 offset:4
+; GFX9-DPP-NEXT:    buffer_store_dword v1, off, s[64:67], 0
+; GFX9-DPP-NEXT:    s_mov_b64 s[4:5], s[48:49]
 ; GFX9-DPP-NEXT:    v_max_f64 v[3:4], v[3:4], 4.0
 ; GFX9-DPP-NEXT:    s_mov_b64 s[6:7], s[38:39]
 ; GFX9-DPP-NEXT:    s_mov_b64 s[10:11], s[34:35]
-; GFX9-DPP-NEXT:    s_mov_b32 s12, s43
-; GFX9-DPP-NEXT:    s_mov_b32 s13, s42
+; GFX9-DPP-NEXT:    s_mov_b32 s12, s51
+; GFX9-DPP-NEXT:    s_mov_b32 s13, s50
 ; GFX9-DPP-NEXT:    s_mov_b32 s14, s33
 ; GFX9-DPP-NEXT:    v_mov_b32_e32 v31, v40
-; GFX9-DPP-NEXT:    s_mov_b64 s[2:3], s[50:51]
-; GFX9-DPP-NEXT:    buffer_store_dword v4, off, s[48:51], 0 offset:12
-; GFX9-DPP-NEXT:    buffer_store_dword v3, off, s[48:51], 0 offset:8
+; GFX9-DPP-NEXT:    s_mov_b64 s[2:3], s[66:67]
+; GFX9-DPP-NEXT:    buffer_store_dword v4, off, s[64:67], 0 offset:12
+; GFX9-DPP-NEXT:    buffer_store_dword v3, off, s[64:67], 0 offset:8
 ; GFX9-DPP-NEXT:    v_mov_b32_e32 v0, 8
 ; GFX9-DPP-NEXT:    v_mov_b32_e32 v1, 0
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v2, s44
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v3, s45
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v2, s52
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v3, s53
 ; GFX9-DPP-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX9-DPP-NEXT:    v_mov_b32_e32 v5, 8
 ; GFX9-DPP-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX9-DPP-NEXT:    v_mov_b32_e32 v7, 0
 ; GFX9-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
-; GFX9-DPP-NEXT:    buffer_load_dword v1, off, s[48:51], 0
-; GFX9-DPP-NEXT:    buffer_load_dword v2, off, s[48:51], 0 offset:4
+; GFX9-DPP-NEXT:    buffer_load_dword v1, off, s[64:67], 0
+; GFX9-DPP-NEXT:    buffer_load_dword v2, off, s[64:67], 0 offset:4
 ; GFX9-DPP-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX9-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX9-DPP-NEXT:    s_or_b64 s[46:47], vcc, s[46:47]
-; GFX9-DPP-NEXT:    s_andn2_b64 exec, exec, s[46:47]
+; GFX9-DPP-NEXT:    s_or_b64 s[54:55], vcc, s[54:55]
+; GFX9-DPP-NEXT:    s_andn2_b64 exec, exec, s[54:55]
 ; GFX9-DPP-NEXT:    s_cbranch_execnz .LBB6_2
 ; GFX9-DPP-NEXT:  .LBB6_3:
 ; GFX9-DPP-NEXT:    s_endpgm
@@ -3730,32 +3730,32 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_agent
 ; GFX1064-DPP-LABEL: global_atomic_fmax_double_uni_address_uni_value_agent_scope_unsafe:
 ; GFX1064-DPP:       ; %bb.0:
 ; GFX1064-DPP-NEXT:    v_mbcnt_lo_u32_b32 v3, exec_lo, 0
-; GFX1064-DPP-NEXT:    s_mov_b32 s48, SCRATCH_RSRC_DWORD0
-; GFX1064-DPP-NEXT:    s_mov_b32 s49, SCRATCH_RSRC_DWORD1
-; GFX1064-DPP-NEXT:    s_mov_b32 s50, -1
-; GFX1064-DPP-NEXT:    s_mov_b32 s51, 0x31e16000
+; GFX1064-DPP-NEXT:    s_mov_b32 s64, SCRATCH_RSRC_DWORD0
+; GFX1064-DPP-NEXT:    s_mov_b32 s65, SCRATCH_RSRC_DWORD1
+; GFX1064-DPP-NEXT:    s_mov_b32 s66, -1
+; GFX1064-DPP-NEXT:    s_mov_b32 s67, 0x31e16000
 ; GFX1064-DPP-NEXT:    v_mbcnt_hi_u32_b32 v3, exec_hi, v3
-; GFX1064-DPP-NEXT:    s_add_u32 s48, s48, s11
-; GFX1064-DPP-NEXT:    s_addc_u32 s49, s49, 0
-; GFX1064-DPP-NEXT:    s_mov_b64 s[40:41], s[0:1]
+; GFX1064-DPP-NEXT:    s_add_u32 s64, s64, s11
+; GFX1064-DPP-NEXT:    s_addc_u32 s65, s65, 0
+; GFX1064-DPP-NEXT:    s_mov_b64 s[48:49], s[0:1]
 ; GFX1064-DPP-NEXT:    s_movk_i32 s32, 0x800
 ; GFX1064-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
 ; GFX1064-DPP-NEXT:    s_and_saveexec_b64 s[0:1], vcc
 ; GFX1064-DPP-NEXT:    s_cbranch_execz .LBB6_3
 ; GFX1064-DPP-NEXT:  ; %bb.1:
-; GFX1064-DPP-NEXT:    s_load_dwordx2 s[44:45], s[4:5], 0x24
+; GFX1064-DPP-NEXT:    s_load_dwordx2 s[52:53], s[4:5], 0x24
 ; GFX1064-DPP-NEXT:    v_lshlrev_b32_e32 v3, 20, v2
 ; GFX1064-DPP-NEXT:    v_lshlrev_b32_e32 v4, 10, v1
 ; GFX1064-DPP-NEXT:    s_mov_b32 s33, s10
-; GFX1064-DPP-NEXT:    s_mov_b32 s42, s9
-; GFX1064-DPP-NEXT:    s_mov_b32 s43, s8
+; GFX1064-DPP-NEXT:    s_mov_b32 s50, s9
+; GFX1064-DPP-NEXT:    s_mov_b32 s51, s8
 ; GFX1064-DPP-NEXT:    s_mov_b64 s[34:35], s[6:7]
 ; GFX1064-DPP-NEXT:    v_or3_b32 v40, v0, v4, v3
 ; GFX1064-DPP-NEXT:    s_mov_b64 s[36:37], s[4:5]
 ; GFX1064-DPP-NEXT:    s_mov_b64 s[38:39], s[2:3]
-; GFX1064-DPP-NEXT:    s_mov_b64 s[46:47], 0
+; GFX1064-DPP-NEXT:    s_mov_b64 s[54:55], 0
 ; GFX1064-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1064-DPP-NEXT:    s_load_dwordx2 s[0:1], s[44:45], 0x0
+; GFX1064-DPP-NEXT:    s_load_dwordx2 s[0:1], s[52:53], 0x0
 ; GFX1064-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1064-DPP-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX1064-DPP-NEXT:    v_mov_b32_e32 v1, s0
@@ -3768,38 +3768,38 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_agent
 ; GFX1064-DPP-NEXT:    s_getpc_b64 s[0:1]
 ; GFX1064-DPP-NEXT:    s_add_u32 s0, s0, __atomic_compare_exchange at gotpcrel32@lo+4
 ; GFX1064-DPP-NEXT:    s_addc_u32 s1, s1, __atomic_compare_exchange at gotpcrel32@hi+12
-; GFX1064-DPP-NEXT:    buffer_store_dword v2, off, s[48:51], 0 offset:4
-; GFX1064-DPP-NEXT:    buffer_store_dword v1, off, s[48:51], 0
+; GFX1064-DPP-NEXT:    buffer_store_dword v2, off, s[64:67], 0 offset:4
+; GFX1064-DPP-NEXT:    buffer_store_dword v1, off, s[64:67], 0
 ; GFX1064-DPP-NEXT:    s_load_dwordx2 s[16:17], s[0:1], 0x0
 ; GFX1064-DPP-NEXT:    v_mov_b32_e32 v31, v40
 ; GFX1064-DPP-NEXT:    v_mov_b32_e32 v0, 8
 ; GFX1064-DPP-NEXT:    v_mov_b32_e32 v1, 0
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v2, s44
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v2, s52
 ; GFX1064-DPP-NEXT:    v_mov_b32_e32 v5, 8
 ; GFX1064-DPP-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX1064-DPP-NEXT:    v_mov_b32_e32 v7, 0
-; GFX1064-DPP-NEXT:    s_mov_b64 s[0:1], s[48:49]
-; GFX1064-DPP-NEXT:    s_mov_b64 s[4:5], s[40:41]
+; GFX1064-DPP-NEXT:    s_mov_b64 s[0:1], s[64:65]
+; GFX1064-DPP-NEXT:    s_mov_b64 s[4:5], s[48:49]
 ; GFX1064-DPP-NEXT:    s_mov_b64 s[6:7], s[38:39]
 ; GFX1064-DPP-NEXT:    s_mov_b64 s[10:11], s[34:35]
-; GFX1064-DPP-NEXT:    s_mov_b32 s12, s43
-; GFX1064-DPP-NEXT:    s_mov_b32 s13, s42
+; GFX1064-DPP-NEXT:    s_mov_b32 s12, s51
+; GFX1064-DPP-NEXT:    s_mov_b32 s13, s50
 ; GFX1064-DPP-NEXT:    s_mov_b32 s14, s33
-; GFX1064-DPP-NEXT:    s_mov_b64 s[2:3], s[50:51]
+; GFX1064-DPP-NEXT:    s_mov_b64 s[2:3], s[66:67]
 ; GFX1064-DPP-NEXT:    v_max_f64 v[3:4], v[3:4], 4.0
-; GFX1064-DPP-NEXT:    buffer_store_dword v4, off, s[48:51], 0 offset:12
-; GFX1064-DPP-NEXT:    buffer_store_dword v3, off, s[48:51], 0 offset:8
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v3, s45
+; GFX1064-DPP-NEXT:    buffer_store_dword v4, off, s[64:67], 0 offset:12
+; GFX1064-DPP-NEXT:    buffer_store_dword v3, off, s[64:67], 0 offset:8
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v3, s53
 ; GFX1064-DPP-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX1064-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1064-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GFX1064-DPP-NEXT:    s_clause 0x1
-; GFX1064-DPP-NEXT:    buffer_load_dword v1, off, s[48:51], 0
-; GFX1064-DPP-NEXT:    buffer_load_dword v2, off, s[48:51], 0 offset:4
+; GFX1064-DPP-NEXT:    buffer_load_dword v1, off, s[64:67], 0
+; GFX1064-DPP-NEXT:    buffer_load_dword v2, off, s[64:67], 0 offset:4
 ; GFX1064-DPP-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX1064-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX1064-DPP-NEXT:    s_or_b64 s[46:47], vcc, s[46:47]
-; GFX1064-DPP-NEXT:    s_andn2_b64 exec, exec, s[46:47]
+; GFX1064-DPP-NEXT:    s_or_b64 s[54:55], vcc, s[54:55]
+; GFX1064-DPP-NEXT:    s_andn2_b64 exec, exec, s[54:55]
 ; GFX1064-DPP-NEXT:    s_cbranch_execnz .LBB6_2
 ; GFX1064-DPP-NEXT:  .LBB6_3:
 ; GFX1064-DPP-NEXT:    s_endpgm
@@ -3807,31 +3807,31 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_agent
 ; GFX1032-DPP-LABEL: global_atomic_fmax_double_uni_address_uni_value_agent_scope_unsafe:
 ; GFX1032-DPP:       ; %bb.0:
 ; GFX1032-DPP-NEXT:    v_mbcnt_lo_u32_b32 v3, exec_lo, 0
-; GFX1032-DPP-NEXT:    s_mov_b32 s48, SCRATCH_RSRC_DWORD0
-; GFX1032-DPP-NEXT:    s_mov_b32 s49, SCRATCH_RSRC_DWORD1
-; GFX1032-DPP-NEXT:    s_mov_b32 s50, -1
-; GFX1032-DPP-NEXT:    s_mov_b32 s51, 0x31c16000
+; GFX1032-DPP-NEXT:    s_mov_b32 s64, SCRATCH_RSRC_DWORD0
+; GFX1032-DPP-NEXT:    s_mov_b32 s65, SCRATCH_RSRC_DWORD1
+; GFX1032-DPP-NEXT:    s_mov_b32 s66, -1
+; GFX1032-DPP-NEXT:    s_mov_b32 s67, 0x31c16000
 ; GFX1032-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v3
-; GFX1032-DPP-NEXT:    s_add_u32 s48, s48, s11
-; GFX1032-DPP-NEXT:    s_addc_u32 s49, s49, 0
-; GFX1032-DPP-NEXT:    s_mov_b64 s[40:41], s[0:1]
-; GFX1032-DPP-NEXT:    s_mov_b32 s46, 0
+; GFX1032-DPP-NEXT:    s_add_u32 s64, s64, s11
+; GFX1032-DPP-NEXT:    s_addc_u32 s65, s65, 0
+; GFX1032-DPP-NEXT:    s_mov_b64 s[48:49], s[0:1]
+; GFX1032-DPP-NEXT:    s_mov_b32 s54, 0
 ; GFX1032-DPP-NEXT:    s_movk_i32 s32, 0x400
 ; GFX1032-DPP-NEXT:    s_and_saveexec_b32 s0, vcc_lo
 ; GFX1032-DPP-NEXT:    s_cbranch_execz .LBB6_3
 ; GFX1032-DPP-NEXT:  ; %bb.1:
-; GFX1032-DPP-NEXT:    s_load_dwordx2 s[44:45], s[4:5], 0x24
+; GFX1032-DPP-NEXT:    s_load_dwordx2 s[52:53], s[4:5], 0x24
 ; GFX1032-DPP-NEXT:    v_lshlrev_b32_e32 v3, 20, v2
 ; GFX1032-DPP-NEXT:    v_lshlrev_b32_e32 v4, 10, v1
 ; GFX1032-DPP-NEXT:    s_mov_b32 s33, s10
-; GFX1032-DPP-NEXT:    s_mov_b32 s42, s9
-; GFX1032-DPP-NEXT:    s_mov_b32 s43, s8
+; GFX1032-DPP-NEXT:    s_mov_b32 s50, s9
+; GFX1032-DPP-NEXT:    s_mov_b32 s51, s8
 ; GFX1032-DPP-NEXT:    s_mov_b64 s[34:35], s[6:7]
 ; GFX1032-DPP-NEXT:    v_or3_b32 v40, v0, v4, v3
 ; GFX1032-DPP-NEXT:    s_mov_b64 s[36:37], s[4:5]
 ; GFX1032-DPP-NEXT:    s_mov_b64 s[38:39], s[2:3]
 ; GFX1032-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT:    s_load_dwordx2 s[0:1], s[44:45], 0x0
+; GFX1032-DPP-NEXT:    s_load_dwordx2 s[0:1], s[52:53], 0x0
 ; GFX1032-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v1, s0
@@ -3844,38 +3844,38 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_agent
 ; GFX1032-DPP-NEXT:    s_getpc_b64 s[0:1]
 ; GFX1032-DPP-NEXT:    s_add_u32 s0, s0, __atomic_compare_exchange at gotpcrel32@lo+4
 ; GFX1032-DPP-NEXT:    s_addc_u32 s1, s1, __atomic_compare_exchange at gotpcrel32@hi+12
-; GFX1032-DPP-NEXT:    buffer_store_dword v2, off, s[48:51], 0 offset:4
-; GFX1032-DPP-NEXT:    buffer_store_dword v1, off, s[48:51], 0
+; GFX1032-DPP-NEXT:    buffer_store_dword v2, off, s[64:67], 0 offset:4
+; GFX1032-DPP-NEXT:    buffer_store_dword v1, off, s[64:67], 0
 ; GFX1032-DPP-NEXT:    s_load_dwordx2 s[16:17], s[0:1], 0x0
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v31, v40
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v0, 8
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v1, 0
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v2, s44
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v2, s52
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v5, 8
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v7, 0
-; GFX1032-DPP-NEXT:    s_mov_b64 s[0:1], s[48:49]
-; GFX1032-DPP-NEXT:    s_mov_b64 s[4:5], s[40:41]
+; GFX1032-DPP-NEXT:    s_mov_b64 s[0:1], s[64:65]
+; GFX1032-DPP-NEXT:    s_mov_b64 s[4:5], s[48:49]
 ; GFX1032-DPP-NEXT:    s_mov_b64 s[6:7], s[38:39]
 ; GFX1032-DPP-NEXT:    s_mov_b64 s[10:11], s[34:35]
-; GFX1032-DPP-NEXT:    s_mov_b32 s12, s43
-; GFX1032-DPP-NEXT:    s_mov_b32 s13, s42
+; GFX1032-DPP-NEXT:    s_mov_b32 s12, s51
+; GFX1032-DPP-NEXT:    s_mov_b32 s13, s50
 ; GFX1032-DPP-NEXT:    s_mov_b32 s14, s33
-; GFX1032-DPP-NEXT:    s_mov_b64 s[2:3], s[50:51]
+; GFX1032-DPP-NEXT:    s_mov_b64 s[2:3], s[66:67]
 ; GFX1032-DPP-NEXT:    v_max_f64 v[3:4], v[3:4], 4.0
-; GFX1032-DPP-NEXT:    buffer_store_dword v4, off, s[48:51], 0 offset:12
-; GFX1032-DPP-NEXT:    buffer_store_dword v3, off, s[48:51], 0 offset:8
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v3, s45
+; GFX1032-DPP-NEXT:    buffer_store_dword v4, off, s[64:67], 0 offset:12
+; GFX1032-DPP-NEXT:    buffer_store_dword v3, off, s[64:67], 0 offset:8
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v3, s53
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX1032-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1032-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GFX1032-DPP-NEXT:    s_clause 0x1
-; GFX1032-DPP-NEXT:    buffer_load_dword v1, off, s[48:51], 0
-; GFX1032-DPP-NEXT:    buffer_load_dword v2, off, s[48:51], 0 offset:4
+; GFX1032-DPP-NEXT:    buffer_load_dword v1, off, s[64:67], 0
+; GFX1032-DPP-NEXT:    buffer_load_dword v2, off, s[64:67], 0 offset:4
 ; GFX1032-DPP-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX1032-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX1032-DPP-NEXT:    s_or_b32 s46, vcc_lo, s46
-; GFX1032-DPP-NEXT:    s_andn2_b32 exec_lo, exec_lo, s46
+; GFX1032-DPP-NEXT:    s_or_b32 s54, vcc_lo, s54
+; GFX1032-DPP-NEXT:    s_andn2_b32 exec_lo, exec_lo, s54
 ; GFX1032-DPP-NEXT:    s_cbranch_execnz .LBB6_2
 ; GFX1032-DPP-NEXT:  .LBB6_3:
 ; GFX1032-DPP-NEXT:    s_endpgm
@@ -3884,7 +3884,7 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_agent
 ; GFX1164-DPP:       ; %bb.0:
 ; GFX1164-DPP-NEXT:    v_mov_b32_e32 v40, v0
 ; GFX1164-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1164-DPP-NEXT:    s_mov_b64 s[40:41], s[0:1]
+; GFX1164-DPP-NEXT:    s_mov_b64 s[48:49], s[0:1]
 ; GFX1164-DPP-NEXT:    s_mov_b32 s32, 32
 ; GFX1164-DPP-NEXT:    s_mov_b64 s[0:1], exec
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
@@ -3892,16 +3892,16 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_agent
 ; GFX1164-DPP-NEXT:    v_cmpx_eq_u32_e32 0, v0
 ; GFX1164-DPP-NEXT:    s_cbranch_execz .LBB6_3
 ; GFX1164-DPP-NEXT:  ; %bb.1:
-; GFX1164-DPP-NEXT:    s_load_b64 s[44:45], s[4:5], 0x24
+; GFX1164-DPP-NEXT:    s_load_b64 s[52:53], s[4:5], 0x24
 ; GFX1164-DPP-NEXT:    s_mov_b32 s33, s10
-; GFX1164-DPP-NEXT:    s_mov_b32 s42, s9
-; GFX1164-DPP-NEXT:    s_mov_b32 s43, s8
+; GFX1164-DPP-NEXT:    s_mov_b32 s50, s9
+; GFX1164-DPP-NEXT:    s_mov_b32 s51, s8
 ; GFX1164-DPP-NEXT:    s_mov_b64 s[34:35], s[6:7]
 ; GFX1164-DPP-NEXT:    s_mov_b64 s[36:37], s[4:5]
 ; GFX1164-DPP-NEXT:    s_mov_b64 s[38:39], s[2:3]
-; GFX1164-DPP-NEXT:    s_mov_b64 s[46:47], 0
+; GFX1164-DPP-NEXT:    s_mov_b64 s[54:55], 0
 ; GFX1164-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT:    s_load_b64 s[0:1], s[44:45], 0x0
+; GFX1164-DPP-NEXT:    s_load_b64 s[0:1], s[52:53], 0x0
 ; GFX1164-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1164-DPP-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX1164-DPP-NEXT:    v_mov_b32_e32 v1, s0
@@ -3923,18 +3923,18 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_agent
 ; GFX1164-DPP-NEXT:    v_mov_b32_e32 v5, 8
 ; GFX1164-DPP-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX1164-DPP-NEXT:    v_mov_b32_e32 v7, 0
-; GFX1164-DPP-NEXT:    s_mov_b64 s[4:5], s[40:41]
+; GFX1164-DPP-NEXT:    s_mov_b64 s[4:5], s[48:49]
 ; GFX1164-DPP-NEXT:    s_mov_b64 s[6:7], s[38:39]
 ; GFX1164-DPP-NEXT:    s_mov_b64 s[10:11], s[34:35]
-; GFX1164-DPP-NEXT:    s_mov_b32 s12, s43
-; GFX1164-DPP-NEXT:    s_mov_b32 s13, s42
+; GFX1164-DPP-NEXT:    s_mov_b32 s12, s51
+; GFX1164-DPP-NEXT:    s_mov_b32 s13, s50
 ; GFX1164-DPP-NEXT:    s_mov_b32 s14, s33
 ; GFX1164-DPP-NEXT:    v_max_f64 v[3:4], v[3:4], 4.0
 ; GFX1164-DPP-NEXT:    scratch_store_b64 off, v[1:2], off
 ; GFX1164-DPP-NEXT:    v_mov_b32_e32 v1, 0
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v2, s44
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v2, s52
 ; GFX1164-DPP-NEXT:    scratch_store_b64 off, v[3:4], off offset:8
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v3, s45
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v3, s53
 ; GFX1164-DPP-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX1164-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1164-DPP-NEXT:    s_swappc_b64 s[30:31], s[0:1]
@@ -3942,8 +3942,8 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_agent
 ; GFX1164-DPP-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
 ; GFX1164-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX1164-DPP-NEXT:    s_or_b64 s[46:47], vcc, s[46:47]
-; GFX1164-DPP-NEXT:    s_and_not1_b64 exec, exec, s[46:47]
+; GFX1164-DPP-NEXT:    s_or_b64 s[54:55], vcc, s[54:55]
+; GFX1164-DPP-NEXT:    s_and_not1_b64 exec, exec, s[54:55]
 ; GFX1164-DPP-NEXT:    s_cbranch_execnz .LBB6_2
 ; GFX1164-DPP-NEXT:  .LBB6_3:
 ; GFX1164-DPP-NEXT:    s_set_inst_prefetch_distance 0x2
@@ -3953,22 +3953,22 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_agent
 ; GFX1132-DPP:       ; %bb.0:
 ; GFX1132-DPP-NEXT:    v_mov_b32_e32 v40, v0
 ; GFX1132-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1132-DPP-NEXT:    s_mov_b64 s[40:41], s[0:1]
-; GFX1132-DPP-NEXT:    s_mov_b32 s46, 0
+; GFX1132-DPP-NEXT:    s_mov_b64 s[48:49], s[0:1]
+; GFX1132-DPP-NEXT:    s_mov_b32 s54, 0
 ; GFX1132-DPP-NEXT:    s_mov_b32 s32, 32
 ; GFX1132-DPP-NEXT:    s_mov_b32 s0, exec_lo
 ; GFX1132-DPP-NEXT:    v_cmpx_eq_u32_e32 0, v0
 ; GFX1132-DPP-NEXT:    s_cbranch_execz .LBB6_3
 ; GFX1132-DPP-NEXT:  ; %bb.1:
-; GFX1132-DPP-NEXT:    s_load_b64 s[44:45], s[4:5], 0x24
+; GFX1132-DPP-NEXT:    s_load_b64 s[52:53], s[4:5], 0x24
 ; GFX1132-DPP-NEXT:    s_mov_b32 s33, s15
-; GFX1132-DPP-NEXT:    s_mov_b32 s42, s14
-; GFX1132-DPP-NEXT:    s_mov_b32 s43, s13
+; GFX1132-DPP-NEXT:    s_mov_b32 s50, s14
+; GFX1132-DPP-NEXT:    s_mov_b32 s51, s13
 ; GFX1132-DPP-NEXT:    s_mov_b64 s[34:35], s[6:7]
 ; GFX1132-DPP-NEXT:    s_mov_b64 s[36:37], s[4:5]
 ; GFX1132-DPP-NEXT:    s_mov_b64 s[38:39], s[2:3]
 ; GFX1132-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT:    s_load_b64 s[0:1], s[44:45], 0x0
+; GFX1132-DPP-NEXT:    s_load_b64 s[0:1], s[52:53], 0x0
 ; GFX1132-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132-DPP-NEXT:    v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v1, s0
 ; GFX1132-DPP-NEXT:    s_set_inst_prefetch_distance 0x1
@@ -3987,26 +3987,26 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_agent
 ; GFX1132-DPP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
 ; GFX1132-DPP-NEXT:    v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0
 ; GFX1132-DPP-NEXT:    v_mov_b32_e32 v7, 0
-; GFX1132-DPP-NEXT:    s_mov_b64 s[4:5], s[40:41]
+; GFX1132-DPP-NEXT:    s_mov_b64 s[4:5], s[48:49]
 ; GFX1132-DPP-NEXT:    s_mov_b64 s[6:7], s[38:39]
 ; GFX1132-DPP-NEXT:    s_mov_b64 s[10:11], s[34:35]
-; GFX1132-DPP-NEXT:    s_mov_b32 s12, s43
-; GFX1132-DPP-NEXT:    s_mov_b32 s13, s42
+; GFX1132-DPP-NEXT:    s_mov_b32 s12, s51
+; GFX1132-DPP-NEXT:    s_mov_b32 s13, s50
 ; GFX1132-DPP-NEXT:    s_mov_b32 s14, s33
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_4)
 ; GFX1132-DPP-NEXT:    v_max_f64 v[3:4], v[3:4], 4.0
 ; GFX1132-DPP-NEXT:    scratch_store_b64 off, v[1:2], off
-; GFX1132-DPP-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s44
+; GFX1132-DPP-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s52
 ; GFX1132-DPP-NEXT:    scratch_store_b64 off, v[3:4], off offset:8
-; GFX1132-DPP-NEXT:    v_dual_mov_b32 v3, s45 :: v_dual_mov_b32 v4, 0
+; GFX1132-DPP-NEXT:    v_dual_mov_b32 v3, s53 :: v_dual_mov_b32 v4, 0
 ; GFX1132-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132-DPP-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX1132-DPP-NEXT:    scratch_load_b64 v[1:2], off, off
 ; GFX1132-DPP-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
 ; GFX1132-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX1132-DPP-NEXT:    s_or_b32 s46, vcc_lo, s46
-; GFX1132-DPP-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s46
+; GFX1132-DPP-NEXT:    s_or_b32 s54, vcc_lo, s54
+; GFX1132-DPP-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s54
 ; GFX1132-DPP-NEXT:    s_cbranch_execnz .LBB6_2
 ; GFX1132-DPP-NEXT:  .LBB6_3:
 ; GFX1132-DPP-NEXT:    s_set_inst_prefetch_distance 0x2
@@ -4019,19 +4019,19 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent
 ; GFX7LESS-LABEL: global_atomic_fmax_double_uni_address_div_value_agent_scope_unsafe:
 ; GFX7LESS:       ; %bb.0:
 ; GFX7LESS-NEXT:    s_movk_i32 s32, 0x800
-; GFX7LESS-NEXT:    s_mov_b32 s48, SCRATCH_RSRC_DWORD0
-; GFX7LESS-NEXT:    s_mov_b32 s49, SCRATCH_RSRC_DWORD1
-; GFX7LESS-NEXT:    s_mov_b32 s50, -1
-; GFX7LESS-NEXT:    s_mov_b32 s51, 0xe8f000
-; GFX7LESS-NEXT:    s_add_u32 s48, s48, s11
-; GFX7LESS-NEXT:    s_addc_u32 s49, s49, 0
+; GFX7LESS-NEXT:    s_mov_b32 s64, SCRATCH_RSRC_DWORD0
+; GFX7LESS-NEXT:    s_mov_b32 s65, SCRATCH_RSRC_DWORD1
+; GFX7LESS-NEXT:    s_mov_b32 s66, -1
+; GFX7LESS-NEXT:    s_mov_b32 s67, 0xe8f000
+; GFX7LESS-NEXT:    s_add_u32 s64, s64, s11
+; GFX7LESS-NEXT:    s_addc_u32 s65, s65, 0
 ; GFX7LESS-NEXT:    s_mov_b32 s33, s10
-; GFX7LESS-NEXT:    s_mov_b32 s42, s9
-; GFX7LESS-NEXT:    s_mov_b32 s43, s8
+; GFX7LESS-NEXT:    s_mov_b32 s50, s9
+; GFX7LESS-NEXT:    s_mov_b32 s51, s8
 ; GFX7LESS-NEXT:    s_mov_b64 s[34:35], s[6:7]
 ; GFX7LESS-NEXT:    s_mov_b64 s[36:37], s[4:5]
 ; GFX7LESS-NEXT:    s_mov_b64 s[38:39], s[2:3]
-; GFX7LESS-NEXT:    s_mov_b64 s[40:41], s[0:1]
+; GFX7LESS-NEXT:    s_mov_b64 s[48:49], s[0:1]
 ; GFX7LESS-NEXT:    s_add_u32 s8, s36, 44
 ; GFX7LESS-NEXT:    s_addc_u32 s9, s37, 0
 ; GFX7LESS-NEXT:    s_getpc_b64 s[0:1]
@@ -4042,15 +4042,15 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent
 ; GFX7LESS-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
 ; GFX7LESS-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GFX7LESS-NEXT:    v_or_b32_e32 v40, v0, v2
-; GFX7LESS-NEXT:    s_mov_b64 s[4:5], s[40:41]
+; GFX7LESS-NEXT:    s_mov_b64 s[4:5], s[48:49]
 ; GFX7LESS-NEXT:    s_mov_b64 s[6:7], s[2:3]
 ; GFX7LESS-NEXT:    s_mov_b64 s[10:11], s[34:35]
-; GFX7LESS-NEXT:    s_mov_b32 s12, s43
-; GFX7LESS-NEXT:    s_mov_b32 s13, s42
+; GFX7LESS-NEXT:    s_mov_b32 s12, s51
+; GFX7LESS-NEXT:    s_mov_b32 s13, s50
 ; GFX7LESS-NEXT:    s_mov_b32 s14, s33
 ; GFX7LESS-NEXT:    v_mov_b32_e32 v31, v40
-; GFX7LESS-NEXT:    s_mov_b64 s[0:1], s[48:49]
-; GFX7LESS-NEXT:    s_mov_b64 s[2:3], s[50:51]
+; GFX7LESS-NEXT:    s_mov_b64 s[0:1], s[64:65]
+; GFX7LESS-NEXT:    s_mov_b64 s[2:3], s[66:67]
 ; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7LESS-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GFX7LESS-NEXT:    s_mov_b64 s[0:1], exec
@@ -4077,19 +4077,19 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent
 ; GFX7LESS-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
 ; GFX7LESS-NEXT:    s_cbranch_execz .LBB7_5
 ; GFX7LESS-NEXT:  ; %bb.3:
-; GFX7LESS-NEXT:    s_load_dwordx2 s[44:45], s[36:37], 0x9
-; GFX7LESS-NEXT:    s_mov_b32 s47, 0xf000
-; GFX7LESS-NEXT:    s_mov_b32 s46, -1
+; GFX7LESS-NEXT:    s_load_dwordx2 s[52:53], s[36:37], 0x9
+; GFX7LESS-NEXT:    s_mov_b32 s55, 0xf000
+; GFX7LESS-NEXT:    s_mov_b32 s54, -1
 ; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7LESS-NEXT:    buffer_load_dwordx2 v[0:1], off, s[44:47], 0
-; GFX7LESS-NEXT:    s_mov_b64 s[46:47], 0
+; GFX7LESS-NEXT:    buffer_load_dwordx2 v[0:1], off, s[52:55], 0
+; GFX7LESS-NEXT:    s_mov_b64 s[54:55], 0
 ; GFX7LESS-NEXT:    v_max_f64 v[41:42], v[2:3], v[2:3]
 ; GFX7LESS-NEXT:  .LBB7_4: ; %atomicrmw.start
 ; GFX7LESS-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX7LESS-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7LESS-NEXT:    v_max_f64 v[2:3], v[0:1], v[0:1]
-; GFX7LESS-NEXT:    buffer_store_dword v1, off, s[48:51], 0 offset:4
-; GFX7LESS-NEXT:    buffer_store_dword v0, off, s[48:51], 0
+; GFX7LESS-NEXT:    buffer_store_dword v1, off, s[64:67], 0 offset:4
+; GFX7LESS-NEXT:    buffer_store_dword v0, off, s[64:67], 0
 ; GFX7LESS-NEXT:    s_add_u32 s8, s36, 44
 ; GFX7LESS-NEXT:    s_waitcnt expcnt(0)
 ; GFX7LESS-NEXT:    v_max_f64 v[0:1], v[2:3], v[41:42]
@@ -4097,8 +4097,8 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent
 ; GFX7LESS-NEXT:    s_getpc_b64 s[0:1]
 ; GFX7LESS-NEXT:    s_add_u32 s0, s0, __atomic_compare_exchange at gotpcrel32@lo+4
 ; GFX7LESS-NEXT:    s_addc_u32 s1, s1, __atomic_compare_exchange at gotpcrel32@hi+12
-; GFX7LESS-NEXT:    buffer_store_dword v1, off, s[48:51], 0 offset:12
-; GFX7LESS-NEXT:    buffer_store_dword v0, off, s[48:51], 0 offset:8
+; GFX7LESS-NEXT:    buffer_store_dword v1, off, s[64:67], 0 offset:12
+; GFX7LESS-NEXT:    buffer_store_dword v0, off, s[64:67], 0 offset:8
 ; GFX7LESS-NEXT:    s_load_dwordx2 s[16:17], s[0:1], 0x0
 ; GFX7LESS-NEXT:    s_waitcnt expcnt(0)
 ; GFX7LESS-NEXT:    v_mov_b32_e32 v0, 8
@@ -4107,43 +4107,43 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent
 ; GFX7LESS-NEXT:    v_mov_b32_e32 v5, 8
 ; GFX7LESS-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX7LESS-NEXT:    v_mov_b32_e32 v7, 0
-; GFX7LESS-NEXT:    s_mov_b64 s[4:5], s[40:41]
+; GFX7LESS-NEXT:    s_mov_b64 s[4:5], s[48:49]
 ; GFX7LESS-NEXT:    s_mov_b64 s[6:7], s[38:39]
 ; GFX7LESS-NEXT:    s_mov_b64 s[10:11], s[34:35]
-; GFX7LESS-NEXT:    s_mov_b32 s12, s43
-; GFX7LESS-NEXT:    s_mov_b32 s13, s42
+; GFX7LESS-NEXT:    s_mov_b32 s12, s51
+; GFX7LESS-NEXT:    s_mov_b32 s13, s50
 ; GFX7LESS-NEXT:    s_mov_b32 s14, s33
 ; GFX7LESS-NEXT:    v_mov_b32_e32 v31, v40
-; GFX7LESS-NEXT:    s_mov_b64 s[0:1], s[48:49]
-; GFX7LESS-NEXT:    s_mov_b64 s[2:3], s[50:51]
-; GFX7LESS-NEXT:    v_mov_b32_e32 v2, s44
-; GFX7LESS-NEXT:    v_mov_b32_e32 v3, s45
+; GFX7LESS-NEXT:    s_mov_b64 s[0:1], s[64:65]
+; GFX7LESS-NEXT:    s_mov_b64 s[2:3], s[66:67]
+; GFX7LESS-NEXT:    v_mov_b32_e32 v2, s52
+; GFX7LESS-NEXT:    v_mov_b32_e32 v3, s53
 ; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7LESS-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GFX7LESS-NEXT:    v_and_b32_e32 v2, 1, v0
-; GFX7LESS-NEXT:    buffer_load_dword v0, off, s[48:51], 0
-; GFX7LESS-NEXT:    buffer_load_dword v1, off, s[48:51], 0 offset:4
+; GFX7LESS-NEXT:    buffer_load_dword v0, off, s[64:67], 0
+; GFX7LESS-NEXT:    buffer_load_dword v1, off, s[64:67], 0 offset:4
 ; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v2
-; GFX7LESS-NEXT:    s_or_b64 s[46:47], vcc, s[46:47]
-; GFX7LESS-NEXT:    s_andn2_b64 exec, exec, s[46:47]
+; GFX7LESS-NEXT:    s_or_b64 s[54:55], vcc, s[54:55]
+; GFX7LESS-NEXT:    s_andn2_b64 exec, exec, s[54:55]
 ; GFX7LESS-NEXT:    s_cbranch_execnz .LBB7_4
 ; GFX7LESS-NEXT:  .LBB7_5:
 ; GFX7LESS-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: global_atomic_fmax_double_uni_address_div_value_agent_scope_unsafe:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_mov_b32 s48, SCRATCH_RSRC_DWORD0
-; GFX9-NEXT:    s_mov_b32 s49, SCRATCH_RSRC_DWORD1
-; GFX9-NEXT:    s_mov_b32 s50, -1
-; GFX9-NEXT:    s_mov_b32 s51, 0xe00000
-; GFX9-NEXT:    s_add_u32 s48, s48, s11
-; GFX9-NEXT:    s_addc_u32 s49, s49, 0
+; GFX9-NEXT:    s_mov_b32 s64, SCRATCH_RSRC_DWORD0
+; GFX9-NEXT:    s_mov_b32 s65, SCRATCH_RSRC_DWORD1
+; GFX9-NEXT:    s_mov_b32 s66, -1
+; GFX9-NEXT:    s_mov_b32 s67, 0xe00000
+; GFX9-NEXT:    s_add_u32 s64, s64, s11
+; GFX9-NEXT:    s_addc_u32 s65, s65, 0
 ; GFX9-NEXT:    s_mov_b64 s[36:37], s[4:5]
-; GFX9-NEXT:    s_mov_b32 s43, s8
+; GFX9-NEXT:    s_mov_b32 s51, s8
 ; GFX9-NEXT:    s_add_u32 s8, s36, 44
-; GFX9-NEXT:    s_mov_b32 s42, s9
+; GFX9-NEXT:    s_mov_b32 s50, s9
 ; GFX9-NEXT:    s_addc_u32 s9, s37, 0
-; GFX9-NEXT:    s_mov_b64 s[40:41], s[0:1]
+; GFX9-NEXT:    s_mov_b64 s[48:49], s[0:1]
 ; GFX9-NEXT:    s_getpc_b64 s[0:1]
 ; GFX9-NEXT:    s_add_u32 s0, s0, div.double.value at gotpcrel32@lo+4
 ; GFX9-NEXT:    s_addc_u32 s1, s1, div.double.value at gotpcrel32@hi+12
@@ -4155,14 +4155,14 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent
 ; GFX9-NEXT:    s_mov_b64 s[38:39], s[2:3]
 ; GFX9-NEXT:    v_or3_b32 v40, v0, v1, v2
 ; GFX9-NEXT:    s_mov_b64 s[6:7], s[2:3]
-; GFX9-NEXT:    s_mov_b64 s[0:1], s[48:49]
-; GFX9-NEXT:    s_mov_b64 s[4:5], s[40:41]
+; GFX9-NEXT:    s_mov_b64 s[0:1], s[64:65]
+; GFX9-NEXT:    s_mov_b64 s[4:5], s[48:49]
 ; GFX9-NEXT:    s_mov_b64 s[10:11], s[34:35]
-; GFX9-NEXT:    s_mov_b32 s12, s43
-; GFX9-NEXT:    s_mov_b32 s13, s42
+; GFX9-NEXT:    s_mov_b32 s12, s51
+; GFX9-NEXT:    s_mov_b32 s13, s50
 ; GFX9-NEXT:    s_mov_b32 s14, s33
 ; GFX9-NEXT:    v_mov_b32_e32 v31, v40
-; GFX9-NEXT:    s_mov_b64 s[2:3], s[50:51]
+; GFX9-NEXT:    s_mov_b64 s[2:3], s[66:67]
 ; GFX9-NEXT:    s_movk_i32 s32, 0x800
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[16:17]
@@ -4189,12 +4189,12 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent
 ; GFX9-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
 ; GFX9-NEXT:    s_cbranch_execz .LBB7_5
 ; GFX9-NEXT:  ; %bb.3:
-; GFX9-NEXT:    s_load_dwordx2 s[44:45], s[36:37], 0x24
+; GFX9-NEXT:    s_load_dwordx2 s[52:53], s[36:37], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    v_max_f64 v[41:42], v[2:3], v[2:3]
-; GFX9-NEXT:    s_mov_b64 s[46:47], 0
+; GFX9-NEXT:    s_mov_b64 s[54:55], 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dwordx2 v[4:5], v0, s[44:45]
+; GFX9-NEXT:    global_load_dwordx2 v[4:5], v0, s[52:53]
 ; GFX9-NEXT:  .LBB7_4: ; %atomicrmw.start
 ; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
@@ -4205,54 +4205,54 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent
 ; GFX9-NEXT:    s_add_u32 s0, s0, __atomic_compare_exchange at gotpcrel32@lo+4
 ; GFX9-NEXT:    s_addc_u32 s1, s1, __atomic_compare_exchange at gotpcrel32@hi+12
 ; GFX9-NEXT:    s_load_dwordx2 s[16:17], s[0:1], 0x0
-; GFX9-NEXT:    s_mov_b64 s[0:1], s[48:49]
-; GFX9-NEXT:    buffer_store_dword v5, off, s[48:51], 0 offset:4
-; GFX9-NEXT:    buffer_store_dword v4, off, s[48:51], 0
-; GFX9-NEXT:    s_mov_b64 s[4:5], s[40:41]
+; GFX9-NEXT:    s_mov_b64 s[0:1], s[64:65]
+; GFX9-NEXT:    buffer_store_dword v5, off, s[64:67], 0 offset:4
+; GFX9-NEXT:    buffer_store_dword v4, off, s[64:67], 0
+; GFX9-NEXT:    s_mov_b64 s[4:5], s[48:49]
 ; GFX9-NEXT:    v_max_f64 v[0:1], v[0:1], v[41:42]
 ; GFX9-NEXT:    s_mov_b64 s[6:7], s[38:39]
 ; GFX9-NEXT:    s_mov_b64 s[10:11], s[34:35]
-; GFX9-NEXT:    s_mov_b32 s12, s43
-; GFX9-NEXT:    s_mov_b32 s13, s42
+; GFX9-NEXT:    s_mov_b32 s12, s51
+; GFX9-NEXT:    s_mov_b32 s13, s50
 ; GFX9-NEXT:    s_mov_b32 s14, s33
 ; GFX9-NEXT:    v_mov_b32_e32 v31, v40
-; GFX9-NEXT:    s_mov_b64 s[2:3], s[50:51]
-; GFX9-NEXT:    buffer_store_dword v1, off, s[48:51], 0 offset:12
-; GFX9-NEXT:    buffer_store_dword v0, off, s[48:51], 0 offset:8
+; GFX9-NEXT:    s_mov_b64 s[2:3], s[66:67]
+; GFX9-NEXT:    buffer_store_dword v1, off, s[64:67], 0 offset:12
+; GFX9-NEXT:    buffer_store_dword v0, off, s[64:67], 0 offset:8
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 8
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0
-; GFX9-NEXT:    v_mov_b32_e32 v2, s44
-; GFX9-NEXT:    v_mov_b32_e32 v3, s45
+; GFX9-NEXT:    v_mov_b32_e32 v2, s52
+; GFX9-NEXT:    v_mov_b32_e32 v3, s53
 ; GFX9-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v5, 8
 ; GFX9-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v7, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[16:17]
-; GFX9-NEXT:    buffer_load_dword v4, off, s[48:51], 0
-; GFX9-NEXT:    buffer_load_dword v5, off, s[48:51], 0 offset:4
+; GFX9-NEXT:    buffer_load_dword v4, off, s[64:67], 0
+; GFX9-NEXT:    buffer_load_dword v5, off, s[64:67], 0 offset:4
 ; GFX9-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX9-NEXT:    s_or_b64 s[46:47], vcc, s[46:47]
-; GFX9-NEXT:    s_andn2_b64 exec, exec, s[46:47]
+; GFX9-NEXT:    s_or_b64 s[54:55], vcc, s[54:55]
+; GFX9-NEXT:    s_andn2_b64 exec, exec, s[54:55]
 ; GFX9-NEXT:    s_cbranch_execnz .LBB7_4
 ; GFX9-NEXT:  .LBB7_5:
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX1064-LABEL: global_atomic_fmax_double_uni_address_div_value_agent_scope_unsafe:
 ; GFX1064:       ; %bb.0:
-; GFX1064-NEXT:    s_mov_b32 s48, SCRATCH_RSRC_DWORD0
-; GFX1064-NEXT:    s_mov_b32 s49, SCRATCH_RSRC_DWORD1
-; GFX1064-NEXT:    s_mov_b32 s50, -1
-; GFX1064-NEXT:    s_mov_b32 s51, 0x31e16000
-; GFX1064-NEXT:    s_add_u32 s48, s48, s11
+; GFX1064-NEXT:    s_mov_b32 s64, SCRATCH_RSRC_DWORD0
+; GFX1064-NEXT:    s_mov_b32 s65, SCRATCH_RSRC_DWORD1
+; GFX1064-NEXT:    s_mov_b32 s66, -1
+; GFX1064-NEXT:    s_mov_b32 s67, 0x31e16000
+; GFX1064-NEXT:    s_add_u32 s64, s64, s11
 ; GFX1064-NEXT:    s_mov_b64 s[34:35], s[4:5]
-; GFX1064-NEXT:    s_addc_u32 s49, s49, 0
-; GFX1064-NEXT:    s_mov_b32 s43, s8
+; GFX1064-NEXT:    s_addc_u32 s65, s65, 0
+; GFX1064-NEXT:    s_mov_b32 s51, s8
 ; GFX1064-NEXT:    s_add_u32 s8, s34, 44
-; GFX1064-NEXT:    s_mov_b32 s42, s9
+; GFX1064-NEXT:    s_mov_b32 s50, s9
 ; GFX1064-NEXT:    s_addc_u32 s9, s35, 0
-; GFX1064-NEXT:    s_mov_b64 s[40:41], s[0:1]
+; GFX1064-NEXT:    s_mov_b64 s[48:49], s[0:1]
 ; GFX1064-NEXT:    s_getpc_b64 s[0:1]
 ; GFX1064-NEXT:    s_add_u32 s0, s0, div.double.value at gotpcrel32@lo+4
 ; GFX1064-NEXT:    s_addc_u32 s1, s1, div.double.value at gotpcrel32@hi+12
@@ -4264,14 +4264,14 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent
 ; GFX1064-NEXT:    s_mov_b64 s[38:39], s[2:3]
 ; GFX1064-NEXT:    s_mov_b64 s[6:7], s[2:3]
 ; GFX1064-NEXT:    v_or3_b32 v40, v0, v1, v2
-; GFX1064-NEXT:    s_mov_b64 s[0:1], s[48:49]
-; GFX1064-NEXT:    s_mov_b64 s[4:5], s[40:41]
+; GFX1064-NEXT:    s_mov_b64 s[0:1], s[64:65]
+; GFX1064-NEXT:    s_mov_b64 s[4:5], s[48:49]
 ; GFX1064-NEXT:    s_mov_b64 s[10:11], s[36:37]
-; GFX1064-NEXT:    s_mov_b32 s12, s43
+; GFX1064-NEXT:    s_mov_b32 s12, s51
 ; GFX1064-NEXT:    v_mov_b32_e32 v31, v40
-; GFX1064-NEXT:    s_mov_b32 s13, s42
+; GFX1064-NEXT:    s_mov_b32 s13, s50
 ; GFX1064-NEXT:    s_mov_b32 s14, s33
-; GFX1064-NEXT:    s_mov_b64 s[2:3], s[50:51]
+; GFX1064-NEXT:    s_mov_b64 s[2:3], s[66:67]
 ; GFX1064-NEXT:    s_movk_i32 s32, 0x800
 ; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1064-NEXT:    s_swappc_b64 s[30:31], s[16:17]
@@ -4298,12 +4298,12 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent
 ; GFX1064-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
 ; GFX1064-NEXT:    s_cbranch_execz .LBB7_5
 ; GFX1064-NEXT:  ; %bb.3:
-; GFX1064-NEXT:    s_load_dwordx2 s[44:45], s[34:35], 0x24
+; GFX1064-NEXT:    s_load_dwordx2 s[52:53], s[34:35], 0x24
 ; GFX1064-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1064-NEXT:    v_max_f64 v[41:42], v[2:3], v[2:3]
-; GFX1064-NEXT:    s_mov_b64 s[46:47], 0
+; GFX1064-NEXT:    s_mov_b64 s[54:55], 0
 ; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT:    global_load_dwordx2 v[4:5], v0, s[44:45]
+; GFX1064-NEXT:    global_load_dwordx2 v[4:5], v0, s[52:53]
 ; GFX1064-NEXT:  .LBB7_4: ; %atomicrmw.start
 ; GFX1064-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1064-NEXT:    s_waitcnt vmcnt(0)
@@ -4313,56 +4313,56 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent
 ; GFX1064-NEXT:    s_getpc_b64 s[0:1]
 ; GFX1064-NEXT:    s_add_u32 s0, s0, __atomic_compare_exchange at gotpcrel32@lo+4
 ; GFX1064-NEXT:    s_addc_u32 s1, s1, __atomic_compare_exchange at gotpcrel32@hi+12
-; GFX1064-NEXT:    buffer_store_dword v5, off, s[48:51], 0 offset:4
-; GFX1064-NEXT:    buffer_store_dword v4, off, s[48:51], 0
+; GFX1064-NEXT:    buffer_store_dword v5, off, s[64:67], 0 offset:4
+; GFX1064-NEXT:    buffer_store_dword v4, off, s[64:67], 0
 ; GFX1064-NEXT:    s_load_dwordx2 s[16:17], s[0:1], 0x0
 ; GFX1064-NEXT:    v_mov_b32_e32 v31, v40
-; GFX1064-NEXT:    v_mov_b32_e32 v2, s44
-; GFX1064-NEXT:    v_mov_b32_e32 v3, s45
+; GFX1064-NEXT:    v_mov_b32_e32 v2, s52
+; GFX1064-NEXT:    v_mov_b32_e32 v3, s53
 ; GFX1064-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX1064-NEXT:    v_mov_b32_e32 v5, 8
 ; GFX1064-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX1064-NEXT:    v_mov_b32_e32 v7, 0
-; GFX1064-NEXT:    s_mov_b64 s[0:1], s[48:49]
-; GFX1064-NEXT:    s_mov_b64 s[4:5], s[40:41]
+; GFX1064-NEXT:    s_mov_b64 s[0:1], s[64:65]
+; GFX1064-NEXT:    s_mov_b64 s[4:5], s[48:49]
 ; GFX1064-NEXT:    s_mov_b64 s[6:7], s[38:39]
 ; GFX1064-NEXT:    s_mov_b64 s[10:11], s[36:37]
-; GFX1064-NEXT:    s_mov_b32 s12, s43
-; GFX1064-NEXT:    s_mov_b32 s13, s42
+; GFX1064-NEXT:    s_mov_b32 s12, s51
+; GFX1064-NEXT:    s_mov_b32 s13, s50
 ; GFX1064-NEXT:    s_mov_b32 s14, s33
-; GFX1064-NEXT:    s_mov_b64 s[2:3], s[50:51]
+; GFX1064-NEXT:    s_mov_b64 s[2:3], s[66:67]
 ; GFX1064-NEXT:    v_max_f64 v[0:1], v[0:1], v[41:42]
-; GFX1064-NEXT:    buffer_store_dword v1, off, s[48:51], 0 offset:12
-; GFX1064-NEXT:    buffer_store_dword v0, off, s[48:51], 0 offset:8
+; GFX1064-NEXT:    buffer_store_dword v1, off, s[64:67], 0 offset:12
+; GFX1064-NEXT:    buffer_store_dword v0, off, s[64:67], 0 offset:8
 ; GFX1064-NEXT:    v_mov_b32_e32 v0, 8
 ; GFX1064-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1064-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GFX1064-NEXT:    s_clause 0x1
-; GFX1064-NEXT:    buffer_load_dword v4, off, s[48:51], 0
-; GFX1064-NEXT:    buffer_load_dword v5, off, s[48:51], 0 offset:4
+; GFX1064-NEXT:    buffer_load_dword v4, off, s[64:67], 0
+; GFX1064-NEXT:    buffer_load_dword v5, off, s[64:67], 0 offset:4
 ; GFX1064-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX1064-NEXT:    s_or_b64 s[46:47], vcc, s[46:47]
-; GFX1064-NEXT:    s_andn2_b64 exec, exec, s[46:47]
+; GFX1064-NEXT:    s_or_b64 s[54:55], vcc, s[54:55]
+; GFX1064-NEXT:    s_andn2_b64 exec, exec, s[54:55]
 ; GFX1064-NEXT:    s_cbranch_execnz .LBB7_4
 ; GFX1064-NEXT:  .LBB7_5:
 ; GFX1064-NEXT:    s_endpgm
 ;
 ; GFX1032-LABEL: global_atomic_fmax_double_uni_address_div_value_agent_scope_unsafe:
 ; GFX1032:       ; %bb.0:
-; GFX1032-NEXT:    s_mov_b32 s48, SCRATCH_RSRC_DWORD0
-; GFX1032-NEXT:    s_mov_b32 s49, SCRATCH_RSRC_DWORD1
-; GFX1032-NEXT:    s_mov_b32 s50, -1
-; GFX1032-NEXT:    s_mov_b32 s51, 0x31c16000
-; GFX1032-NEXT:    s_add_u32 s48, s48, s11
+; GFX1032-NEXT:    s_mov_b32 s64, SCRATCH_RSRC_DWORD0
+; GFX1032-NEXT:    s_mov_b32 s65, SCRATCH_RSRC_DWORD1
+; GFX1032-NEXT:    s_mov_b32 s66, -1
+; GFX1032-NEXT:    s_mov_b32 s67, 0x31c16000
+; GFX1032-NEXT:    s_add_u32 s64, s64, s11
 ; GFX1032-NEXT:    s_mov_b64 s[34:35], s[4:5]
-; GFX1032-NEXT:    s_addc_u32 s49, s49, 0
-; GFX1032-NEXT:    s_mov_b32 s43, s8
+; GFX1032-NEXT:    s_addc_u32 s65, s65, 0
+; GFX1032-NEXT:    s_mov_b32 s51, s8
 ; GFX1032-NEXT:    s_add_u32 s8, s34, 44
-; GFX1032-NEXT:    s_mov_b32 s42, s9
+; GFX1032-NEXT:    s_mov_b32 s50, s9
 ; GFX1032-NEXT:    s_addc_u32 s9, s35, 0
-; GFX1032-NEXT:    s_mov_b64 s[40:41], s[0:1]
+; GFX1032-NEXT:    s_mov_b64 s[48:49], s[0:1]
 ; GFX1032-NEXT:    s_getpc_b64 s[0:1]
 ; GFX1032-NEXT:    s_add_u32 s0, s0, div.double.value at gotpcrel32@lo+4
 ; GFX1032-NEXT:    s_addc_u32 s1, s1, div.double.value at gotpcrel32@hi+12
@@ -4374,14 +4374,14 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent
 ; GFX1032-NEXT:    s_mov_b64 s[38:39], s[2:3]
 ; GFX1032-NEXT:    s_mov_b64 s[6:7], s[2:3]
 ; GFX1032-NEXT:    v_or3_b32 v40, v0, v1, v2
-; GFX1032-NEXT:    s_mov_b64 s[0:1], s[48:49]
-; GFX1032-NEXT:    s_mov_b64 s[4:5], s[40:41]
+; GFX1032-NEXT:    s_mov_b64 s[0:1], s[64:65]
+; GFX1032-NEXT:    s_mov_b64 s[4:5], s[48:49]
 ; GFX1032-NEXT:    s_mov_b64 s[10:11], s[36:37]
-; GFX1032-NEXT:    s_mov_b32 s12, s43
+; GFX1032-NEXT:    s_mov_b32 s12, s51
 ; GFX1032-NEXT:    v_mov_b32_e32 v31, v40
-; GFX1032-NEXT:    s_mov_b32 s13, s42
+; GFX1032-NEXT:    s_mov_b32 s13, s50
 ; GFX1032-NEXT:    s_mov_b32 s14, s33
-; GFX1032-NEXT:    s_mov_b64 s[2:3], s[50:51]
+; GFX1032-NEXT:    s_mov_b64 s[2:3], s[66:67]
 ; GFX1032-NEXT:    s_movk_i32 s32, 0x400
 ; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1032-NEXT:    s_swappc_b64 s[30:31], s[16:17]
@@ -4402,17 +4402,17 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent
 ; GFX1032-NEXT:    s_cbranch_scc1 .LBB7_1
 ; GFX1032-NEXT:  ; %bb.2: ; %ComputeEnd
 ; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1032-NEXT:    s_mov_b32 s46, 0
+; GFX1032-NEXT:    s_mov_b32 s54, 0
 ; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX1032-NEXT:    s_and_saveexec_b32 s0, vcc_lo
 ; GFX1032-NEXT:    s_xor_b32 s0, exec_lo, s0
 ; GFX1032-NEXT:    s_cbranch_execz .LBB7_5
 ; GFX1032-NEXT:  ; %bb.3:
-; GFX1032-NEXT:    s_load_dwordx2 s[44:45], s[34:35], 0x24
+; GFX1032-NEXT:    s_load_dwordx2 s[52:53], s[34:35], 0x24
 ; GFX1032-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1032-NEXT:    v_max_f64 v[41:42], v[2:3], v[2:3]
 ; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT:    global_load_dwordx2 v[4:5], v0, s[44:45]
+; GFX1032-NEXT:    global_load_dwordx2 v[4:5], v0, s[52:53]
 ; GFX1032-NEXT:  .LBB7_4: ; %atomicrmw.start
 ; GFX1032-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1032-NEXT:    s_waitcnt vmcnt(0)
@@ -4422,38 +4422,38 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent
 ; GFX1032-NEXT:    s_getpc_b64 s[0:1]
 ; GFX1032-NEXT:    s_add_u32 s0, s0, __atomic_compare_exchange at gotpcrel32@lo+4
 ; GFX1032-NEXT:    s_addc_u32 s1, s1, __atomic_compare_exchange at gotpcrel32@hi+12
-; GFX1032-NEXT:    buffer_store_dword v5, off, s[48:51], 0 offset:4
-; GFX1032-NEXT:    buffer_store_dword v4, off, s[48:51], 0
+; GFX1032-NEXT:    buffer_store_dword v5, off, s[64:67], 0 offset:4
+; GFX1032-NEXT:    buffer_store_dword v4, off, s[64:67], 0
 ; GFX1032-NEXT:    s_load_dwordx2 s[16:17], s[0:1], 0x0
 ; GFX1032-NEXT:    v_mov_b32_e32 v31, v40
-; GFX1032-NEXT:    v_mov_b32_e32 v2, s44
-; GFX1032-NEXT:    v_mov_b32_e32 v3, s45
+; GFX1032-NEXT:    v_mov_b32_e32 v2, s52
+; GFX1032-NEXT:    v_mov_b32_e32 v3, s53
 ; GFX1032-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX1032-NEXT:    v_mov_b32_e32 v5, 8
 ; GFX1032-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX1032-NEXT:    v_mov_b32_e32 v7, 0
-; GFX1032-NEXT:    s_mov_b64 s[0:1], s[48:49]
-; GFX1032-NEXT:    s_mov_b64 s[4:5], s[40:41]
+; GFX1032-NEXT:    s_mov_b64 s[0:1], s[64:65]
+; GFX1032-NEXT:    s_mov_b64 s[4:5], s[48:49]
 ; GFX1032-NEXT:    s_mov_b64 s[6:7], s[38:39]
 ; GFX1032-NEXT:    s_mov_b64 s[10:11], s[36:37]
-; GFX1032-NEXT:    s_mov_b32 s12, s43
-; GFX1032-NEXT:    s_mov_b32 s13, s42
+; GFX1032-NEXT:    s_mov_b32 s12, s51
+; GFX1032-NEXT:    s_mov_b32 s13, s50
 ; GFX1032-NEXT:    s_mov_b32 s14, s33
-; GFX1032-NEXT:    s_mov_b64 s[2:3], s[50:51]
+; GFX1032-NEXT:    s_mov_b64 s[2:3], s[66:67]
 ; GFX1032-NEXT:    v_max_f64 v[0:1], v[0:1], v[41:42]
-; GFX1032-NEXT:    buffer_store_dword v1, off, s[48:51], 0 offset:12
-; GFX1032-NEXT:    buffer_store_dword v0, off, s[48:51], 0 offset:8
+; GFX1032-NEXT:    buffer_store_dword v1, off, s[64:67], 0 offset:12
+; GFX1032-NEXT:    buffer_store_dword v0, off, s[64:67], 0 offset:8
 ; GFX1032-NEXT:    v_mov_b32_e32 v0, 8
 ; GFX1032-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1032-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GFX1032-NEXT:    s_clause 0x1
-; GFX1032-NEXT:    buffer_load_dword v4, off, s[48:51], 0
-; GFX1032-NEXT:    buffer_load_dword v5, off, s[48:51], 0 offset:4
+; GFX1032-NEXT:    buffer_load_dword v4, off, s[64:67], 0
+; GFX1032-NEXT:    buffer_load_dword v5, off, s[64:67], 0 offset:4
 ; GFX1032-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX1032-NEXT:    s_or_b32 s46, vcc_lo, s46
-; GFX1032-NEXT:    s_andn2_b32 exec_lo, exec_lo, s46
+; GFX1032-NEXT:    s_or_b32 s54, vcc_lo, s54
+; GFX1032-NEXT:    s_andn2_b32 exec_lo, exec_lo, s54
 ; GFX1032-NEXT:    s_cbranch_execnz .LBB7_4
 ; GFX1032-NEXT:  .LBB7_5:
 ; GFX1032-NEXT:    s_endpgm
@@ -4461,11 +4461,11 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent
 ; GFX1164-LABEL: global_atomic_fmax_double_uni_address_div_value_agent_scope_unsafe:
 ; GFX1164:       ; %bb.0:
 ; GFX1164-NEXT:    s_mov_b64 s[34:35], s[4:5]
-; GFX1164-NEXT:    s_mov_b32 s43, s8
+; GFX1164-NEXT:    s_mov_b32 s51, s8
 ; GFX1164-NEXT:    s_add_u32 s8, s34, 44
-; GFX1164-NEXT:    s_mov_b32 s42, s9
+; GFX1164-NEXT:    s_mov_b32 s50, s9
 ; GFX1164-NEXT:    s_addc_u32 s9, s35, 0
-; GFX1164-NEXT:    s_mov_b64 s[40:41], s[0:1]
+; GFX1164-NEXT:    s_mov_b64 s[48:49], s[0:1]
 ; GFX1164-NEXT:    s_getpc_b64 s[0:1]
 ; GFX1164-NEXT:    s_add_u32 s0, s0, div.double.value at gotpcrel32@lo+4
 ; GFX1164-NEXT:    s_addc_u32 s1, s1, div.double.value at gotpcrel32@hi+12
@@ -4473,11 +4473,11 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent
 ; GFX1164-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
 ; GFX1164-NEXT:    s_mov_b32 s33, s10
 ; GFX1164-NEXT:    s_mov_b64 s[36:37], s[6:7]
-; GFX1164-NEXT:    s_mov_b64 s[4:5], s[40:41]
+; GFX1164-NEXT:    s_mov_b64 s[4:5], s[48:49]
 ; GFX1164-NEXT:    s_mov_b64 s[6:7], s[2:3]
 ; GFX1164-NEXT:    s_mov_b64 s[10:11], s[36:37]
-; GFX1164-NEXT:    s_mov_b32 s12, s43
-; GFX1164-NEXT:    s_mov_b32 s13, s42
+; GFX1164-NEXT:    s_mov_b32 s12, s51
+; GFX1164-NEXT:    s_mov_b32 s13, s50
 ; GFX1164-NEXT:    s_mov_b32 s14, s33
 ; GFX1164-NEXT:    s_mov_b32 s32, 32
 ; GFX1164-NEXT:    v_mov_b32_e32 v40, v0
@@ -4511,12 +4511,12 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent
 ; GFX1164-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
 ; GFX1164-NEXT:    s_cbranch_execz .LBB7_5
 ; GFX1164-NEXT:  ; %bb.3:
-; GFX1164-NEXT:    s_load_b64 s[44:45], s[34:35], 0x24
+; GFX1164-NEXT:    s_load_b64 s[52:53], s[34:35], 0x24
 ; GFX1164-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1164-NEXT:    v_max_f64 v[41:42], v[2:3], v[2:3]
-; GFX1164-NEXT:    s_mov_b64 s[46:47], 0
+; GFX1164-NEXT:    s_mov_b64 s[54:55], 0
 ; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT:    global_load_b64 v[4:5], v0, s[44:45]
+; GFX1164-NEXT:    global_load_b64 v[4:5], v0, s[52:53]
 ; GFX1164-NEXT:    s_set_inst_prefetch_distance 0x1
 ; GFX1164-NEXT:    .p2align 6
 ; GFX1164-NEXT:  .LBB7_4: ; %atomicrmw.start
@@ -4530,15 +4530,15 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent
 ; GFX1164-NEXT:    s_addc_u32 s1, s1, __atomic_compare_exchange at gotpcrel32@hi+12
 ; GFX1164-NEXT:    v_mov_b32_e32 v31, v40
 ; GFX1164-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
-; GFX1164-NEXT:    v_mov_b32_e32 v2, s44
-; GFX1164-NEXT:    v_mov_b32_e32 v3, s45
+; GFX1164-NEXT:    v_mov_b32_e32 v2, s52
+; GFX1164-NEXT:    v_mov_b32_e32 v3, s53
 ; GFX1164-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX1164-NEXT:    v_mov_b32_e32 v7, 0
-; GFX1164-NEXT:    s_mov_b64 s[4:5], s[40:41]
+; GFX1164-NEXT:    s_mov_b64 s[4:5], s[48:49]
 ; GFX1164-NEXT:    s_mov_b64 s[6:7], s[38:39]
 ; GFX1164-NEXT:    s_mov_b64 s[10:11], s[36:37]
-; GFX1164-NEXT:    s_mov_b32 s12, s43
-; GFX1164-NEXT:    s_mov_b32 s13, s42
+; GFX1164-NEXT:    s_mov_b32 s12, s51
+; GFX1164-NEXT:    s_mov_b32 s13, s50
 ; GFX1164-NEXT:    s_mov_b32 s14, s33
 ; GFX1164-NEXT:    v_max_f64 v[0:1], v[0:1], v[41:42]
 ; GFX1164-NEXT:    scratch_store_b64 off, v[4:5], off
@@ -4553,8 +4553,8 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent
 ; GFX1164-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
 ; GFX1164-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX1164-NEXT:    s_or_b64 s[46:47], vcc, s[46:47]
-; GFX1164-NEXT:    s_and_not1_b64 exec, exec, s[46:47]
+; GFX1164-NEXT:    s_or_b64 s[54:55], vcc, s[54:55]
+; GFX1164-NEXT:    s_and_not1_b64 exec, exec, s[54:55]
 ; GFX1164-NEXT:    s_cbranch_execnz .LBB7_4
 ; GFX1164-NEXT:  .LBB7_5:
 ; GFX1164-NEXT:    s_set_inst_prefetch_distance 0x2
@@ -4563,7 +4563,7 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent
 ; GFX1132-LABEL: global_atomic_fmax_double_uni_address_div_value_agent_scope_unsafe:
 ; GFX1132:       ; %bb.0:
 ; GFX1132-NEXT:    s_mov_b64 s[34:35], s[4:5]
-; GFX1132-NEXT:    s_mov_b64 s[40:41], s[0:1]
+; GFX1132-NEXT:    s_mov_b64 s[48:49], s[0:1]
 ; GFX1132-NEXT:    s_add_u32 s8, s34, 44
 ; GFX1132-NEXT:    s_addc_u32 s9, s35, 0
 ; GFX1132-NEXT:    s_getpc_b64 s[0:1]
@@ -4572,9 +4572,9 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent
 ; GFX1132-NEXT:    v_mov_b32_e32 v31, v0
 ; GFX1132-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
 ; GFX1132-NEXT:    s_mov_b64 s[36:37], s[6:7]
-; GFX1132-NEXT:    s_mov_b32 s42, s14
-; GFX1132-NEXT:    s_mov_b32 s43, s13
-; GFX1132-NEXT:    s_mov_b64 s[4:5], s[40:41]
+; GFX1132-NEXT:    s_mov_b32 s50, s14
+; GFX1132-NEXT:    s_mov_b32 s51, s13
+; GFX1132-NEXT:    s_mov_b64 s[4:5], s[48:49]
 ; GFX1132-NEXT:    s_mov_b64 s[6:7], s[2:3]
 ; GFX1132-NEXT:    s_mov_b64 s[10:11], s[36:37]
 ; GFX1132-NEXT:    s_mov_b32 s12, s13
@@ -4606,19 +4606,19 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent
 ; GFX1132-NEXT:    s_cbranch_scc1 .LBB7_1
 ; GFX1132-NEXT:  ; %bb.2: ; %ComputeEnd
 ; GFX1132-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1132-NEXT:    s_mov_b32 s46, 0
+; GFX1132-NEXT:    s_mov_b32 s54, 0
 ; GFX1132-NEXT:    s_mov_b32 s0, exec_lo
 ; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1132-NEXT:    v_cmpx_eq_u32_e32 0, v0
 ; GFX1132-NEXT:    s_xor_b32 s0, exec_lo, s0
 ; GFX1132-NEXT:    s_cbranch_execz .LBB7_5
 ; GFX1132-NEXT:  ; %bb.3:
-; GFX1132-NEXT:    s_load_b64 s[44:45], s[34:35], 0x24
+; GFX1132-NEXT:    s_load_b64 s[52:53], s[34:35], 0x24
 ; GFX1132-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_4)
 ; GFX1132-NEXT:    v_max_f64 v[41:42], v[2:3], v[2:3]
 ; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT:    global_load_b64 v[4:5], v0, s[44:45]
+; GFX1132-NEXT:    global_load_b64 v[4:5], v0, s[52:53]
 ; GFX1132-NEXT:    s_set_inst_prefetch_distance 0x1
 ; GFX1132-NEXT:    .p2align 6
 ; GFX1132-NEXT:  .LBB7_4: ; %atomicrmw.start
@@ -4632,16 +4632,16 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent
 ; GFX1132-NEXT:    s_addc_u32 s1, s1, __atomic_compare_exchange at gotpcrel32@hi+12
 ; GFX1132-NEXT:    v_mov_b32_e32 v31, v40
 ; GFX1132-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
-; GFX1132-NEXT:    v_mov_b32_e32 v3, s45
+; GFX1132-NEXT:    v_mov_b32_e32 v3, s53
 ; GFX1132-NEXT:    v_mov_b32_e32 v7, 0
-; GFX1132-NEXT:    s_mov_b64 s[4:5], s[40:41]
+; GFX1132-NEXT:    s_mov_b64 s[4:5], s[48:49]
 ; GFX1132-NEXT:    s_mov_b64 s[6:7], s[38:39]
 ; GFX1132-NEXT:    s_mov_b64 s[10:11], s[36:37]
-; GFX1132-NEXT:    s_mov_b32 s12, s43
-; GFX1132-NEXT:    s_mov_b32 s13, s42
+; GFX1132-NEXT:    s_mov_b32 s12, s51
+; GFX1132-NEXT:    s_mov_b32 s13, s50
 ; GFX1132-NEXT:    s_mov_b32 s14, s33
 ; GFX1132-NEXT:    v_mov_b32_e32 v6, 0
-; GFX1132-NEXT:    v_mov_b32_e32 v2, s44
+; GFX1132-NEXT:    v_mov_b32_e32 v2, s52
 ; GFX1132-NEXT:    v_max_f64 v[0:1], v[0:1], v[41:42]
 ; GFX1132-NEXT:    scratch_store_b64 off, v[4:5], off
 ; GFX1132-NEXT:    v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v5, 8
@@ -4653,8 +4653,8 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent
 ; GFX1132-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
 ; GFX1132-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX1132-NEXT:    s_or_b32 s46, vcc_lo, s46
-; GFX1132-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s46
+; GFX1132-NEXT:    s_or_b32 s54, vcc_lo, s54
+; GFX1132-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s54
 ; GFX1132-NEXT:    s_cbranch_execnz .LBB7_4
 ; GFX1132-NEXT:  .LBB7_5:
 ; GFX1132-NEXT:    s_set_inst_prefetch_distance 0x2
@@ -4663,22 +4663,22 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent
 ; GFX7LESS-DPP-LABEL: global_atomic_fmax_double_uni_address_div_value_agent_scope_unsafe:
 ; GFX7LESS-DPP:       ; %bb.0:
 ; GFX7LESS-DPP-NEXT:    s_movk_i32 s32, 0x800
-; GFX7LESS-DPP-NEXT:    s_mov_b32 s48, SCRATCH_RSRC_DWORD0
-; GFX7LESS-DPP-NEXT:    s_mov_b32 s49, SCRATCH_RSRC_DWORD1
-; GFX7LESS-DPP-NEXT:    s_mov_b32 s50, -1
-; GFX7LESS-DPP-NEXT:    s_mov_b32 s51, 0xe8f000
-; GFX7LESS-DPP-NEXT:    s_add_u32 s48, s48, s11
-; GFX7LESS-DPP-NEXT:    s_addc_u32 s49, s49, 0
+; GFX7LESS-DPP-NEXT:    s_mov_b32 s64, SCRATCH_RSRC_DWORD0
+; GFX7LESS-DPP-NEXT:    s_mov_b32 s65, SCRATCH_RSRC_DWORD1
+; GFX7LESS-DPP-NEXT:    s_mov_b32 s66, -1
+; GFX7LESS-DPP-NEXT:    s_mov_b32 s67, 0xe8f000
+; GFX7LESS-DPP-NEXT:    s_add_u32 s64, s64, s11
+; GFX7LESS-DPP-NEXT:    s_addc_u32 s65, s65, 0
 ; GFX7LESS-DPP-NEXT:    s_mov_b32 s33, s10
-; GFX7LESS-DPP-NEXT:    s_mov_b32 s42, s9
-; GFX7LESS-DPP-NEXT:    s_mov_b32 s43, s8
+; GFX7LESS-DPP-NEXT:    s_mov_b32 s50, s9
+; GFX7LESS-DPP-NEXT:    s_mov_b32 s51, s8
 ; GFX7LESS-DPP-NEXT:    s_mov_b64 s[34:35], s[6:7]
 ; GFX7LESS-DPP-NEXT:    s_mov_b64 s[36:37], s[4:5]
 ; GFX7LESS-DPP-NEXT:    s_mov_b64 s[38:39], s[2:3]
-; GFX7LESS-DPP-NEXT:    s_mov_b64 s[40:41], s[0:1]
-; GFX7LESS-DPP-NEXT:    s_load_dwordx2 s[44:45], s[4:5], 0x9
-; GFX7LESS-DPP-NEXT:    s_mov_b32 s47, 0xf000
-; GFX7LESS-DPP-NEXT:    s_mov_b32 s46, -1
+; GFX7LESS-DPP-NEXT:    s_mov_b64 s[48:49], s[0:1]
+; GFX7LESS-DPP-NEXT:    s_load_dwordx2 s[52:53], s[4:5], 0x9
+; GFX7LESS-DPP-NEXT:    s_mov_b32 s55, 0xf000
+; GFX7LESS-DPP-NEXT:    s_mov_b32 s54, -1
 ; GFX7LESS-DPP-NEXT:    s_add_u32 s8, s36, 44
 ; GFX7LESS-DPP-NEXT:    s_addc_u32 s9, s37, 0
 ; GFX7LESS-DPP-NEXT:    s_getpc_b64 s[0:1]
@@ -4689,34 +4689,34 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent
 ; GFX7LESS-DPP-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
 ; GFX7LESS-DPP-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GFX7LESS-DPP-NEXT:    v_or_b32_e32 v40, v0, v2
-; GFX7LESS-DPP-NEXT:    s_mov_b64 s[4:5], s[40:41]
+; GFX7LESS-DPP-NEXT:    s_mov_b64 s[4:5], s[48:49]
 ; GFX7LESS-DPP-NEXT:    s_mov_b64 s[6:7], s[2:3]
 ; GFX7LESS-DPP-NEXT:    s_mov_b64 s[10:11], s[34:35]
-; GFX7LESS-DPP-NEXT:    s_mov_b32 s12, s43
-; GFX7LESS-DPP-NEXT:    s_mov_b32 s13, s42
+; GFX7LESS-DPP-NEXT:    s_mov_b32 s12, s51
+; GFX7LESS-DPP-NEXT:    s_mov_b32 s13, s50
 ; GFX7LESS-DPP-NEXT:    s_mov_b32 s14, s33
 ; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v31, v40
-; GFX7LESS-DPP-NEXT:    s_mov_b64 s[0:1], s[48:49]
-; GFX7LESS-DPP-NEXT:    s_mov_b64 s[2:3], s[50:51]
+; GFX7LESS-DPP-NEXT:    s_mov_b64 s[0:1], s[64:65]
+; GFX7LESS-DPP-NEXT:    s_mov_b64 s[2:3], s[66:67]
 ; GFX7LESS-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7LESS-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
-; GFX7LESS-DPP-NEXT:    buffer_load_dwordx2 v[2:3], off, s[44:47], 0
-; GFX7LESS-DPP-NEXT:    s_mov_b64 s[46:47], 0
+; GFX7LESS-DPP-NEXT:    buffer_load_dwordx2 v[2:3], off, s[52:55], 0
+; GFX7LESS-DPP-NEXT:    s_mov_b64 s[54:55], 0
 ; GFX7LESS-DPP-NEXT:    v_max_f64 v[41:42], v[0:1], v[0:1]
 ; GFX7LESS-DPP-NEXT:  .LBB7_1: ; %atomicrmw.start
 ; GFX7LESS-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX7LESS-DPP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7LESS-DPP-NEXT:    v_max_f64 v[0:1], v[2:3], v[2:3]
-; GFX7LESS-DPP-NEXT:    buffer_store_dword v3, off, s[48:51], 0 offset:4
-; GFX7LESS-DPP-NEXT:    buffer_store_dword v2, off, s[48:51], 0
+; GFX7LESS-DPP-NEXT:    buffer_store_dword v3, off, s[64:67], 0 offset:4
+; GFX7LESS-DPP-NEXT:    buffer_store_dword v2, off, s[64:67], 0
 ; GFX7LESS-DPP-NEXT:    s_add_u32 s8, s36, 44
 ; GFX7LESS-DPP-NEXT:    v_max_f64 v[0:1], v[0:1], v[41:42]
 ; GFX7LESS-DPP-NEXT:    s_addc_u32 s9, s37, 0
 ; GFX7LESS-DPP-NEXT:    s_getpc_b64 s[0:1]
 ; GFX7LESS-DPP-NEXT:    s_add_u32 s0, s0, __atomic_compare_exchange at gotpcrel32@lo+4
 ; GFX7LESS-DPP-NEXT:    s_addc_u32 s1, s1, __atomic_compare_exchange at gotpcrel32@hi+12
-; GFX7LESS-DPP-NEXT:    buffer_store_dword v1, off, s[48:51], 0 offset:12
-; GFX7LESS-DPP-NEXT:    buffer_store_dword v0, off, s[48:51], 0 offset:8
+; GFX7LESS-DPP-NEXT:    buffer_store_dword v1, off, s[64:67], 0 offset:12
+; GFX7LESS-DPP-NEXT:    buffer_store_dword v0, off, s[64:67], 0 offset:8
 ; GFX7LESS-DPP-NEXT:    s_load_dwordx2 s[16:17], s[0:1], 0x0
 ; GFX7LESS-DPP-NEXT:    s_waitcnt expcnt(0)
 ; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v0, 8
@@ -4725,43 +4725,43 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent
 ; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v5, 8
 ; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v7, 0
-; GFX7LESS-DPP-NEXT:    s_mov_b64 s[4:5], s[40:41]
+; GFX7LESS-DPP-NEXT:    s_mov_b64 s[4:5], s[48:49]
 ; GFX7LESS-DPP-NEXT:    s_mov_b64 s[6:7], s[38:39]
 ; GFX7LESS-DPP-NEXT:    s_mov_b64 s[10:11], s[34:35]
-; GFX7LESS-DPP-NEXT:    s_mov_b32 s12, s43
-; GFX7LESS-DPP-NEXT:    s_mov_b32 s13, s42
+; GFX7LESS-DPP-NEXT:    s_mov_b32 s12, s51
+; GFX7LESS-DPP-NEXT:    s_mov_b32 s13, s50
 ; GFX7LESS-DPP-NEXT:    s_mov_b32 s14, s33
 ; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v31, v40
-; GFX7LESS-DPP-NEXT:    s_mov_b64 s[0:1], s[48:49]
-; GFX7LESS-DPP-NEXT:    s_mov_b64 s[2:3], s[50:51]
-; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v2, s44
-; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v3, s45
+; GFX7LESS-DPP-NEXT:    s_mov_b64 s[0:1], s[64:65]
+; GFX7LESS-DPP-NEXT:    s_mov_b64 s[2:3], s[66:67]
+; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v2, s52
+; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v3, s53
 ; GFX7LESS-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7LESS-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GFX7LESS-DPP-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX7LESS-DPP-NEXT:    buffer_load_dword v2, off, s[48:51], 0
-; GFX7LESS-DPP-NEXT:    buffer_load_dword v3, off, s[48:51], 0 offset:4
+; GFX7LESS-DPP-NEXT:    buffer_load_dword v2, off, s[64:67], 0
+; GFX7LESS-DPP-NEXT:    buffer_load_dword v3, off, s[64:67], 0 offset:4
 ; GFX7LESS-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX7LESS-DPP-NEXT:    s_or_b64 s[46:47], vcc, s[46:47]
-; GFX7LESS-DPP-NEXT:    s_andn2_b64 exec, exec, s[46:47]
+; GFX7LESS-DPP-NEXT:    s_or_b64 s[54:55], vcc, s[54:55]
+; GFX7LESS-DPP-NEXT:    s_andn2_b64 exec, exec, s[54:55]
 ; GFX7LESS-DPP-NEXT:    s_cbranch_execnz .LBB7_1
 ; GFX7LESS-DPP-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX7LESS-DPP-NEXT:    s_endpgm
 ;
 ; GFX9-DPP-LABEL: global_atomic_fmax_double_uni_address_div_value_agent_scope_unsafe:
 ; GFX9-DPP:       ; %bb.0:
-; GFX9-DPP-NEXT:    s_mov_b32 s52, SCRATCH_RSRC_DWORD0
-; GFX9-DPP-NEXT:    s_mov_b32 s53, SCRATCH_RSRC_DWORD1
-; GFX9-DPP-NEXT:    s_mov_b32 s54, -1
-; GFX9-DPP-NEXT:    s_mov_b32 s55, 0xe00000
-; GFX9-DPP-NEXT:    s_add_u32 s52, s52, s11
-; GFX9-DPP-NEXT:    s_addc_u32 s53, s53, 0
+; GFX9-DPP-NEXT:    s_mov_b32 s68, SCRATCH_RSRC_DWORD0
+; GFX9-DPP-NEXT:    s_mov_b32 s69, SCRATCH_RSRC_DWORD1
+; GFX9-DPP-NEXT:    s_mov_b32 s70, -1
+; GFX9-DPP-NEXT:    s_mov_b32 s71, 0xe00000
+; GFX9-DPP-NEXT:    s_add_u32 s68, s68, s11
+; GFX9-DPP-NEXT:    s_addc_u32 s69, s69, 0
 ; GFX9-DPP-NEXT:    s_mov_b64 s[36:37], s[4:5]
-; GFX9-DPP-NEXT:    s_mov_b32 s43, s8
+; GFX9-DPP-NEXT:    s_mov_b32 s51, s8
 ; GFX9-DPP-NEXT:    s_add_u32 s8, s36, 44
-; GFX9-DPP-NEXT:    s_mov_b32 s42, s9
+; GFX9-DPP-NEXT:    s_mov_b32 s50, s9
 ; GFX9-DPP-NEXT:    s_addc_u32 s9, s37, 0
-; GFX9-DPP-NEXT:    s_mov_b64 s[40:41], s[0:1]
+; GFX9-DPP-NEXT:    s_mov_b64 s[48:49], s[0:1]
 ; GFX9-DPP-NEXT:    s_getpc_b64 s[0:1]
 ; GFX9-DPP-NEXT:    s_add_u32 s0, s0, div.double.value at gotpcrel32@lo+4
 ; GFX9-DPP-NEXT:    s_addc_u32 s1, s1, div.double.value at gotpcrel32@hi+12
@@ -4773,14 +4773,14 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent
 ; GFX9-DPP-NEXT:    s_mov_b64 s[38:39], s[2:3]
 ; GFX9-DPP-NEXT:    v_or3_b32 v40, v0, v1, v2
 ; GFX9-DPP-NEXT:    s_mov_b64 s[6:7], s[2:3]
-; GFX9-DPP-NEXT:    s_mov_b64 s[0:1], s[52:53]
-; GFX9-DPP-NEXT:    s_mov_b64 s[4:5], s[40:41]
+; GFX9-DPP-NEXT:    s_mov_b64 s[0:1], s[68:69]
+; GFX9-DPP-NEXT:    s_mov_b64 s[4:5], s[48:49]
 ; GFX9-DPP-NEXT:    s_mov_b64 s[10:11], s[34:35]
-; GFX9-DPP-NEXT:    s_mov_b32 s12, s43
-; GFX9-DPP-NEXT:    s_mov_b32 s13, s42
+; GFX9-DPP-NEXT:    s_mov_b32 s12, s51
+; GFX9-DPP-NEXT:    s_mov_b32 s13, s50
 ; GFX9-DPP-NEXT:    s_mov_b32 s14, s33
 ; GFX9-DPP-NEXT:    v_mov_b32_e32 v31, v40
-; GFX9-DPP-NEXT:    s_mov_b64 s[2:3], s[54:55]
+; GFX9-DPP-NEXT:    s_mov_b64 s[2:3], s[70:71]
 ; GFX9-DPP-NEXT:    s_movk_i32 s32, 0x800
 ; GFX9-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
@@ -4834,20 +4834,20 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent
 ; GFX9-DPP-NEXT:    v_mbcnt_lo_u32_b32 v1, exec_lo, 0
 ; GFX9-DPP-NEXT:    v_mbcnt_hi_u32_b32 v1, exec_hi, v1
 ; GFX9-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
-; GFX9-DPP-NEXT:    v_readlane_b32 s45, v9, 63
-; GFX9-DPP-NEXT:    v_readlane_b32 s44, v8, 63
+; GFX9-DPP-NEXT:    v_readlane_b32 s53, v9, 63
+; GFX9-DPP-NEXT:    v_readlane_b32 s52, v8, 63
 ; GFX9-DPP-NEXT:    s_mov_b64 exec, s[0:1]
 ; GFX9-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
 ; GFX9-DPP-NEXT:    s_and_saveexec_b64 s[0:1], vcc
 ; GFX9-DPP-NEXT:    s_cbranch_execz .LBB7_3
 ; GFX9-DPP-NEXT:  ; %bb.1:
-; GFX9-DPP-NEXT:    s_load_dwordx2 s[46:47], s[36:37], 0x24
-; GFX9-DPP-NEXT:    s_mov_b64 s[48:49], 0
+; GFX9-DPP-NEXT:    s_load_dwordx2 s[54:55], s[36:37], 0x24
+; GFX9-DPP-NEXT:    s_mov_b64 s[64:65], 0
 ; GFX9-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-DPP-NEXT:    global_load_dwordx2 v[1:2], v0, s[46:47]
+; GFX9-DPP-NEXT:    global_load_dwordx2 v[1:2], v0, s[54:55]
 ; GFX9-DPP-NEXT:  .LBB7_2: ; %atomicrmw.start
 ; GFX9-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX9-DPP-NEXT:    v_max_f64 v[3:4], s[44:45], s[44:45]
+; GFX9-DPP-NEXT:    v_max_f64 v[3:4], s[52:53], s[52:53]
 ; GFX9-DPP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-DPP-NEXT:    v_max_f64 v[5:6], v[1:2], v[1:2]
 ; GFX9-DPP-NEXT:    s_add_u32 s8, s36, 44
@@ -4856,54 +4856,54 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent
 ; GFX9-DPP-NEXT:    s_add_u32 s0, s0, __atomic_compare_exchange at gotpcrel32@lo+4
 ; GFX9-DPP-NEXT:    s_addc_u32 s1, s1, __atomic_compare_exchange at gotpcrel32@hi+12
 ; GFX9-DPP-NEXT:    s_load_dwordx2 s[16:17], s[0:1], 0x0
-; GFX9-DPP-NEXT:    s_mov_b64 s[0:1], s[52:53]
-; GFX9-DPP-NEXT:    buffer_store_dword v2, off, s[52:55], 0 offset:4
-; GFX9-DPP-NEXT:    buffer_store_dword v1, off, s[52:55], 0
-; GFX9-DPP-NEXT:    s_mov_b64 s[4:5], s[40:41]
+; GFX9-DPP-NEXT:    s_mov_b64 s[0:1], s[68:69]
+; GFX9-DPP-NEXT:    buffer_store_dword v2, off, s[68:71], 0 offset:4
+; GFX9-DPP-NEXT:    buffer_store_dword v1, off, s[68:71], 0
+; GFX9-DPP-NEXT:    s_mov_b64 s[4:5], s[48:49]
 ; GFX9-DPP-NEXT:    v_max_f64 v[3:4], v[5:6], v[3:4]
 ; GFX9-DPP-NEXT:    s_mov_b64 s[6:7], s[38:39]
 ; GFX9-DPP-NEXT:    s_mov_b64 s[10:11], s[34:35]
-; GFX9-DPP-NEXT:    s_mov_b32 s12, s43
-; GFX9-DPP-NEXT:    s_mov_b32 s13, s42
+; GFX9-DPP-NEXT:    s_mov_b32 s12, s51
+; GFX9-DPP-NEXT:    s_mov_b32 s13, s50
 ; GFX9-DPP-NEXT:    s_mov_b32 s14, s33
 ; GFX9-DPP-NEXT:    v_mov_b32_e32 v31, v40
-; GFX9-DPP-NEXT:    buffer_store_dword v4, off, s[52:55], 0 offset:12
-; GFX9-DPP-NEXT:    buffer_store_dword v3, off, s[52:55], 0 offset:8
-; GFX9-DPP-NEXT:    s_mov_b64 s[2:3], s[54:55]
+; GFX9-DPP-NEXT:    buffer_store_dword v4, off, s[68:71], 0 offset:12
+; GFX9-DPP-NEXT:    buffer_store_dword v3, off, s[68:71], 0 offset:8
+; GFX9-DPP-NEXT:    s_mov_b64 s[2:3], s[70:71]
 ; GFX9-DPP-NEXT:    v_mov_b32_e32 v0, 8
 ; GFX9-DPP-NEXT:    v_mov_b32_e32 v1, 0
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v2, s46
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v3, s47
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v2, s54
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v3, s55
 ; GFX9-DPP-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX9-DPP-NEXT:    v_mov_b32_e32 v5, 8
 ; GFX9-DPP-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX9-DPP-NEXT:    v_mov_b32_e32 v7, 0
 ; GFX9-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
-; GFX9-DPP-NEXT:    buffer_load_dword v1, off, s[52:55], 0
-; GFX9-DPP-NEXT:    buffer_load_dword v2, off, s[52:55], 0 offset:4
+; GFX9-DPP-NEXT:    buffer_load_dword v1, off, s[68:71], 0
+; GFX9-DPP-NEXT:    buffer_load_dword v2, off, s[68:71], 0 offset:4
 ; GFX9-DPP-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX9-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX9-DPP-NEXT:    s_or_b64 s[48:49], vcc, s[48:49]
-; GFX9-DPP-NEXT:    s_andn2_b64 exec, exec, s[48:49]
+; GFX9-DPP-NEXT:    s_or_b64 s[64:65], vcc, s[64:65]
+; GFX9-DPP-NEXT:    s_andn2_b64 exec, exec, s[64:65]
 ; GFX9-DPP-NEXT:    s_cbranch_execnz .LBB7_2
 ; GFX9-DPP-NEXT:  .LBB7_3:
 ; GFX9-DPP-NEXT:    s_endpgm
 ;
 ; GFX1064-DPP-LABEL: global_atomic_fmax_double_uni_address_div_value_agent_scope_unsafe:
 ; GFX1064-DPP:       ; %bb.0:
-; GFX1064-DPP-NEXT:    s_mov_b32 s48, SCRATCH_RSRC_DWORD0
-; GFX1064-DPP-NEXT:    s_mov_b32 s49, SCRATCH_RSRC_DWORD1
-; GFX1064-DPP-NEXT:    s_mov_b32 s50, -1
-; GFX1064-DPP-NEXT:    s_mov_b32 s51, 0x31e16000
-; GFX1064-DPP-NEXT:    s_add_u32 s48, s48, s11
+; GFX1064-DPP-NEXT:    s_mov_b32 s64, SCRATCH_RSRC_DWORD0
+; GFX1064-DPP-NEXT:    s_mov_b32 s65, SCRATCH_RSRC_DWORD1
+; GFX1064-DPP-NEXT:    s_mov_b32 s66, -1
+; GFX1064-DPP-NEXT:    s_mov_b32 s67, 0x31e16000
+; GFX1064-DPP-NEXT:    s_add_u32 s64, s64, s11
 ; GFX1064-DPP-NEXT:    s_mov_b64 s[34:35], s[4:5]
-; GFX1064-DPP-NEXT:    s_addc_u32 s49, s49, 0
-; GFX1064-DPP-NEXT:    s_mov_b32 s43, s8
+; GFX1064-DPP-NEXT:    s_addc_u32 s65, s65, 0
+; GFX1064-DPP-NEXT:    s_mov_b32 s51, s8
 ; GFX1064-DPP-NEXT:    s_add_u32 s8, s34, 44
-; GFX1064-DPP-NEXT:    s_mov_b32 s42, s9
+; GFX1064-DPP-NEXT:    s_mov_b32 s50, s9
 ; GFX1064-DPP-NEXT:    s_addc_u32 s9, s35, 0
-; GFX1064-DPP-NEXT:    s_mov_b64 s[40:41], s[0:1]
+; GFX1064-DPP-NEXT:    s_mov_b64 s[48:49], s[0:1]
 ; GFX1064-DPP-NEXT:    s_getpc_b64 s[0:1]
 ; GFX1064-DPP-NEXT:    s_add_u32 s0, s0, div.double.value at gotpcrel32@lo+4
 ; GFX1064-DPP-NEXT:    s_addc_u32 s1, s1, div.double.value at gotpcrel32@hi+12
@@ -4915,14 +4915,14 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent
 ; GFX1064-DPP-NEXT:    s_mov_b64 s[38:39], s[2:3]
 ; GFX1064-DPP-NEXT:    s_mov_b64 s[6:7], s[2:3]
 ; GFX1064-DPP-NEXT:    v_or3_b32 v40, v0, v1, v2
-; GFX1064-DPP-NEXT:    s_mov_b64 s[0:1], s[48:49]
-; GFX1064-DPP-NEXT:    s_mov_b64 s[4:5], s[40:41]
+; GFX1064-DPP-NEXT:    s_mov_b64 s[0:1], s[64:65]
+; GFX1064-DPP-NEXT:    s_mov_b64 s[4:5], s[48:49]
 ; GFX1064-DPP-NEXT:    s_mov_b64 s[10:11], s[36:37]
-; GFX1064-DPP-NEXT:    s_mov_b32 s12, s43
+; GFX1064-DPP-NEXT:    s_mov_b32 s12, s51
 ; GFX1064-DPP-NEXT:    v_mov_b32_e32 v31, v40
-; GFX1064-DPP-NEXT:    s_mov_b32 s13, s42
+; GFX1064-DPP-NEXT:    s_mov_b32 s13, s50
 ; GFX1064-DPP-NEXT:    s_mov_b32 s14, s33
-; GFX1064-DPP-NEXT:    s_mov_b64 s[2:3], s[50:51]
+; GFX1064-DPP-NEXT:    s_mov_b64 s[2:3], s[66:67]
 ; GFX1064-DPP-NEXT:    s_movk_i32 s32, 0x800
 ; GFX1064-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1064-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
@@ -4975,10 +4975,10 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent
 ; GFX1064-DPP-NEXT:    s_and_saveexec_b64 s[0:1], vcc
 ; GFX1064-DPP-NEXT:    s_cbranch_execz .LBB7_3
 ; GFX1064-DPP-NEXT:  ; %bb.1:
-; GFX1064-DPP-NEXT:    s_load_dwordx2 s[44:45], s[34:35], 0x24
-; GFX1064-DPP-NEXT:    s_mov_b64 s[46:47], 0
+; GFX1064-DPP-NEXT:    s_load_dwordx2 s[52:53], s[34:35], 0x24
+; GFX1064-DPP-NEXT:    s_mov_b64 s[54:55], 0
 ; GFX1064-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1064-DPP-NEXT:    global_load_dwordx2 v[1:2], v0, s[44:45]
+; GFX1064-DPP-NEXT:    global_load_dwordx2 v[1:2], v0, s[52:53]
 ; GFX1064-DPP-NEXT:  .LBB7_2: ; %atomicrmw.start
 ; GFX1064-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1064-DPP-NEXT:    v_max_f64 v[3:4], v[41:42], v[41:42]
@@ -4989,56 +4989,56 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent
 ; GFX1064-DPP-NEXT:    s_getpc_b64 s[0:1]
 ; GFX1064-DPP-NEXT:    s_add_u32 s0, s0, __atomic_compare_exchange at gotpcrel32@lo+4
 ; GFX1064-DPP-NEXT:    s_addc_u32 s1, s1, __atomic_compare_exchange at gotpcrel32@hi+12
-; GFX1064-DPP-NEXT:    buffer_store_dword v2, off, s[48:51], 0 offset:4
-; GFX1064-DPP-NEXT:    buffer_store_dword v1, off, s[48:51], 0
+; GFX1064-DPP-NEXT:    buffer_store_dword v2, off, s[64:67], 0 offset:4
+; GFX1064-DPP-NEXT:    buffer_store_dword v1, off, s[64:67], 0
 ; GFX1064-DPP-NEXT:    s_load_dwordx2 s[16:17], s[0:1], 0x0
 ; GFX1064-DPP-NEXT:    v_mov_b32_e32 v31, v40
 ; GFX1064-DPP-NEXT:    v_mov_b32_e32 v0, 8
 ; GFX1064-DPP-NEXT:    v_mov_b32_e32 v1, 0
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v2, s44
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v2, s52
 ; GFX1064-DPP-NEXT:    v_mov_b32_e32 v7, 0
-; GFX1064-DPP-NEXT:    s_mov_b64 s[0:1], s[48:49]
-; GFX1064-DPP-NEXT:    s_mov_b64 s[4:5], s[40:41]
+; GFX1064-DPP-NEXT:    s_mov_b64 s[0:1], s[64:65]
+; GFX1064-DPP-NEXT:    s_mov_b64 s[4:5], s[48:49]
 ; GFX1064-DPP-NEXT:    s_mov_b64 s[6:7], s[38:39]
 ; GFX1064-DPP-NEXT:    s_mov_b64 s[10:11], s[36:37]
-; GFX1064-DPP-NEXT:    s_mov_b32 s12, s43
-; GFX1064-DPP-NEXT:    s_mov_b32 s13, s42
+; GFX1064-DPP-NEXT:    s_mov_b32 s12, s51
+; GFX1064-DPP-NEXT:    s_mov_b32 s13, s50
 ; GFX1064-DPP-NEXT:    s_mov_b32 s14, s33
-; GFX1064-DPP-NEXT:    s_mov_b64 s[2:3], s[50:51]
+; GFX1064-DPP-NEXT:    s_mov_b64 s[2:3], s[66:67]
 ; GFX1064-DPP-NEXT:    v_max_f64 v[3:4], v[5:6], v[3:4]
 ; GFX1064-DPP-NEXT:    v_mov_b32_e32 v5, 8
 ; GFX1064-DPP-NEXT:    v_mov_b32_e32 v6, 0
-; GFX1064-DPP-NEXT:    buffer_store_dword v4, off, s[48:51], 0 offset:12
-; GFX1064-DPP-NEXT:    buffer_store_dword v3, off, s[48:51], 0 offset:8
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v3, s45
+; GFX1064-DPP-NEXT:    buffer_store_dword v4, off, s[64:67], 0 offset:12
+; GFX1064-DPP-NEXT:    buffer_store_dword v3, off, s[64:67], 0 offset:8
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v3, s53
 ; GFX1064-DPP-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX1064-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1064-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GFX1064-DPP-NEXT:    s_clause 0x1
-; GFX1064-DPP-NEXT:    buffer_load_dword v1, off, s[48:51], 0
-; GFX1064-DPP-NEXT:    buffer_load_dword v2, off, s[48:51], 0 offset:4
+; GFX1064-DPP-NEXT:    buffer_load_dword v1, off, s[64:67], 0
+; GFX1064-DPP-NEXT:    buffer_load_dword v2, off, s[64:67], 0 offset:4
 ; GFX1064-DPP-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX1064-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX1064-DPP-NEXT:    s_or_b64 s[46:47], vcc, s[46:47]
-; GFX1064-DPP-NEXT:    s_andn2_b64 exec, exec, s[46:47]
+; GFX1064-DPP-NEXT:    s_or_b64 s[54:55], vcc, s[54:55]
+; GFX1064-DPP-NEXT:    s_andn2_b64 exec, exec, s[54:55]
 ; GFX1064-DPP-NEXT:    s_cbranch_execnz .LBB7_2
 ; GFX1064-DPP-NEXT:  .LBB7_3:
 ; GFX1064-DPP-NEXT:    s_endpgm
 ;
 ; GFX1032-DPP-LABEL: global_atomic_fmax_double_uni_address_div_value_agent_scope_unsafe:
 ; GFX1032-DPP:       ; %bb.0:
-; GFX1032-DPP-NEXT:    s_mov_b32 s48, SCRATCH_RSRC_DWORD0
-; GFX1032-DPP-NEXT:    s_mov_b32 s49, SCRATCH_RSRC_DWORD1
-; GFX1032-DPP-NEXT:    s_mov_b32 s50, -1
-; GFX1032-DPP-NEXT:    s_mov_b32 s51, 0x31c16000
-; GFX1032-DPP-NEXT:    s_add_u32 s48, s48, s11
+; GFX1032-DPP-NEXT:    s_mov_b32 s64, SCRATCH_RSRC_DWORD0
+; GFX1032-DPP-NEXT:    s_mov_b32 s65, SCRATCH_RSRC_DWORD1
+; GFX1032-DPP-NEXT:    s_mov_b32 s66, -1
+; GFX1032-DPP-NEXT:    s_mov_b32 s67, 0x31c16000
+; GFX1032-DPP-NEXT:    s_add_u32 s64, s64, s11
 ; GFX1032-DPP-NEXT:    s_mov_b64 s[34:35], s[4:5]
-; GFX1032-DPP-NEXT:    s_addc_u32 s49, s49, 0
-; GFX1032-DPP-NEXT:    s_mov_b32 s43, s8
+; GFX1032-DPP-NEXT:    s_addc_u32 s65, s65, 0
+; GFX1032-DPP-NEXT:    s_mov_b32 s51, s8
 ; GFX1032-DPP-NEXT:    s_add_u32 s8, s34, 44
-; GFX1032-DPP-NEXT:    s_mov_b32 s42, s9
+; GFX1032-DPP-NEXT:    s_mov_b32 s50, s9
 ; GFX1032-DPP-NEXT:    s_addc_u32 s9, s35, 0
-; GFX1032-DPP-NEXT:    s_mov_b64 s[40:41], s[0:1]
+; GFX1032-DPP-NEXT:    s_mov_b64 s[48:49], s[0:1]
 ; GFX1032-DPP-NEXT:    s_getpc_b64 s[0:1]
 ; GFX1032-DPP-NEXT:    s_add_u32 s0, s0, div.double.value at gotpcrel32@lo+4
 ; GFX1032-DPP-NEXT:    s_addc_u32 s1, s1, div.double.value at gotpcrel32@hi+12
@@ -5050,14 +5050,14 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent
 ; GFX1032-DPP-NEXT:    s_mov_b64 s[38:39], s[2:3]
 ; GFX1032-DPP-NEXT:    s_mov_b64 s[6:7], s[2:3]
 ; GFX1032-DPP-NEXT:    v_or3_b32 v40, v0, v1, v2
-; GFX1032-DPP-NEXT:    s_mov_b64 s[0:1], s[48:49]
-; GFX1032-DPP-NEXT:    s_mov_b64 s[4:5], s[40:41]
+; GFX1032-DPP-NEXT:    s_mov_b64 s[0:1], s[64:65]
+; GFX1032-DPP-NEXT:    s_mov_b64 s[4:5], s[48:49]
 ; GFX1032-DPP-NEXT:    s_mov_b64 s[10:11], s[36:37]
-; GFX1032-DPP-NEXT:    s_mov_b32 s12, s43
+; GFX1032-DPP-NEXT:    s_mov_b32 s12, s51
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v31, v40
-; GFX1032-DPP-NEXT:    s_mov_b32 s13, s42
+; GFX1032-DPP-NEXT:    s_mov_b32 s13, s50
 ; GFX1032-DPP-NEXT:    s_mov_b32 s14, s33
-; GFX1032-DPP-NEXT:    s_mov_b64 s[2:3], s[50:51]
+; GFX1032-DPP-NEXT:    s_mov_b64 s[2:3], s[66:67]
 ; GFX1032-DPP-NEXT:    s_movk_i32 s32, 0x400
 ; GFX1032-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1032-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
@@ -5098,15 +5098,15 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v3, v8
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v4, v9
-; GFX1032-DPP-NEXT:    s_mov_b32 s46, 0
+; GFX1032-DPP-NEXT:    s_mov_b32 s54, 0
 ; GFX1032-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
 ; GFX1032-DPP-NEXT:    s_and_saveexec_b32 s0, vcc_lo
 ; GFX1032-DPP-NEXT:    s_cbranch_execz .LBB7_3
 ; GFX1032-DPP-NEXT:  ; %bb.1:
-; GFX1032-DPP-NEXT:    s_load_dwordx2 s[44:45], s[34:35], 0x24
+; GFX1032-DPP-NEXT:    s_load_dwordx2 s[52:53], s[34:35], 0x24
 ; GFX1032-DPP-NEXT:    v_max_f64 v[41:42], v[3:4], v[3:4]
 ; GFX1032-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT:    global_load_dwordx2 v[1:2], v0, s[44:45]
+; GFX1032-DPP-NEXT:    global_load_dwordx2 v[1:2], v0, s[52:53]
 ; GFX1032-DPP-NEXT:  .LBB7_2: ; %atomicrmw.start
 ; GFX1032-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1032-DPP-NEXT:    s_waitcnt vmcnt(0)
@@ -5116,38 +5116,38 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent
 ; GFX1032-DPP-NEXT:    s_getpc_b64 s[0:1]
 ; GFX1032-DPP-NEXT:    s_add_u32 s0, s0, __atomic_compare_exchange at gotpcrel32@lo+4
 ; GFX1032-DPP-NEXT:    s_addc_u32 s1, s1, __atomic_compare_exchange at gotpcrel32@hi+12
-; GFX1032-DPP-NEXT:    buffer_store_dword v2, off, s[48:51], 0 offset:4
-; GFX1032-DPP-NEXT:    buffer_store_dword v1, off, s[48:51], 0
+; GFX1032-DPP-NEXT:    buffer_store_dword v2, off, s[64:67], 0 offset:4
+; GFX1032-DPP-NEXT:    buffer_store_dword v1, off, s[64:67], 0
 ; GFX1032-DPP-NEXT:    s_load_dwordx2 s[16:17], s[0:1], 0x0
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v31, v40
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v0, 8
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v1, 0
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v2, s44
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v2, s52
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v5, 8
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v7, 0
-; GFX1032-DPP-NEXT:    s_mov_b64 s[0:1], s[48:49]
-; GFX1032-DPP-NEXT:    s_mov_b64 s[4:5], s[40:41]
+; GFX1032-DPP-NEXT:    s_mov_b64 s[0:1], s[64:65]
+; GFX1032-DPP-NEXT:    s_mov_b64 s[4:5], s[48:49]
 ; GFX1032-DPP-NEXT:    s_mov_b64 s[6:7], s[38:39]
 ; GFX1032-DPP-NEXT:    s_mov_b64 s[10:11], s[36:37]
-; GFX1032-DPP-NEXT:    s_mov_b32 s12, s43
-; GFX1032-DPP-NEXT:    s_mov_b32 s13, s42
+; GFX1032-DPP-NEXT:    s_mov_b32 s12, s51
+; GFX1032-DPP-NEXT:    s_mov_b32 s13, s50
 ; GFX1032-DPP-NEXT:    s_mov_b32 s14, s33
-; GFX1032-DPP-NEXT:    s_mov_b64 s[2:3], s[50:51]
+; GFX1032-DPP-NEXT:    s_mov_b64 s[2:3], s[66:67]
 ; GFX1032-DPP-NEXT:    v_max_f64 v[3:4], v[3:4], v[41:42]
-; GFX1032-DPP-NEXT:    buffer_store_dword v4, off, s[48:51], 0 offset:12
-; GFX1032-DPP-NEXT:    buffer_store_dword v3, off, s[48:51], 0 offset:8
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v3, s45
+; GFX1032-DPP-NEXT:    buffer_store_dword v4, off, s[64:67], 0 offset:12
+; GFX1032-DPP-NEXT:    buffer_store_dword v3, off, s[64:67], 0 offset:8
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v3, s53
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX1032-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1032-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GFX1032-DPP-NEXT:    s_clause 0x1
-; GFX1032-DPP-NEXT:    buffer_load_dword v1, off, s[48:51], 0
-; GFX1032-DPP-NEXT:    buffer_load_dword v2, off, s[48:51], 0 offset:4
+; GFX1032-DPP-NEXT:    buffer_load_dword v1, off, s[64:67], 0
+; GFX1032-DPP-NEXT:    buffer_load_dword v2, off, s[64:67], 0 offset:4
 ; GFX1032-DPP-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX1032-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX1032-DPP-NEXT:    s_or_b32 s46, vcc_lo, s46
-; GFX1032-DPP-NEXT:    s_andn2_b32 exec_lo, exec_lo, s46
+; GFX1032-DPP-NEXT:    s_or_b32 s54, vcc_lo, s54
+; GFX1032-DPP-NEXT:    s_andn2_b32 exec_lo, exec_lo, s54
 ; GFX1032-DPP-NEXT:    s_cbranch_execnz .LBB7_2
 ; GFX1032-DPP-NEXT:  .LBB7_3:
 ; GFX1032-DPP-NEXT:    s_endpgm
@@ -5155,11 +5155,11 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent
 ; GFX1164-DPP-LABEL: global_atomic_fmax_double_uni_address_div_value_agent_scope_unsafe:
 ; GFX1164-DPP:       ; %bb.0:
 ; GFX1164-DPP-NEXT:    s_mov_b64 s[34:35], s[4:5]
-; GFX1164-DPP-NEXT:    s_mov_b32 s43, s8
+; GFX1164-DPP-NEXT:    s_mov_b32 s51, s8
 ; GFX1164-DPP-NEXT:    s_add_u32 s8, s34, 44
-; GFX1164-DPP-NEXT:    s_mov_b32 s42, s9
+; GFX1164-DPP-NEXT:    s_mov_b32 s50, s9
 ; GFX1164-DPP-NEXT:    s_addc_u32 s9, s35, 0
-; GFX1164-DPP-NEXT:    s_mov_b64 s[40:41], s[0:1]
+; GFX1164-DPP-NEXT:    s_mov_b64 s[48:49], s[0:1]
 ; GFX1164-DPP-NEXT:    s_getpc_b64 s[0:1]
 ; GFX1164-DPP-NEXT:    s_add_u32 s0, s0, div.double.value at gotpcrel32@lo+4
 ; GFX1164-DPP-NEXT:    s_addc_u32 s1, s1, div.double.value at gotpcrel32@hi+12
@@ -5167,11 +5167,11 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent
 ; GFX1164-DPP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
 ; GFX1164-DPP-NEXT:    s_mov_b32 s33, s10
 ; GFX1164-DPP-NEXT:    s_mov_b64 s[36:37], s[6:7]
-; GFX1164-DPP-NEXT:    s_mov_b64 s[4:5], s[40:41]
+; GFX1164-DPP-NEXT:    s_mov_b64 s[4:5], s[48:49]
 ; GFX1164-DPP-NEXT:    s_mov_b64 s[6:7], s[2:3]
 ; GFX1164-DPP-NEXT:    s_mov_b64 s[10:11], s[36:37]
-; GFX1164-DPP-NEXT:    s_mov_b32 s12, s43
-; GFX1164-DPP-NEXT:    s_mov_b32 s13, s42
+; GFX1164-DPP-NEXT:    s_mov_b32 s12, s51
+; GFX1164-DPP-NEXT:    s_mov_b32 s13, s50
 ; GFX1164-DPP-NEXT:    s_mov_b32 s14, s33
 ; GFX1164-DPP-NEXT:    s_mov_b32 s32, 32
 ; GFX1164-DPP-NEXT:    v_mov_b32_e32 v40, v0
@@ -5239,11 +5239,11 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent
 ; GFX1164-DPP-NEXT:    v_cmpx_eq_u32_e32 0, v1
 ; GFX1164-DPP-NEXT:    s_cbranch_execz .LBB7_3
 ; GFX1164-DPP-NEXT:  ; %bb.1:
-; GFX1164-DPP-NEXT:    s_load_b64 s[44:45], s[34:35], 0x24
+; GFX1164-DPP-NEXT:    s_load_b64 s[52:53], s[34:35], 0x24
 ; GFX1164-DPP-NEXT:    v_max_f64 v[41:42], v[3:4], v[3:4]
-; GFX1164-DPP-NEXT:    s_mov_b64 s[46:47], 0
+; GFX1164-DPP-NEXT:    s_mov_b64 s[54:55], 0
 ; GFX1164-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT:    global_load_b64 v[1:2], v0, s[44:45]
+; GFX1164-DPP-NEXT:    global_load_b64 v[1:2], v0, s[52:53]
 ; GFX1164-DPP-NEXT:    s_set_inst_prefetch_distance 0x1
 ; GFX1164-DPP-NEXT:    .p2align 6
 ; GFX1164-DPP-NEXT:  .LBB7_2: ; %atomicrmw.start
@@ -5261,18 +5261,18 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent
 ; GFX1164-DPP-NEXT:    v_mov_b32_e32 v5, 8
 ; GFX1164-DPP-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX1164-DPP-NEXT:    v_mov_b32_e32 v7, 0
-; GFX1164-DPP-NEXT:    s_mov_b64 s[4:5], s[40:41]
+; GFX1164-DPP-NEXT:    s_mov_b64 s[4:5], s[48:49]
 ; GFX1164-DPP-NEXT:    s_mov_b64 s[6:7], s[38:39]
 ; GFX1164-DPP-NEXT:    s_mov_b64 s[10:11], s[36:37]
-; GFX1164-DPP-NEXT:    s_mov_b32 s12, s43
-; GFX1164-DPP-NEXT:    s_mov_b32 s13, s42
+; GFX1164-DPP-NEXT:    s_mov_b32 s12, s51
+; GFX1164-DPP-NEXT:    s_mov_b32 s13, s50
 ; GFX1164-DPP-NEXT:    s_mov_b32 s14, s33
 ; GFX1164-DPP-NEXT:    v_max_f64 v[3:4], v[3:4], v[41:42]
 ; GFX1164-DPP-NEXT:    scratch_store_b64 off, v[1:2], off
 ; GFX1164-DPP-NEXT:    v_mov_b32_e32 v1, 0
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v2, s44
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v2, s52
 ; GFX1164-DPP-NEXT:    scratch_store_b64 off, v[3:4], off offset:8
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v3, s45
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v3, s53
 ; GFX1164-DPP-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX1164-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1164-DPP-NEXT:    s_swappc_b64 s[30:31], s[0:1]
@@ -5280,8 +5280,8 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent
 ; GFX1164-DPP-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
 ; GFX1164-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX1164-DPP-NEXT:    s_or_b64 s[46:47], vcc, s[46:47]
-; GFX1164-DPP-NEXT:    s_and_not1_b64 exec, exec, s[46:47]
+; GFX1164-DPP-NEXT:    s_or_b64 s[54:55], vcc, s[54:55]
+; GFX1164-DPP-NEXT:    s_and_not1_b64 exec, exec, s[54:55]
 ; GFX1164-DPP-NEXT:    s_cbranch_execnz .LBB7_2
 ; GFX1164-DPP-NEXT:  .LBB7_3:
 ; GFX1164-DPP-NEXT:    s_set_inst_prefetch_distance 0x2
@@ -5290,7 +5290,7 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent
 ; GFX1132-DPP-LABEL: global_atomic_fmax_double_uni_address_div_value_agent_scope_unsafe:
 ; GFX1132-DPP:       ; %bb.0:
 ; GFX1132-DPP-NEXT:    s_mov_b64 s[34:35], s[4:5]
-; GFX1132-DPP-NEXT:    s_mov_b64 s[40:41], s[0:1]
+; GFX1132-DPP-NEXT:    s_mov_b64 s[48:49], s[0:1]
 ; GFX1132-DPP-NEXT:    s_add_u32 s8, s34, 44
 ; GFX1132-DPP-NEXT:    s_addc_u32 s9, s35, 0
 ; GFX1132-DPP-NEXT:    s_getpc_b64 s[0:1]
@@ -5299,9 +5299,9 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent
 ; GFX1132-DPP-NEXT:    v_mov_b32_e32 v31, v0
 ; GFX1132-DPP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
 ; GFX1132-DPP-NEXT:    s_mov_b64 s[36:37], s[6:7]
-; GFX1132-DPP-NEXT:    s_mov_b32 s42, s14
-; GFX1132-DPP-NEXT:    s_mov_b32 s43, s13
-; GFX1132-DPP-NEXT:    s_mov_b64 s[4:5], s[40:41]
+; GFX1132-DPP-NEXT:    s_mov_b32 s50, s14
+; GFX1132-DPP-NEXT:    s_mov_b32 s51, s13
+; GFX1132-DPP-NEXT:    s_mov_b64 s[4:5], s[48:49]
 ; GFX1132-DPP-NEXT:    s_mov_b64 s[6:7], s[2:3]
 ; GFX1132-DPP-NEXT:    s_mov_b64 s[10:11], s[36:37]
 ; GFX1132-DPP-NEXT:    s_mov_b32 s12, s13
@@ -5356,16 +5356,16 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent
 ; GFX1132-DPP-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v3, v8
 ; GFX1132-DPP-NEXT:    v_mbcnt_lo_u32_b32 v1, exec_lo, 0
 ; GFX1132-DPP-NEXT:    v_mov_b32_e32 v4, v9
-; GFX1132-DPP-NEXT:    s_mov_b32 s46, 0
+; GFX1132-DPP-NEXT:    s_mov_b32 s54, 0
 ; GFX1132-DPP-NEXT:    s_mov_b32 s0, exec_lo
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX1132-DPP-NEXT:    v_cmpx_eq_u32_e32 0, v1
 ; GFX1132-DPP-NEXT:    s_cbranch_execz .LBB7_3
 ; GFX1132-DPP-NEXT:  ; %bb.1:
-; GFX1132-DPP-NEXT:    s_load_b64 s[44:45], s[34:35], 0x24
+; GFX1132-DPP-NEXT:    s_load_b64 s[52:53], s[34:35], 0x24
 ; GFX1132-DPP-NEXT:    v_max_f64 v[41:42], v[3:4], v[3:4]
 ; GFX1132-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT:    global_load_b64 v[1:2], v0, s[44:45]
+; GFX1132-DPP-NEXT:    global_load_b64 v[1:2], v0, s[52:53]
 ; GFX1132-DPP-NEXT:    s_set_inst_prefetch_distance 0x1
 ; GFX1132-DPP-NEXT:    .p2align 6
 ; GFX1132-DPP-NEXT:  .LBB7_2: ; %atomicrmw.start
@@ -5381,26 +5381,26 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent
 ; GFX1132-DPP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
 ; GFX1132-DPP-NEXT:    v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0
 ; GFX1132-DPP-NEXT:    v_mov_b32_e32 v7, 0
-; GFX1132-DPP-NEXT:    s_mov_b64 s[4:5], s[40:41]
+; GFX1132-DPP-NEXT:    s_mov_b64 s[4:5], s[48:49]
 ; GFX1132-DPP-NEXT:    s_mov_b64 s[6:7], s[38:39]
 ; GFX1132-DPP-NEXT:    s_mov_b64 s[10:11], s[36:37]
-; GFX1132-DPP-NEXT:    s_mov_b32 s12, s43
-; GFX1132-DPP-NEXT:    s_mov_b32 s13, s42
+; GFX1132-DPP-NEXT:    s_mov_b32 s12, s51
+; GFX1132-DPP-NEXT:    s_mov_b32 s13, s50
 ; GFX1132-DPP-NEXT:    s_mov_b32 s14, s33
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_4)
 ; GFX1132-DPP-NEXT:    v_max_f64 v[3:4], v[3:4], v[41:42]
 ; GFX1132-DPP-NEXT:    scratch_store_b64 off, v[1:2], off
-; GFX1132-DPP-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s44
+; GFX1132-DPP-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s52
 ; GFX1132-DPP-NEXT:    scratch_store_b64 off, v[3:4], off offset:8
-; GFX1132-DPP-NEXT:    v_dual_mov_b32 v3, s45 :: v_dual_mov_b32 v4, 0
+; GFX1132-DPP-NEXT:    v_dual_mov_b32 v3, s53 :: v_dual_mov_b32 v4, 0
 ; GFX1132-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132-DPP-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX1132-DPP-NEXT:    scratch_load_b64 v[1:2], off, off
 ; GFX1132-DPP-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
 ; GFX1132-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX1132-DPP-NEXT:    s_or_b32 s46, vcc_lo, s46
-; GFX1132-DPP-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s46
+; GFX1132-DPP-NEXT:    s_or_b32 s54, vcc_lo, s54
+; GFX1132-DPP-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s54
 ; GFX1132-DPP-NEXT:    s_cbranch_execnz .LBB7_2
 ; GFX1132-DPP-NEXT:  .LBB7_3:
 ; GFX1132-DPP-NEXT:    s_set_inst_prefetch_distance 0x2
@@ -6162,12 +6162,12 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_one_a
 ; GFX7LESS-DPP-LABEL: global_atomic_fmax_double_uni_address_div_value_one_as_scope_unsafe:
 ; GFX7LESS-DPP:       ; %bb.0:
 ; GFX7LESS-DPP-NEXT:    s_mov_b32 s32, 0
-; GFX7LESS-DPP-NEXT:    s_mov_b32 s40, SCRATCH_RSRC_DWORD0
-; GFX7LESS-DPP-NEXT:    s_mov_b32 s41, SCRATCH_RSRC_DWORD1
-; GFX7LESS-DPP-NEXT:    s_mov_b32 s42, -1
-; GFX7LESS-DPP-NEXT:    s_mov_b32 s43, 0xe8f000
-; GFX7LESS-DPP-NEXT:    s_add_u32 s40, s40, s11
-; GFX7LESS-DPP-NEXT:    s_addc_u32 s41, s41, 0
+; GFX7LESS-DPP-NEXT:    s_mov_b32 s48, SCRATCH_RSRC_DWORD0
+; GFX7LESS-DPP-NEXT:    s_mov_b32 s49, SCRATCH_RSRC_DWORD1
+; GFX7LESS-DPP-NEXT:    s_mov_b32 s50, -1
+; GFX7LESS-DPP-NEXT:    s_mov_b32 s51, 0xe8f000
+; GFX7LESS-DPP-NEXT:    s_add_u32 s48, s48, s11
+; GFX7LESS-DPP-NEXT:    s_addc_u32 s49, s49, 0
 ; GFX7LESS-DPP-NEXT:    s_mov_b32 s14, s10
 ; GFX7LESS-DPP-NEXT:    s_mov_b32 s13, s9
 ; GFX7LESS-DPP-NEXT:    s_mov_b32 s12, s8
@@ -6187,8 +6187,8 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_one_a
 ; GFX7LESS-DPP-NEXT:    v_or_b32_e32 v31, v0, v2
 ; GFX7LESS-DPP-NEXT:    s_mov_b64 s[4:5], s[0:1]
 ; GFX7LESS-DPP-NEXT:    s_mov_b64 s[6:7], s[2:3]
-; GFX7LESS-DPP-NEXT:    s_mov_b64 s[0:1], s[40:41]
-; GFX7LESS-DPP-NEXT:    s_mov_b64 s[2:3], s[42:43]
+; GFX7LESS-DPP-NEXT:    s_mov_b64 s[0:1], s[48:49]
+; GFX7LESS-DPP-NEXT:    s_mov_b64 s[2:3], s[50:51]
 ; GFX7LESS-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7LESS-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GFX7LESS-DPP-NEXT:    buffer_load_dwordx2 v[2:3], off, s[36:39], 0
@@ -6683,13 +6683,13 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_defau
 ; GFX7LESS-LABEL: global_atomic_fmax_double_uni_address_uni_value_default_scope_unsafe:
 ; GFX7LESS:       ; %bb.0:
 ; GFX7LESS-NEXT:    s_movk_i32 s32, 0x800
-; GFX7LESS-NEXT:    s_mov_b32 s48, SCRATCH_RSRC_DWORD0
-; GFX7LESS-NEXT:    s_mov_b32 s49, SCRATCH_RSRC_DWORD1
-; GFX7LESS-NEXT:    s_mov_b32 s50, -1
-; GFX7LESS-NEXT:    s_mov_b32 s51, 0xe8f000
-; GFX7LESS-NEXT:    s_add_u32 s48, s48, s11
-; GFX7LESS-NEXT:    s_addc_u32 s49, s49, 0
-; GFX7LESS-NEXT:    s_mov_b64 s[40:41], s[0:1]
+; GFX7LESS-NEXT:    s_mov_b32 s64, SCRATCH_RSRC_DWORD0
+; GFX7LESS-NEXT:    s_mov_b32 s65, SCRATCH_RSRC_DWORD1
+; GFX7LESS-NEXT:    s_mov_b32 s66, -1
+; GFX7LESS-NEXT:    s_mov_b32 s67, 0xe8f000
+; GFX7LESS-NEXT:    s_add_u32 s64, s64, s11
+; GFX7LESS-NEXT:    s_addc_u32 s65, s65, 0
+; GFX7LESS-NEXT:    s_mov_b64 s[48:49], s[0:1]
 ; GFX7LESS-NEXT:    v_mbcnt_lo_u32_b32_e64 v3, exec_lo, 0
 ; GFX7LESS-NEXT:    v_mbcnt_hi_u32_b32_e32 v3, exec_hi, v3
 ; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
@@ -6697,15 +6697,15 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_defau
 ; GFX7LESS-NEXT:    s_cbranch_execz .LBB10_3
 ; GFX7LESS-NEXT:  ; %bb.1:
 ; GFX7LESS-NEXT:    s_mov_b32 s33, s10
-; GFX7LESS-NEXT:    s_mov_b32 s42, s9
-; GFX7LESS-NEXT:    s_mov_b32 s43, s8
+; GFX7LESS-NEXT:    s_mov_b32 s50, s9
+; GFX7LESS-NEXT:    s_mov_b32 s51, s8
 ; GFX7LESS-NEXT:    s_mov_b64 s[34:35], s[6:7]
 ; GFX7LESS-NEXT:    s_mov_b64 s[36:37], s[4:5]
 ; GFX7LESS-NEXT:    s_mov_b64 s[38:39], s[2:3]
-; GFX7LESS-NEXT:    s_load_dwordx2 s[44:45], s[4:5], 0x9
+; GFX7LESS-NEXT:    s_load_dwordx2 s[52:53], s[4:5], 0x9
 ; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7LESS-NEXT:    s_load_dwordx2 s[0:1], s[44:45], 0x0
-; GFX7LESS-NEXT:    s_mov_b64 s[46:47], 0
+; GFX7LESS-NEXT:    s_load_dwordx2 s[0:1], s[52:53], 0x0
+; GFX7LESS-NEXT:    s_mov_b64 s[54:55], 0
 ; GFX7LESS-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
 ; GFX7LESS-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
 ; GFX7LESS-NEXT:    v_or_b32_e32 v3, v0, v1
@@ -6717,8 +6717,8 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_defau
 ; GFX7LESS-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX7LESS-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7LESS-NEXT:    v_max_f64 v[2:3], v[0:1], v[0:1]
-; GFX7LESS-NEXT:    buffer_store_dword v1, off, s[48:51], 0 offset:4
-; GFX7LESS-NEXT:    buffer_store_dword v0, off, s[48:51], 0
+; GFX7LESS-NEXT:    buffer_store_dword v1, off, s[64:67], 0 offset:4
+; GFX7LESS-NEXT:    buffer_store_dword v0, off, s[64:67], 0
 ; GFX7LESS-NEXT:    s_add_u32 s8, s36, 44
 ; GFX7LESS-NEXT:    s_waitcnt expcnt(0)
 ; GFX7LESS-NEXT:    v_max_f64 v[0:1], v[2:3], 4.0
@@ -6726,8 +6726,8 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_defau
 ; GFX7LESS-NEXT:    s_getpc_b64 s[0:1]
 ; GFX7LESS-NEXT:    s_add_u32 s0, s0, __atomic_compare_exchange at gotpcrel32@lo+4
 ; GFX7LESS-NEXT:    s_addc_u32 s1, s1, __atomic_compare_exchange at gotpcrel32@hi+12
-; GFX7LESS-NEXT:    buffer_store_dword v1, off, s[48:51], 0 offset:12
-; GFX7LESS-NEXT:    buffer_store_dword v0, off, s[48:51], 0 offset:8
+; GFX7LESS-NEXT:    buffer_store_dword v1, off, s[64:67], 0 offset:12
+; GFX7LESS-NEXT:    buffer_store_dword v0, off, s[64:67], 0 offset:8
 ; GFX7LESS-NEXT:    s_load_dwordx2 s[16:17], s[0:1], 0x0
 ; GFX7LESS-NEXT:    s_waitcnt expcnt(0)
 ; GFX7LESS-NEXT:    v_mov_b32_e32 v0, 8
@@ -6736,59 +6736,59 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_defau
 ; GFX7LESS-NEXT:    v_mov_b32_e32 v5, 8
 ; GFX7LESS-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX7LESS-NEXT:    v_mov_b32_e32 v7, 0
-; GFX7LESS-NEXT:    s_mov_b64 s[4:5], s[40:41]
+; GFX7LESS-NEXT:    s_mov_b64 s[4:5], s[48:49]
 ; GFX7LESS-NEXT:    s_mov_b64 s[6:7], s[38:39]
 ; GFX7LESS-NEXT:    s_mov_b64 s[10:11], s[34:35]
-; GFX7LESS-NEXT:    s_mov_b32 s12, s43
-; GFX7LESS-NEXT:    s_mov_b32 s13, s42
+; GFX7LESS-NEXT:    s_mov_b32 s12, s51
+; GFX7LESS-NEXT:    s_mov_b32 s13, s50
 ; GFX7LESS-NEXT:    s_mov_b32 s14, s33
 ; GFX7LESS-NEXT:    v_mov_b32_e32 v31, v40
-; GFX7LESS-NEXT:    s_mov_b64 s[0:1], s[48:49]
-; GFX7LESS-NEXT:    s_mov_b64 s[2:3], s[50:51]
-; GFX7LESS-NEXT:    v_mov_b32_e32 v2, s44
-; GFX7LESS-NEXT:    v_mov_b32_e32 v3, s45
+; GFX7LESS-NEXT:    s_mov_b64 s[0:1], s[64:65]
+; GFX7LESS-NEXT:    s_mov_b64 s[2:3], s[66:67]
+; GFX7LESS-NEXT:    v_mov_b32_e32 v2, s52
+; GFX7LESS-NEXT:    v_mov_b32_e32 v3, s53
 ; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7LESS-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GFX7LESS-NEXT:    v_and_b32_e32 v2, 1, v0
-; GFX7LESS-NEXT:    buffer_load_dword v0, off, s[48:51], 0
-; GFX7LESS-NEXT:    buffer_load_dword v1, off, s[48:51], 0 offset:4
+; GFX7LESS-NEXT:    buffer_load_dword v0, off, s[64:67], 0
+; GFX7LESS-NEXT:    buffer_load_dword v1, off, s[64:67], 0 offset:4
 ; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v2
-; GFX7LESS-NEXT:    s_or_b64 s[46:47], vcc, s[46:47]
-; GFX7LESS-NEXT:    s_andn2_b64 exec, exec, s[46:47]
+; GFX7LESS-NEXT:    s_or_b64 s[54:55], vcc, s[54:55]
+; GFX7LESS-NEXT:    s_andn2_b64 exec, exec, s[54:55]
 ; GFX7LESS-NEXT:    s_cbranch_execnz .LBB10_2
 ; GFX7LESS-NEXT:  .LBB10_3:
 ; GFX7LESS-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: global_atomic_fmax_double_uni_address_uni_value_default_scope_unsafe:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_mov_b32 s48, SCRATCH_RSRC_DWORD0
-; GFX9-NEXT:    s_mov_b32 s49, SCRATCH_RSRC_DWORD1
-; GFX9-NEXT:    s_mov_b32 s50, -1
-; GFX9-NEXT:    s_mov_b32 s51, 0xe00000
+; GFX9-NEXT:    s_mov_b32 s64, SCRATCH_RSRC_DWORD0
+; GFX9-NEXT:    s_mov_b32 s65, SCRATCH_RSRC_DWORD1
+; GFX9-NEXT:    s_mov_b32 s66, -1
+; GFX9-NEXT:    s_mov_b32 s67, 0xe00000
 ; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v3, exec_lo, 0
-; GFX9-NEXT:    s_add_u32 s48, s48, s11
+; GFX9-NEXT:    s_add_u32 s64, s64, s11
 ; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v3, exec_hi, v3
-; GFX9-NEXT:    s_addc_u32 s49, s49, 0
-; GFX9-NEXT:    s_mov_b64 s[40:41], s[0:1]
+; GFX9-NEXT:    s_addc_u32 s65, s65, 0
+; GFX9-NEXT:    s_mov_b64 s[48:49], s[0:1]
 ; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
 ; GFX9-NEXT:    s_movk_i32 s32, 0x800
 ; GFX9-NEXT:    s_and_saveexec_b64 s[0:1], vcc
 ; GFX9-NEXT:    s_cbranch_execz .LBB10_3
 ; GFX9-NEXT:  ; %bb.1:
-; GFX9-NEXT:    s_load_dwordx2 s[44:45], s[4:5], 0x24
+; GFX9-NEXT:    s_load_dwordx2 s[52:53], s[4:5], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v3, 20, v2
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 10, v1
 ; GFX9-NEXT:    s_mov_b32 s33, s10
-; GFX9-NEXT:    s_mov_b32 s42, s9
+; GFX9-NEXT:    s_mov_b32 s50, s9
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[44:45], 0x0
-; GFX9-NEXT:    s_mov_b32 s43, s8
+; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[52:53], 0x0
+; GFX9-NEXT:    s_mov_b32 s51, s8
 ; GFX9-NEXT:    s_mov_b64 s[34:35], s[6:7]
 ; GFX9-NEXT:    s_mov_b64 s[36:37], s[4:5]
 ; GFX9-NEXT:    s_mov_b64 s[38:39], s[2:3]
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v2, s1
-; GFX9-NEXT:    s_mov_b64 s[46:47], 0
+; GFX9-NEXT:    s_mov_b64 s[54:55], 0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX9-NEXT:    v_or3_b32 v40, v0, v4, v3
 ; GFX9-NEXT:  .LBB10_2: ; %atomicrmw.start
@@ -6801,36 +6801,36 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_defau
 ; GFX9-NEXT:    s_add_u32 s0, s0, __atomic_compare_exchange at gotpcrel32@lo+4
 ; GFX9-NEXT:    s_addc_u32 s1, s1, __atomic_compare_exchange at gotpcrel32@hi+12
 ; GFX9-NEXT:    s_load_dwordx2 s[16:17], s[0:1], 0x0
-; GFX9-NEXT:    s_mov_b64 s[0:1], s[48:49]
-; GFX9-NEXT:    buffer_store_dword v2, off, s[48:51], 0 offset:4
-; GFX9-NEXT:    buffer_store_dword v1, off, s[48:51], 0
-; GFX9-NEXT:    s_mov_b64 s[4:5], s[40:41]
+; GFX9-NEXT:    s_mov_b64 s[0:1], s[64:65]
+; GFX9-NEXT:    buffer_store_dword v2, off, s[64:67], 0 offset:4
+; GFX9-NEXT:    buffer_store_dword v1, off, s[64:67], 0
+; GFX9-NEXT:    s_mov_b64 s[4:5], s[48:49]
 ; GFX9-NEXT:    v_max_f64 v[3:4], v[3:4], 4.0
 ; GFX9-NEXT:    s_mov_b64 s[6:7], s[38:39]
 ; GFX9-NEXT:    s_mov_b64 s[10:11], s[34:35]
-; GFX9-NEXT:    s_mov_b32 s12, s43
-; GFX9-NEXT:    s_mov_b32 s13, s42
+; GFX9-NEXT:    s_mov_b32 s12, s51
+; GFX9-NEXT:    s_mov_b32 s13, s50
 ; GFX9-NEXT:    s_mov_b32 s14, s33
 ; GFX9-NEXT:    v_mov_b32_e32 v31, v40
-; GFX9-NEXT:    s_mov_b64 s[2:3], s[50:51]
-; GFX9-NEXT:    buffer_store_dword v4, off, s[48:51], 0 offset:12
-; GFX9-NEXT:    buffer_store_dword v3, off, s[48:51], 0 offset:8
+; GFX9-NEXT:    s_mov_b64 s[2:3], s[66:67]
+; GFX9-NEXT:    buffer_store_dword v4, off, s[64:67], 0 offset:12
+; GFX9-NEXT:    buffer_store_dword v3, off, s[64:67], 0 offset:8
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 8
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0
-; GFX9-NEXT:    v_mov_b32_e32 v2, s44
-; GFX9-NEXT:    v_mov_b32_e32 v3, s45
+; GFX9-NEXT:    v_mov_b32_e32 v2, s52
+; GFX9-NEXT:    v_mov_b32_e32 v3, s53
 ; GFX9-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v5, 8
 ; GFX9-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v7, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[16:17]
-; GFX9-NEXT:    buffer_load_dword v1, off, s[48:51], 0
-; GFX9-NEXT:    buffer_load_dword v2, off, s[48:51], 0 offset:4
+; GFX9-NEXT:    buffer_load_dword v1, off, s[64:67], 0
+; GFX9-NEXT:    buffer_load_dword v2, off, s[64:67], 0 offset:4
 ; GFX9-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX9-NEXT:    s_or_b64 s[46:47], vcc, s[46:47]
-; GFX9-NEXT:    s_andn2_b64 exec, exec, s[46:47]
+; GFX9-NEXT:    s_or_b64 s[54:55], vcc, s[54:55]
+; GFX9-NEXT:    s_andn2_b64 exec, exec, s[54:55]
 ; GFX9-NEXT:    s_cbranch_execnz .LBB10_2
 ; GFX9-NEXT:  .LBB10_3:
 ; GFX9-NEXT:    s_endpgm
@@ -6838,32 +6838,32 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_defau
 ; GFX1064-LABEL: global_atomic_fmax_double_uni_address_uni_value_default_scope_unsafe:
 ; GFX1064:       ; %bb.0:
 ; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v3, exec_lo, 0
-; GFX1064-NEXT:    s_mov_b32 s48, SCRATCH_RSRC_DWORD0
-; GFX1064-NEXT:    s_mov_b32 s49, SCRATCH_RSRC_DWORD1
-; GFX1064-NEXT:    s_mov_b32 s50, -1
-; GFX1064-NEXT:    s_mov_b32 s51, 0x31e16000
+; GFX1064-NEXT:    s_mov_b32 s64, SCRATCH_RSRC_DWORD0
+; GFX1064-NEXT:    s_mov_b32 s65, SCRATCH_RSRC_DWORD1
+; GFX1064-NEXT:    s_mov_b32 s66, -1
+; GFX1064-NEXT:    s_mov_b32 s67, 0x31e16000
 ; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v3, exec_hi, v3
-; GFX1064-NEXT:    s_add_u32 s48, s48, s11
-; GFX1064-NEXT:    s_addc_u32 s49, s49, 0
-; GFX1064-NEXT:    s_mov_b64 s[40:41], s[0:1]
+; GFX1064-NEXT:    s_add_u32 s64, s64, s11
+; GFX1064-NEXT:    s_addc_u32 s65, s65, 0
+; GFX1064-NEXT:    s_mov_b64 s[48:49], s[0:1]
 ; GFX1064-NEXT:    s_movk_i32 s32, 0x800
 ; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
 ; GFX1064-NEXT:    s_and_saveexec_b64 s[0:1], vcc
 ; GFX1064-NEXT:    s_cbranch_execz .LBB10_3
 ; GFX1064-NEXT:  ; %bb.1:
-; GFX1064-NEXT:    s_load_dwordx2 s[44:45], s[4:5], 0x24
+; GFX1064-NEXT:    s_load_dwordx2 s[52:53], s[4:5], 0x24
 ; GFX1064-NEXT:    v_lshlrev_b32_e32 v3, 20, v2
 ; GFX1064-NEXT:    v_lshlrev_b32_e32 v4, 10, v1
 ; GFX1064-NEXT:    s_mov_b32 s33, s10
-; GFX1064-NEXT:    s_mov_b32 s42, s9
-; GFX1064-NEXT:    s_mov_b32 s43, s8
+; GFX1064-NEXT:    s_mov_b32 s50, s9
+; GFX1064-NEXT:    s_mov_b32 s51, s8
 ; GFX1064-NEXT:    s_mov_b64 s[34:35], s[6:7]
 ; GFX1064-NEXT:    v_or3_b32 v40, v0, v4, v3
 ; GFX1064-NEXT:    s_mov_b64 s[36:37], s[4:5]
 ; GFX1064-NEXT:    s_mov_b64 s[38:39], s[2:3]
-; GFX1064-NEXT:    s_mov_b64 s[46:47], 0
+; GFX1064-NEXT:    s_mov_b64 s[54:55], 0
 ; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[44:45], 0x0
+; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[52:53], 0x0
 ; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1064-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX1064-NEXT:    v_mov_b32_e32 v1, s0
@@ -6876,38 +6876,38 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_defau
 ; GFX1064-NEXT:    s_getpc_b64 s[0:1]
 ; GFX1064-NEXT:    s_add_u32 s0, s0, __atomic_compare_exchange at gotpcrel32@lo+4
 ; GFX1064-NEXT:    s_addc_u32 s1, s1, __atomic_compare_exchange at gotpcrel32@hi+12
-; GFX1064-NEXT:    buffer_store_dword v2, off, s[48:51], 0 offset:4
-; GFX1064-NEXT:    buffer_store_dword v1, off, s[48:51], 0
+; GFX1064-NEXT:    buffer_store_dword v2, off, s[64:67], 0 offset:4
+; GFX1064-NEXT:    buffer_store_dword v1, off, s[64:67], 0
 ; GFX1064-NEXT:    s_load_dwordx2 s[16:17], s[0:1], 0x0
 ; GFX1064-NEXT:    v_mov_b32_e32 v31, v40
 ; GFX1064-NEXT:    v_mov_b32_e32 v0, 8
 ; GFX1064-NEXT:    v_mov_b32_e32 v1, 0
-; GFX1064-NEXT:    v_mov_b32_e32 v2, s44
+; GFX1064-NEXT:    v_mov_b32_e32 v2, s52
 ; GFX1064-NEXT:    v_mov_b32_e32 v5, 8
 ; GFX1064-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX1064-NEXT:    v_mov_b32_e32 v7, 0
-; GFX1064-NEXT:    s_mov_b64 s[0:1], s[48:49]
-; GFX1064-NEXT:    s_mov_b64 s[4:5], s[40:41]
+; GFX1064-NEXT:    s_mov_b64 s[0:1], s[64:65]
+; GFX1064-NEXT:    s_mov_b64 s[4:5], s[48:49]
 ; GFX1064-NEXT:    s_mov_b64 s[6:7], s[38:39]
 ; GFX1064-NEXT:    s_mov_b64 s[10:11], s[34:35]
-; GFX1064-NEXT:    s_mov_b32 s12, s43
-; GFX1064-NEXT:    s_mov_b32 s13, s42
+; GFX1064-NEXT:    s_mov_b32 s12, s51
+; GFX1064-NEXT:    s_mov_b32 s13, s50
 ; GFX1064-NEXT:    s_mov_b32 s14, s33
-; GFX1064-NEXT:    s_mov_b64 s[2:3], s[50:51]
+; GFX1064-NEXT:    s_mov_b64 s[2:3], s[66:67]
 ; GFX1064-NEXT:    v_max_f64 v[3:4], v[3:4], 4.0
-; GFX1064-NEXT:    buffer_store_dword v4, off, s[48:51], 0 offset:12
-; GFX1064-NEXT:    buffer_store_dword v3, off, s[48:51], 0 offset:8
-; GFX1064-NEXT:    v_mov_b32_e32 v3, s45
+; GFX1064-NEXT:    buffer_store_dword v4, off, s[64:67], 0 offset:12
+; GFX1064-NEXT:    buffer_store_dword v3, off, s[64:67], 0 offset:8
+; GFX1064-NEXT:    v_mov_b32_e32 v3, s53
 ; GFX1064-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1064-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GFX1064-NEXT:    s_clause 0x1
-; GFX1064-NEXT:    buffer_load_dword v1, off, s[48:51], 0
-; GFX1064-NEXT:    buffer_load_dword v2, off, s[48:51], 0 offset:4
+; GFX1064-NEXT:    buffer_load_dword v1, off, s[64:67], 0
+; GFX1064-NEXT:    buffer_load_dword v2, off, s[64:67], 0 offset:4
 ; GFX1064-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX1064-NEXT:    s_or_b64 s[46:47], vcc, s[46:47]
-; GFX1064-NEXT:    s_andn2_b64 exec, exec, s[46:47]
+; GFX1064-NEXT:    s_or_b64 s[54:55], vcc, s[54:55]
+; GFX1064-NEXT:    s_andn2_b64 exec, exec, s[54:55]
 ; GFX1064-NEXT:    s_cbranch_execnz .LBB10_2
 ; GFX1064-NEXT:  .LBB10_3:
 ; GFX1064-NEXT:    s_endpgm
@@ -6915,31 +6915,31 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_defau
 ; GFX1032-LABEL: global_atomic_fmax_double_uni_address_uni_value_default_scope_unsafe:
 ; GFX1032:       ; %bb.0:
 ; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v3, exec_lo, 0
-; GFX1032-NEXT:    s_mov_b32 s48, SCRATCH_RSRC_DWORD0
-; GFX1032-NEXT:    s_mov_b32 s49, SCRATCH_RSRC_DWORD1
-; GFX1032-NEXT:    s_mov_b32 s50, -1
-; GFX1032-NEXT:    s_mov_b32 s51, 0x31c16000
+; GFX1032-NEXT:    s_mov_b32 s64, SCRATCH_RSRC_DWORD0
+; GFX1032-NEXT:    s_mov_b32 s65, SCRATCH_RSRC_DWORD1
+; GFX1032-NEXT:    s_mov_b32 s66, -1
+; GFX1032-NEXT:    s_mov_b32 s67, 0x31c16000
 ; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v3
-; GFX1032-NEXT:    s_add_u32 s48, s48, s11
-; GFX1032-NEXT:    s_addc_u32 s49, s49, 0
-; GFX1032-NEXT:    s_mov_b64 s[40:41], s[0:1]
-; GFX1032-NEXT:    s_mov_b32 s46, 0
+; GFX1032-NEXT:    s_add_u32 s64, s64, s11
+; GFX1032-NEXT:    s_addc_u32 s65, s65, 0
+; GFX1032-NEXT:    s_mov_b64 s[48:49], s[0:1]
+; GFX1032-NEXT:    s_mov_b32 s54, 0
 ; GFX1032-NEXT:    s_movk_i32 s32, 0x400
 ; GFX1032-NEXT:    s_and_saveexec_b32 s0, vcc_lo
 ; GFX1032-NEXT:    s_cbranch_execz .LBB10_3
 ; GFX1032-NEXT:  ; %bb.1:
-; GFX1032-NEXT:    s_load_dwordx2 s[44:45], s[4:5], 0x24
+; GFX1032-NEXT:    s_load_dwordx2 s[52:53], s[4:5], 0x24
 ; GFX1032-NEXT:    v_lshlrev_b32_e32 v3, 20, v2
 ; GFX1032-NEXT:    v_lshlrev_b32_e32 v4, 10, v1
 ; GFX1032-NEXT:    s_mov_b32 s33, s10
-; GFX1032-NEXT:    s_mov_b32 s42, s9
-; GFX1032-NEXT:    s_mov_b32 s43, s8
+; GFX1032-NEXT:    s_mov_b32 s50, s9
+; GFX1032-NEXT:    s_mov_b32 s51, s8
 ; GFX1032-NEXT:    s_mov_b64 s[34:35], s[6:7]
 ; GFX1032-NEXT:    v_or3_b32 v40, v0, v4, v3
 ; GFX1032-NEXT:    s_mov_b64 s[36:37], s[4:5]
 ; GFX1032-NEXT:    s_mov_b64 s[38:39], s[2:3]
 ; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[44:45], 0x0
+; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[52:53], 0x0
 ; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1032-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX1032-NEXT:    v_mov_b32_e32 v1, s0
@@ -6952,38 +6952,38 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_defau
 ; GFX1032-NEXT:    s_getpc_b64 s[0:1]
 ; GFX1032-NEXT:    s_add_u32 s0, s0, __atomic_compare_exchange at gotpcrel32@lo+4
 ; GFX1032-NEXT:    s_addc_u32 s1, s1, __atomic_compare_exchange at gotpcrel32@hi+12
-; GFX1032-NEXT:    buffer_store_dword v2, off, s[48:51], 0 offset:4
-; GFX1032-NEXT:    buffer_store_dword v1, off, s[48:51], 0
+; GFX1032-NEXT:    buffer_store_dword v2, off, s[64:67], 0 offset:4
+; GFX1032-NEXT:    buffer_store_dword v1, off, s[64:67], 0
 ; GFX1032-NEXT:    s_load_dwordx2 s[16:17], s[0:1], 0x0
 ; GFX1032-NEXT:    v_mov_b32_e32 v31, v40
 ; GFX1032-NEXT:    v_mov_b32_e32 v0, 8
 ; GFX1032-NEXT:    v_mov_b32_e32 v1, 0
-; GFX1032-NEXT:    v_mov_b32_e32 v2, s44
+; GFX1032-NEXT:    v_mov_b32_e32 v2, s52
 ; GFX1032-NEXT:    v_mov_b32_e32 v5, 8
 ; GFX1032-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX1032-NEXT:    v_mov_b32_e32 v7, 0
-; GFX1032-NEXT:    s_mov_b64 s[0:1], s[48:49]
-; GFX1032-NEXT:    s_mov_b64 s[4:5], s[40:41]
+; GFX1032-NEXT:    s_mov_b64 s[0:1], s[64:65]
+; GFX1032-NEXT:    s_mov_b64 s[4:5], s[48:49]
 ; GFX1032-NEXT:    s_mov_b64 s[6:7], s[38:39]
 ; GFX1032-NEXT:    s_mov_b64 s[10:11], s[34:35]
-; GFX1032-NEXT:    s_mov_b32 s12, s43
-; GFX1032-NEXT:    s_mov_b32 s13, s42
+; GFX1032-NEXT:    s_mov_b32 s12, s51
+; GFX1032-NEXT:    s_mov_b32 s13, s50
 ; GFX1032-NEXT:    s_mov_b32 s14, s33
-; GFX1032-NEXT:    s_mov_b64 s[2:3], s[50:51]
+; GFX1032-NEXT:    s_mov_b64 s[2:3], s[66:67]
 ; GFX1032-NEXT:    v_max_f64 v[3:4], v[3:4], 4.0
-; GFX1032-NEXT:    buffer_store_dword v4, off, s[48:51], 0 offset:12
-; GFX1032-NEXT:    buffer_store_dword v3, off, s[48:51], 0 offset:8
-; GFX1032-NEXT:    v_mov_b32_e32 v3, s45
+; GFX1032-NEXT:    buffer_store_dword v4, off, s[64:67], 0 offset:12
+; GFX1032-NEXT:    buffer_store_dword v3, off, s[64:67], 0 offset:8
+; GFX1032-NEXT:    v_mov_b32_e32 v3, s53
 ; GFX1032-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1032-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GFX1032-NEXT:    s_clause 0x1
-; GFX1032-NEXT:    buffer_load_dword v1, off, s[48:51], 0
-; GFX1032-NEXT:    buffer_load_dword v2, off, s[48:51], 0 offset:4
+; GFX1032-NEXT:    buffer_load_dword v1, off, s[64:67], 0
+; GFX1032-NEXT:    buffer_load_dword v2, off, s[64:67], 0 offset:4
 ; GFX1032-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX1032-NEXT:    s_or_b32 s46, vcc_lo, s46
-; GFX1032-NEXT:    s_andn2_b32 exec_lo, exec_lo, s46
+; GFX1032-NEXT:    s_or_b32 s54, vcc_lo, s54
+; GFX1032-NEXT:    s_andn2_b32 exec_lo, exec_lo, s54
 ; GFX1032-NEXT:    s_cbranch_execnz .LBB10_2
 ; GFX1032-NEXT:  .LBB10_3:
 ; GFX1032-NEXT:    s_endpgm
@@ -6992,7 +6992,7 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_defau
 ; GFX1164:       ; %bb.0:
 ; GFX1164-NEXT:    v_mov_b32_e32 v40, v0
 ; GFX1164-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1164-NEXT:    s_mov_b64 s[40:41], s[0:1]
+; GFX1164-NEXT:    s_mov_b64 s[48:49], s[0:1]
 ; GFX1164-NEXT:    s_mov_b32 s32, 32
 ; GFX1164-NEXT:    s_mov_b64 s[0:1], exec
 ; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
@@ -7000,16 +7000,16 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_defau
 ; GFX1164-NEXT:    v_cmpx_eq_u32_e32 0, v0
 ; GFX1164-NEXT:    s_cbranch_execz .LBB10_3
 ; GFX1164-NEXT:  ; %bb.1:
-; GFX1164-NEXT:    s_load_b64 s[44:45], s[4:5], 0x24
+; GFX1164-NEXT:    s_load_b64 s[52:53], s[4:5], 0x24
 ; GFX1164-NEXT:    s_mov_b32 s33, s10
-; GFX1164-NEXT:    s_mov_b32 s42, s9
-; GFX1164-NEXT:    s_mov_b32 s43, s8
+; GFX1164-NEXT:    s_mov_b32 s50, s9
+; GFX1164-NEXT:    s_mov_b32 s51, s8
 ; GFX1164-NEXT:    s_mov_b64 s[34:35], s[6:7]
 ; GFX1164-NEXT:    s_mov_b64 s[36:37], s[4:5]
 ; GFX1164-NEXT:    s_mov_b64 s[38:39], s[2:3]
-; GFX1164-NEXT:    s_mov_b64 s[46:47], 0
+; GFX1164-NEXT:    s_mov_b64 s[54:55], 0
 ; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT:    s_load_b64 s[0:1], s[44:45], 0x0
+; GFX1164-NEXT:    s_load_b64 s[0:1], s[52:53], 0x0
 ; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1164-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX1164-NEXT:    v_mov_b32_e32 v1, s0
@@ -7031,18 +7031,18 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_defau
 ; GFX1164-NEXT:    v_mov_b32_e32 v5, 8
 ; GFX1164-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX1164-NEXT:    v_mov_b32_e32 v7, 0
-; GFX1164-NEXT:    s_mov_b64 s[4:5], s[40:41]
+; GFX1164-NEXT:    s_mov_b64 s[4:5], s[48:49]
 ; GFX1164-NEXT:    s_mov_b64 s[6:7], s[38:39]
 ; GFX1164-NEXT:    s_mov_b64 s[10:11], s[34:35]
-; GFX1164-NEXT:    s_mov_b32 s12, s43
-; GFX1164-NEXT:    s_mov_b32 s13, s42
+; GFX1164-NEXT:    s_mov_b32 s12, s51
+; GFX1164-NEXT:    s_mov_b32 s13, s50
 ; GFX1164-NEXT:    s_mov_b32 s14, s33
 ; GFX1164-NEXT:    v_max_f64 v[3:4], v[3:4], 4.0
 ; GFX1164-NEXT:    scratch_store_b64 off, v[1:2], off
 ; GFX1164-NEXT:    v_mov_b32_e32 v1, 0
-; GFX1164-NEXT:    v_mov_b32_e32 v2, s44
+; GFX1164-NEXT:    v_mov_b32_e32 v2, s52
 ; GFX1164-NEXT:    scratch_store_b64 off, v[3:4], off offset:8
-; GFX1164-NEXT:    v_mov_b32_e32 v3, s45
+; GFX1164-NEXT:    v_mov_b32_e32 v3, s53
 ; GFX1164-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1164-NEXT:    s_swappc_b64 s[30:31], s[0:1]
@@ -7050,8 +7050,8 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_defau
 ; GFX1164-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
 ; GFX1164-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX1164-NEXT:    s_or_b64 s[46:47], vcc, s[46:47]
-; GFX1164-NEXT:    s_and_not1_b64 exec, exec, s[46:47]
+; GFX1164-NEXT:    s_or_b64 s[54:55], vcc, s[54:55]
+; GFX1164-NEXT:    s_and_not1_b64 exec, exec, s[54:55]
 ; GFX1164-NEXT:    s_cbranch_execnz .LBB10_2
 ; GFX1164-NEXT:  .LBB10_3:
 ; GFX1164-NEXT:    s_set_inst_prefetch_distance 0x2
@@ -7061,22 +7061,22 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_defau
 ; GFX1132:       ; %bb.0:
 ; GFX1132-NEXT:    v_mov_b32_e32 v40, v0
 ; GFX1132-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1132-NEXT:    s_mov_b64 s[40:41], s[0:1]
-; GFX1132-NEXT:    s_mov_b32 s46, 0
+; GFX1132-NEXT:    s_mov_b64 s[48:49], s[0:1]
+; GFX1132-NEXT:    s_mov_b32 s54, 0
 ; GFX1132-NEXT:    s_mov_b32 s32, 32
 ; GFX1132-NEXT:    s_mov_b32 s0, exec_lo
 ; GFX1132-NEXT:    v_cmpx_eq_u32_e32 0, v0
 ; GFX1132-NEXT:    s_cbranch_execz .LBB10_3
 ; GFX1132-NEXT:  ; %bb.1:
-; GFX1132-NEXT:    s_load_b64 s[44:45], s[4:5], 0x24
+; GFX1132-NEXT:    s_load_b64 s[52:53], s[4:5], 0x24
 ; GFX1132-NEXT:    s_mov_b32 s33, s15
-; GFX1132-NEXT:    s_mov_b32 s42, s14
-; GFX1132-NEXT:    s_mov_b32 s43, s13
+; GFX1132-NEXT:    s_mov_b32 s50, s14
+; GFX1132-NEXT:    s_mov_b32 s51, s13
 ; GFX1132-NEXT:    s_mov_b64 s[34:35], s[6:7]
 ; GFX1132-NEXT:    s_mov_b64 s[36:37], s[4:5]
 ; GFX1132-NEXT:    s_mov_b64 s[38:39], s[2:3]
 ; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT:    s_load_b64 s[0:1], s[44:45], 0x0
+; GFX1132-NEXT:    s_load_b64 s[0:1], s[52:53], 0x0
 ; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132-NEXT:    v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v1, s0
 ; GFX1132-NEXT:    s_set_inst_prefetch_distance 0x1
@@ -7095,26 +7095,26 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_defau
 ; GFX1132-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
 ; GFX1132-NEXT:    v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0
 ; GFX1132-NEXT:    v_mov_b32_e32 v7, 0
-; GFX1132-NEXT:    s_mov_b64 s[4:5], s[40:41]
+; GFX1132-NEXT:    s_mov_b64 s[4:5], s[48:49]
 ; GFX1132-NEXT:    s_mov_b64 s[6:7], s[38:39]
 ; GFX1132-NEXT:    s_mov_b64 s[10:11], s[34:35]
-; GFX1132-NEXT:    s_mov_b32 s12, s43
-; GFX1132-NEXT:    s_mov_b32 s13, s42
+; GFX1132-NEXT:    s_mov_b32 s12, s51
+; GFX1132-NEXT:    s_mov_b32 s13, s50
 ; GFX1132-NEXT:    s_mov_b32 s14, s33
 ; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_4)
 ; GFX1132-NEXT:    v_max_f64 v[3:4], v[3:4], 4.0
 ; GFX1132-NEXT:    scratch_store_b64 off, v[1:2], off
-; GFX1132-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s44
+; GFX1132-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s52
 ; GFX1132-NEXT:    scratch_store_b64 off, v[3:4], off offset:8
-; GFX1132-NEXT:    v_dual_mov_b32 v3, s45 :: v_dual_mov_b32 v4, 0
+; GFX1132-NEXT:    v_dual_mov_b32 v3, s53 :: v_dual_mov_b32 v4, 0
 ; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX1132-NEXT:    scratch_load_b64 v[1:2], off, off
 ; GFX1132-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
 ; GFX1132-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX1132-NEXT:    s_or_b32 s46, vcc_lo, s46
-; GFX1132-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s46
+; GFX1132-NEXT:    s_or_b32 s54, vcc_lo, s54
+; GFX1132-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s54
 ; GFX1132-NEXT:    s_cbranch_execnz .LBB10_2
 ; GFX1132-NEXT:  .LBB10_3:
 ; GFX1132-NEXT:    s_set_inst_prefetch_distance 0x2
@@ -7123,13 +7123,13 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_defau
 ; GFX7LESS-DPP-LABEL: global_atomic_fmax_double_uni_address_uni_value_default_scope_unsafe:
 ; GFX7LESS-DPP:       ; %bb.0:
 ; GFX7LESS-DPP-NEXT:    s_movk_i32 s32, 0x800
-; GFX7LESS-DPP-NEXT:    s_mov_b32 s48, SCRATCH_RSRC_DWORD0
-; GFX7LESS-DPP-NEXT:    s_mov_b32 s49, SCRATCH_RSRC_DWORD1
-; GFX7LESS-DPP-NEXT:    s_mov_b32 s50, -1
-; GFX7LESS-DPP-NEXT:    s_mov_b32 s51, 0xe8f000
-; GFX7LESS-DPP-NEXT:    s_add_u32 s48, s48, s11
-; GFX7LESS-DPP-NEXT:    s_addc_u32 s49, s49, 0
-; GFX7LESS-DPP-NEXT:    s_mov_b64 s[40:41], s[0:1]
+; GFX7LESS-DPP-NEXT:    s_mov_b32 s64, SCRATCH_RSRC_DWORD0
+; GFX7LESS-DPP-NEXT:    s_mov_b32 s65, SCRATCH_RSRC_DWORD1
+; GFX7LESS-DPP-NEXT:    s_mov_b32 s66, -1
+; GFX7LESS-DPP-NEXT:    s_mov_b32 s67, 0xe8f000
+; GFX7LESS-DPP-NEXT:    s_add_u32 s64, s64, s11
+; GFX7LESS-DPP-NEXT:    s_addc_u32 s65, s65, 0
+; GFX7LESS-DPP-NEXT:    s_mov_b64 s[48:49], s[0:1]
 ; GFX7LESS-DPP-NEXT:    v_mbcnt_lo_u32_b32_e64 v3, exec_lo, 0
 ; GFX7LESS-DPP-NEXT:    v_mbcnt_hi_u32_b32_e32 v3, exec_hi, v3
 ; GFX7LESS-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
@@ -7137,15 +7137,15 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_defau
 ; GFX7LESS-DPP-NEXT:    s_cbranch_execz .LBB10_3
 ; GFX7LESS-DPP-NEXT:  ; %bb.1:
 ; GFX7LESS-DPP-NEXT:    s_mov_b32 s33, s10
-; GFX7LESS-DPP-NEXT:    s_mov_b32 s42, s9
-; GFX7LESS-DPP-NEXT:    s_mov_b32 s43, s8
+; GFX7LESS-DPP-NEXT:    s_mov_b32 s50, s9
+; GFX7LESS-DPP-NEXT:    s_mov_b32 s51, s8
 ; GFX7LESS-DPP-NEXT:    s_mov_b64 s[34:35], s[6:7]
 ; GFX7LESS-DPP-NEXT:    s_mov_b64 s[36:37], s[4:5]
 ; GFX7LESS-DPP-NEXT:    s_mov_b64 s[38:39], s[2:3]
-; GFX7LESS-DPP-NEXT:    s_load_dwordx2 s[44:45], s[4:5], 0x9
+; GFX7LESS-DPP-NEXT:    s_load_dwordx2 s[52:53], s[4:5], 0x9
 ; GFX7LESS-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7LESS-DPP-NEXT:    s_load_dwordx2 s[0:1], s[44:45], 0x0
-; GFX7LESS-DPP-NEXT:    s_mov_b64 s[46:47], 0
+; GFX7LESS-DPP-NEXT:    s_load_dwordx2 s[0:1], s[52:53], 0x0
+; GFX7LESS-DPP-NEXT:    s_mov_b64 s[54:55], 0
 ; GFX7LESS-DPP-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
 ; GFX7LESS-DPP-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
 ; GFX7LESS-DPP-NEXT:    v_or_b32_e32 v3, v0, v1
@@ -7157,8 +7157,8 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_defau
 ; GFX7LESS-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX7LESS-DPP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7LESS-DPP-NEXT:    v_max_f64 v[2:3], v[0:1], v[0:1]
-; GFX7LESS-DPP-NEXT:    buffer_store_dword v1, off, s[48:51], 0 offset:4
-; GFX7LESS-DPP-NEXT:    buffer_store_dword v0, off, s[48:51], 0
+; GFX7LESS-DPP-NEXT:    buffer_store_dword v1, off, s[64:67], 0 offset:4
+; GFX7LESS-DPP-NEXT:    buffer_store_dword v0, off, s[64:67], 0
 ; GFX7LESS-DPP-NEXT:    s_add_u32 s8, s36, 44
 ; GFX7LESS-DPP-NEXT:    s_waitcnt expcnt(0)
 ; GFX7LESS-DPP-NEXT:    v_max_f64 v[0:1], v[2:3], 4.0
@@ -7166,8 +7166,8 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_defau
 ; GFX7LESS-DPP-NEXT:    s_getpc_b64 s[0:1]
 ; GFX7LESS-DPP-NEXT:    s_add_u32 s0, s0, __atomic_compare_exchange at gotpcrel32@lo+4
 ; GFX7LESS-DPP-NEXT:    s_addc_u32 s1, s1, __atomic_compare_exchange at gotpcrel32@hi+12
-; GFX7LESS-DPP-NEXT:    buffer_store_dword v1, off, s[48:51], 0 offset:12
-; GFX7LESS-DPP-NEXT:    buffer_store_dword v0, off, s[48:51], 0 offset:8
+; GFX7LESS-DPP-NEXT:    buffer_store_dword v1, off, s[64:67], 0 offset:12
+; GFX7LESS-DPP-NEXT:    buffer_store_dword v0, off, s[64:67], 0 offset:8
 ; GFX7LESS-DPP-NEXT:    s_load_dwordx2 s[16:17], s[0:1], 0x0
 ; GFX7LESS-DPP-NEXT:    s_waitcnt expcnt(0)
 ; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v0, 8
@@ -7176,59 +7176,59 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_defau
 ; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v5, 8
 ; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v7, 0
-; GFX7LESS-DPP-NEXT:    s_mov_b64 s[4:5], s[40:41]
+; GFX7LESS-DPP-NEXT:    s_mov_b64 s[4:5], s[48:49]
 ; GFX7LESS-DPP-NEXT:    s_mov_b64 s[6:7], s[38:39]
 ; GFX7LESS-DPP-NEXT:    s_mov_b64 s[10:11], s[34:35]
-; GFX7LESS-DPP-NEXT:    s_mov_b32 s12, s43
-; GFX7LESS-DPP-NEXT:    s_mov_b32 s13, s42
+; GFX7LESS-DPP-NEXT:    s_mov_b32 s12, s51
+; GFX7LESS-DPP-NEXT:    s_mov_b32 s13, s50
 ; GFX7LESS-DPP-NEXT:    s_mov_b32 s14, s33
 ; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v31, v40
-; GFX7LESS-DPP-NEXT:    s_mov_b64 s[0:1], s[48:49]
-; GFX7LESS-DPP-NEXT:    s_mov_b64 s[2:3], s[50:51]
-; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v2, s44
-; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v3, s45
+; GFX7LESS-DPP-NEXT:    s_mov_b64 s[0:1], s[64:65]
+; GFX7LESS-DPP-NEXT:    s_mov_b64 s[2:3], s[66:67]
+; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v2, s52
+; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v3, s53
 ; GFX7LESS-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7LESS-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GFX7LESS-DPP-NEXT:    v_and_b32_e32 v2, 1, v0
-; GFX7LESS-DPP-NEXT:    buffer_load_dword v0, off, s[48:51], 0
-; GFX7LESS-DPP-NEXT:    buffer_load_dword v1, off, s[48:51], 0 offset:4
+; GFX7LESS-DPP-NEXT:    buffer_load_dword v0, off, s[64:67], 0
+; GFX7LESS-DPP-NEXT:    buffer_load_dword v1, off, s[64:67], 0 offset:4
 ; GFX7LESS-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v2
-; GFX7LESS-DPP-NEXT:    s_or_b64 s[46:47], vcc, s[46:47]
-; GFX7LESS-DPP-NEXT:    s_andn2_b64 exec, exec, s[46:47]
+; GFX7LESS-DPP-NEXT:    s_or_b64 s[54:55], vcc, s[54:55]
+; GFX7LESS-DPP-NEXT:    s_andn2_b64 exec, exec, s[54:55]
 ; GFX7LESS-DPP-NEXT:    s_cbranch_execnz .LBB10_2
 ; GFX7LESS-DPP-NEXT:  .LBB10_3:
 ; GFX7LESS-DPP-NEXT:    s_endpgm
 ;
 ; GFX9-DPP-LABEL: global_atomic_fmax_double_uni_address_uni_value_default_scope_unsafe:
 ; GFX9-DPP:       ; %bb.0:
-; GFX9-DPP-NEXT:    s_mov_b32 s48, SCRATCH_RSRC_DWORD0
-; GFX9-DPP-NEXT:    s_mov_b32 s49, SCRATCH_RSRC_DWORD1
-; GFX9-DPP-NEXT:    s_mov_b32 s50, -1
-; GFX9-DPP-NEXT:    s_mov_b32 s51, 0xe00000
+; GFX9-DPP-NEXT:    s_mov_b32 s64, SCRATCH_RSRC_DWORD0
+; GFX9-DPP-NEXT:    s_mov_b32 s65, SCRATCH_RSRC_DWORD1
+; GFX9-DPP-NEXT:    s_mov_b32 s66, -1
+; GFX9-DPP-NEXT:    s_mov_b32 s67, 0xe00000
 ; GFX9-DPP-NEXT:    v_mbcnt_lo_u32_b32 v3, exec_lo, 0
-; GFX9-DPP-NEXT:    s_add_u32 s48, s48, s11
+; GFX9-DPP-NEXT:    s_add_u32 s64, s64, s11
 ; GFX9-DPP-NEXT:    v_mbcnt_hi_u32_b32 v3, exec_hi, v3
-; GFX9-DPP-NEXT:    s_addc_u32 s49, s49, 0
-; GFX9-DPP-NEXT:    s_mov_b64 s[40:41], s[0:1]
+; GFX9-DPP-NEXT:    s_addc_u32 s65, s65, 0
+; GFX9-DPP-NEXT:    s_mov_b64 s[48:49], s[0:1]
 ; GFX9-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
 ; GFX9-DPP-NEXT:    s_movk_i32 s32, 0x800
 ; GFX9-DPP-NEXT:    s_and_saveexec_b64 s[0:1], vcc
 ; GFX9-DPP-NEXT:    s_cbranch_execz .LBB10_3
 ; GFX9-DPP-NEXT:  ; %bb.1:
-; GFX9-DPP-NEXT:    s_load_dwordx2 s[44:45], s[4:5], 0x24
+; GFX9-DPP-NEXT:    s_load_dwordx2 s[52:53], s[4:5], 0x24
 ; GFX9-DPP-NEXT:    v_lshlrev_b32_e32 v3, 20, v2
 ; GFX9-DPP-NEXT:    v_lshlrev_b32_e32 v4, 10, v1
 ; GFX9-DPP-NEXT:    s_mov_b32 s33, s10
-; GFX9-DPP-NEXT:    s_mov_b32 s42, s9
+; GFX9-DPP-NEXT:    s_mov_b32 s50, s9
 ; GFX9-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-DPP-NEXT:    s_load_dwordx2 s[0:1], s[44:45], 0x0
-; GFX9-DPP-NEXT:    s_mov_b32 s43, s8
+; GFX9-DPP-NEXT:    s_load_dwordx2 s[0:1], s[52:53], 0x0
+; GFX9-DPP-NEXT:    s_mov_b32 s51, s8
 ; GFX9-DPP-NEXT:    s_mov_b64 s[34:35], s[6:7]
 ; GFX9-DPP-NEXT:    s_mov_b64 s[36:37], s[4:5]
 ; GFX9-DPP-NEXT:    s_mov_b64 s[38:39], s[2:3]
 ; GFX9-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-DPP-NEXT:    v_mov_b32_e32 v2, s1
-; GFX9-DPP-NEXT:    s_mov_b64 s[46:47], 0
+; GFX9-DPP-NEXT:    s_mov_b64 s[54:55], 0
 ; GFX9-DPP-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX9-DPP-NEXT:    v_or3_b32 v40, v0, v4, v3
 ; GFX9-DPP-NEXT:  .LBB10_2: ; %atomicrmw.start
@@ -7241,36 +7241,36 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_defau
 ; GFX9-DPP-NEXT:    s_add_u32 s0, s0, __atomic_compare_exchange at gotpcrel32@lo+4
 ; GFX9-DPP-NEXT:    s_addc_u32 s1, s1, __atomic_compare_exchange at gotpcrel32@hi+12
 ; GFX9-DPP-NEXT:    s_load_dwordx2 s[16:17], s[0:1], 0x0
-; GFX9-DPP-NEXT:    s_mov_b64 s[0:1], s[48:49]
-; GFX9-DPP-NEXT:    buffer_store_dword v2, off, s[48:51], 0 offset:4
-; GFX9-DPP-NEXT:    buffer_store_dword v1, off, s[48:51], 0
-; GFX9-DPP-NEXT:    s_mov_b64 s[4:5], s[40:41]
+; GFX9-DPP-NEXT:    s_mov_b64 s[0:1], s[64:65]
+; GFX9-DPP-NEXT:    buffer_store_dword v2, off, s[64:67], 0 offset:4
+; GFX9-DPP-NEXT:    buffer_store_dword v1, off, s[64:67], 0
+; GFX9-DPP-NEXT:    s_mov_b64 s[4:5], s[48:49]
 ; GFX9-DPP-NEXT:    v_max_f64 v[3:4], v[3:4], 4.0
 ; GFX9-DPP-NEXT:    s_mov_b64 s[6:7], s[38:39]
 ; GFX9-DPP-NEXT:    s_mov_b64 s[10:11], s[34:35]
-; GFX9-DPP-NEXT:    s_mov_b32 s12, s43
-; GFX9-DPP-NEXT:    s_mov_b32 s13, s42
+; GFX9-DPP-NEXT:    s_mov_b32 s12, s51
+; GFX9-DPP-NEXT:    s_mov_b32 s13, s50
 ; GFX9-DPP-NEXT:    s_mov_b32 s14, s33
 ; GFX9-DPP-NEXT:    v_mov_b32_e32 v31, v40
-; GFX9-DPP-NEXT:    s_mov_b64 s[2:3], s[50:51]
-; GFX9-DPP-NEXT:    buffer_store_dword v4, off, s[48:51], 0 offset:12
-; GFX9-DPP-NEXT:    buffer_store_dword v3, off, s[48:51], 0 offset:8
+; GFX9-DPP-NEXT:    s_mov_b64 s[2:3], s[66:67]
+; GFX9-DPP-NEXT:    buffer_store_dword v4, off, s[64:67], 0 offset:12
+; GFX9-DPP-NEXT:    buffer_store_dword v3, off, s[64:67], 0 offset:8
 ; GFX9-DPP-NEXT:    v_mov_b32_e32 v0, 8
 ; GFX9-DPP-NEXT:    v_mov_b32_e32 v1, 0
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v2, s44
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v3, s45
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v2, s52
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v3, s53
 ; GFX9-DPP-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX9-DPP-NEXT:    v_mov_b32_e32 v5, 8
 ; GFX9-DPP-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX9-DPP-NEXT:    v_mov_b32_e32 v7, 0
 ; GFX9-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
-; GFX9-DPP-NEXT:    buffer_load_dword v1, off, s[48:51], 0
-; GFX9-DPP-NEXT:    buffer_load_dword v2, off, s[48:51], 0 offset:4
+; GFX9-DPP-NEXT:    buffer_load_dword v1, off, s[64:67], 0
+; GFX9-DPP-NEXT:    buffer_load_dword v2, off, s[64:67], 0 offset:4
 ; GFX9-DPP-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX9-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX9-DPP-NEXT:    s_or_b64 s[46:47], vcc, s[46:47]
-; GFX9-DPP-NEXT:    s_andn2_b64 exec, exec, s[46:47]
+; GFX9-DPP-NEXT:    s_or_b64 s[54:55], vcc, s[54:55]
+; GFX9-DPP-NEXT:    s_andn2_b64 exec, exec, s[54:55]
 ; GFX9-DPP-NEXT:    s_cbranch_execnz .LBB10_2
 ; GFX9-DPP-NEXT:  .LBB10_3:
 ; GFX9-DPP-NEXT:    s_endpgm
@@ -7278,32 +7278,32 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_defau
 ; GFX1064-DPP-LABEL: global_atomic_fmax_double_uni_address_uni_value_default_scope_unsafe:
 ; GFX1064-DPP:       ; %bb.0:
 ; GFX1064-DPP-NEXT:    v_mbcnt_lo_u32_b32 v3, exec_lo, 0
-; GFX1064-DPP-NEXT:    s_mov_b32 s48, SCRATCH_RSRC_DWORD0
-; GFX1064-DPP-NEXT:    s_mov_b32 s49, SCRATCH_RSRC_DWORD1
-; GFX1064-DPP-NEXT:    s_mov_b32 s50, -1
-; GFX1064-DPP-NEXT:    s_mov_b32 s51, 0x31e16000
+; GFX1064-DPP-NEXT:    s_mov_b32 s64, SCRATCH_RSRC_DWORD0
+; GFX1064-DPP-NEXT:    s_mov_b32 s65, SCRATCH_RSRC_DWORD1
+; GFX1064-DPP-NEXT:    s_mov_b32 s66, -1
+; GFX1064-DPP-NEXT:    s_mov_b32 s67, 0x31e16000
 ; GFX1064-DPP-NEXT:    v_mbcnt_hi_u32_b32 v3, exec_hi, v3
-; GFX1064-DPP-NEXT:    s_add_u32 s48, s48, s11
-; GFX1064-DPP-NEXT:    s_addc_u32 s49, s49, 0
-; GFX1064-DPP-NEXT:    s_mov_b64 s[40:41], s[0:1]
+; GFX1064-DPP-NEXT:    s_add_u32 s64, s64, s11
+; GFX1064-DPP-NEXT:    s_addc_u32 s65, s65, 0
+; GFX1064-DPP-NEXT:    s_mov_b64 s[48:49], s[0:1]
 ; GFX1064-DPP-NEXT:    s_movk_i32 s32, 0x800
 ; GFX1064-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
 ; GFX1064-DPP-NEXT:    s_and_saveexec_b64 s[0:1], vcc
 ; GFX1064-DPP-NEXT:    s_cbranch_execz .LBB10_3
 ; GFX1064-DPP-NEXT:  ; %bb.1:
-; GFX1064-DPP-NEXT:    s_load_dwordx2 s[44:45], s[4:5], 0x24
+; GFX1064-DPP-NEXT:    s_load_dwordx2 s[52:53], s[4:5], 0x24
 ; GFX1064-DPP-NEXT:    v_lshlrev_b32_e32 v3, 20, v2
 ; GFX1064-DPP-NEXT:    v_lshlrev_b32_e32 v4, 10, v1
 ; GFX1064-DPP-NEXT:    s_mov_b32 s33, s10
-; GFX1064-DPP-NEXT:    s_mov_b32 s42, s9
-; GFX1064-DPP-NEXT:    s_mov_b32 s43, s8
+; GFX1064-DPP-NEXT:    s_mov_b32 s50, s9
+; GFX1064-DPP-NEXT:    s_mov_b32 s51, s8
 ; GFX1064-DPP-NEXT:    s_mov_b64 s[34:35], s[6:7]
 ; GFX1064-DPP-NEXT:    v_or3_b32 v40, v0, v4, v3
 ; GFX1064-DPP-NEXT:    s_mov_b64 s[36:37], s[4:5]
 ; GFX1064-DPP-NEXT:    s_mov_b64 s[38:39], s[2:3]
-; GFX1064-DPP-NEXT:    s_mov_b64 s[46:47], 0
+; GFX1064-DPP-NEXT:    s_mov_b64 s[54:55], 0
 ; GFX1064-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1064-DPP-NEXT:    s_load_dwordx2 s[0:1], s[44:45], 0x0
+; GFX1064-DPP-NEXT:    s_load_dwordx2 s[0:1], s[52:53], 0x0
 ; GFX1064-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1064-DPP-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX1064-DPP-NEXT:    v_mov_b32_e32 v1, s0
@@ -7316,38 +7316,38 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_defau
 ; GFX1064-DPP-NEXT:    s_getpc_b64 s[0:1]
 ; GFX1064-DPP-NEXT:    s_add_u32 s0, s0, __atomic_compare_exchange at gotpcrel32@lo+4
 ; GFX1064-DPP-NEXT:    s_addc_u32 s1, s1, __atomic_compare_exchange at gotpcrel32@hi+12
-; GFX1064-DPP-NEXT:    buffer_store_dword v2, off, s[48:51], 0 offset:4
-; GFX1064-DPP-NEXT:    buffer_store_dword v1, off, s[48:51], 0
+; GFX1064-DPP-NEXT:    buffer_store_dword v2, off, s[64:67], 0 offset:4
+; GFX1064-DPP-NEXT:    buffer_store_dword v1, off, s[64:67], 0
 ; GFX1064-DPP-NEXT:    s_load_dwordx2 s[16:17], s[0:1], 0x0
 ; GFX1064-DPP-NEXT:    v_mov_b32_e32 v31, v40
 ; GFX1064-DPP-NEXT:    v_mov_b32_e32 v0, 8
 ; GFX1064-DPP-NEXT:    v_mov_b32_e32 v1, 0
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v2, s44
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v2, s52
 ; GFX1064-DPP-NEXT:    v_mov_b32_e32 v5, 8
 ; GFX1064-DPP-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX1064-DPP-NEXT:    v_mov_b32_e32 v7, 0
-; GFX1064-DPP-NEXT:    s_mov_b64 s[0:1], s[48:49]
-; GFX1064-DPP-NEXT:    s_mov_b64 s[4:5], s[40:41]
+; GFX1064-DPP-NEXT:    s_mov_b64 s[0:1], s[64:65]
+; GFX1064-DPP-NEXT:    s_mov_b64 s[4:5], s[48:49]
 ; GFX1064-DPP-NEXT:    s_mov_b64 s[6:7], s[38:39]
 ; GFX1064-DPP-NEXT:    s_mov_b64 s[10:11], s[34:35]
-; GFX1064-DPP-NEXT:    s_mov_b32 s12, s43
-; GFX1064-DPP-NEXT:    s_mov_b32 s13, s42
+; GFX1064-DPP-NEXT:    s_mov_b32 s12, s51
+; GFX1064-DPP-NEXT:    s_mov_b32 s13, s50
 ; GFX1064-DPP-NEXT:    s_mov_b32 s14, s33
-; GFX1064-DPP-NEXT:    s_mov_b64 s[2:3], s[50:51]
+; GFX1064-DPP-NEXT:    s_mov_b64 s[2:3], s[66:67]
 ; GFX1064-DPP-NEXT:    v_max_f64 v[3:4], v[3:4], 4.0
-; GFX1064-DPP-NEXT:    buffer_store_dword v4, off, s[48:51], 0 offset:12
-; GFX1064-DPP-NEXT:    buffer_store_dword v3, off, s[48:51], 0 offset:8
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v3, s45
+; GFX1064-DPP-NEXT:    buffer_store_dword v4, off, s[64:67], 0 offset:12
+; GFX1064-DPP-NEXT:    buffer_store_dword v3, off, s[64:67], 0 offset:8
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v3, s53
 ; GFX1064-DPP-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX1064-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1064-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GFX1064-DPP-NEXT:    s_clause 0x1
-; GFX1064-DPP-NEXT:    buffer_load_dword v1, off, s[48:51], 0
-; GFX1064-DPP-NEXT:    buffer_load_dword v2, off, s[48:51], 0 offset:4
+; GFX1064-DPP-NEXT:    buffer_load_dword v1, off, s[64:67], 0
+; GFX1064-DPP-NEXT:    buffer_load_dword v2, off, s[64:67], 0 offset:4
 ; GFX1064-DPP-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX1064-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX1064-DPP-NEXT:    s_or_b64 s[46:47], vcc, s[46:47]
-; GFX1064-DPP-NEXT:    s_andn2_b64 exec, exec, s[46:47]
+; GFX1064-DPP-NEXT:    s_or_b64 s[54:55], vcc, s[54:55]
+; GFX1064-DPP-NEXT:    s_andn2_b64 exec, exec, s[54:55]
 ; GFX1064-DPP-NEXT:    s_cbranch_execnz .LBB10_2
 ; GFX1064-DPP-NEXT:  .LBB10_3:
 ; GFX1064-DPP-NEXT:    s_endpgm
@@ -7355,31 +7355,31 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_defau
 ; GFX1032-DPP-LABEL: global_atomic_fmax_double_uni_address_uni_value_default_scope_unsafe:
 ; GFX1032-DPP:       ; %bb.0:
 ; GFX1032-DPP-NEXT:    v_mbcnt_lo_u32_b32 v3, exec_lo, 0
-; GFX1032-DPP-NEXT:    s_mov_b32 s48, SCRATCH_RSRC_DWORD0
-; GFX1032-DPP-NEXT:    s_mov_b32 s49, SCRATCH_RSRC_DWORD1
-; GFX1032-DPP-NEXT:    s_mov_b32 s50, -1
-; GFX1032-DPP-NEXT:    s_mov_b32 s51, 0x31c16000
+; GFX1032-DPP-NEXT:    s_mov_b32 s64, SCRATCH_RSRC_DWORD0
+; GFX1032-DPP-NEXT:    s_mov_b32 s65, SCRATCH_RSRC_DWORD1
+; GFX1032-DPP-NEXT:    s_mov_b32 s66, -1
+; GFX1032-DPP-NEXT:    s_mov_b32 s67, 0x31c16000
 ; GFX1032-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v3
-; GFX1032-DPP-NEXT:    s_add_u32 s48, s48, s11
-; GFX1032-DPP-NEXT:    s_addc_u32 s49, s49, 0
-; GFX1032-DPP-NEXT:    s_mov_b64 s[40:41], s[0:1]
-; GFX1032-DPP-NEXT:    s_mov_b32 s46, 0
+; GFX1032-DPP-NEXT:    s_add_u32 s64, s64, s11
+; GFX1032-DPP-NEXT:    s_addc_u32 s65, s65, 0
+; GFX1032-DPP-NEXT:    s_mov_b64 s[48:49], s[0:1]
+; GFX1032-DPP-NEXT:    s_mov_b32 s54, 0
 ; GFX1032-DPP-NEXT:    s_movk_i32 s32, 0x400
 ; GFX1032-DPP-NEXT:    s_and_saveexec_b32 s0, vcc_lo
 ; GFX1032-DPP-NEXT:    s_cbranch_execz .LBB10_3
 ; GFX1032-DPP-NEXT:  ; %bb.1:
-; GFX1032-DPP-NEXT:    s_load_dwordx2 s[44:45], s[4:5], 0x24
+; GFX1032-DPP-NEXT:    s_load_dwordx2 s[52:53], s[4:5], 0x24
 ; GFX1032-DPP-NEXT:    v_lshlrev_b32_e32 v3, 20, v2
 ; GFX1032-DPP-NEXT:    v_lshlrev_b32_e32 v4, 10, v1
 ; GFX1032-DPP-NEXT:    s_mov_b32 s33, s10
-; GFX1032-DPP-NEXT:    s_mov_b32 s42, s9
-; GFX1032-DPP-NEXT:    s_mov_b32 s43, s8
+; GFX1032-DPP-NEXT:    s_mov_b32 s50, s9
+; GFX1032-DPP-NEXT:    s_mov_b32 s51, s8
 ; GFX1032-DPP-NEXT:    s_mov_b64 s[34:35], s[6:7]
 ; GFX1032-DPP-NEXT:    v_or3_b32 v40, v0, v4, v3
 ; GFX1032-DPP-NEXT:    s_mov_b64 s[36:37], s[4:5]
 ; GFX1032-DPP-NEXT:    s_mov_b64 s[38:39], s[2:3]
 ; GFX1032-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT:    s_load_dwordx2 s[0:1], s[44:45], 0x0
+; GFX1032-DPP-NEXT:    s_load_dwordx2 s[0:1], s[52:53], 0x0
 ; GFX1032-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v1, s0
@@ -7392,38 +7392,38 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_defau
 ; GFX1032-DPP-NEXT:    s_getpc_b64 s[0:1]
 ; GFX1032-DPP-NEXT:    s_add_u32 s0, s0, __atomic_compare_exchange at gotpcrel32@lo+4
 ; GFX1032-DPP-NEXT:    s_addc_u32 s1, s1, __atomic_compare_exchange at gotpcrel32@hi+12
-; GFX1032-DPP-NEXT:    buffer_store_dword v2, off, s[48:51], 0 offset:4
-; GFX1032-DPP-NEXT:    buffer_store_dword v1, off, s[48:51], 0
+; GFX1032-DPP-NEXT:    buffer_store_dword v2, off, s[64:67], 0 offset:4
+; GFX1032-DPP-NEXT:    buffer_store_dword v1, off, s[64:67], 0
 ; GFX1032-DPP-NEXT:    s_load_dwordx2 s[16:17], s[0:1], 0x0
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v31, v40
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v0, 8
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v1, 0
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v2, s44
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v2, s52
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v5, 8
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v7, 0
-; GFX1032-DPP-NEXT:    s_mov_b64 s[0:1], s[48:49]
-; GFX1032-DPP-NEXT:    s_mov_b64 s[4:5], s[40:41]
+; GFX1032-DPP-NEXT:    s_mov_b64 s[0:1], s[64:65]
+; GFX1032-DPP-NEXT:    s_mov_b64 s[4:5], s[48:49]
 ; GFX1032-DPP-NEXT:    s_mov_b64 s[6:7], s[38:39]
 ; GFX1032-DPP-NEXT:    s_mov_b64 s[10:11], s[34:35]
-; GFX1032-DPP-NEXT:    s_mov_b32 s12, s43
-; GFX1032-DPP-NEXT:    s_mov_b32 s13, s42
+; GFX1032-DPP-NEXT:    s_mov_b32 s12, s51
+; GFX1032-DPP-NEXT:    s_mov_b32 s13, s50
 ; GFX1032-DPP-NEXT:    s_mov_b32 s14, s33
-; GFX1032-DPP-NEXT:    s_mov_b64 s[2:3], s[50:51]
+; GFX1032-DPP-NEXT:    s_mov_b64 s[2:3], s[66:67]
 ; GFX1032-DPP-NEXT:    v_max_f64 v[3:4], v[3:4], 4.0
-; GFX1032-DPP-NEXT:    buffer_store_dword v4, off, s[48:51], 0 offset:12
-; GFX1032-DPP-NEXT:    buffer_store_dword v3, off, s[48:51], 0 offset:8
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v3, s45
+; GFX1032-DPP-NEXT:    buffer_store_dword v4, off, s[64:67], 0 offset:12
+; GFX1032-DPP-NEXT:    buffer_store_dword v3, off, s[64:67], 0 offset:8
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v3, s53
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX1032-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1032-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GFX1032-DPP-NEXT:    s_clause 0x1
-; GFX1032-DPP-NEXT:    buffer_load_dword v1, off, s[48:51], 0
-; GFX1032-DPP-NEXT:    buffer_load_dword v2, off, s[48:51], 0 offset:4
+; GFX1032-DPP-NEXT:    buffer_load_dword v1, off, s[64:67], 0
+; GFX1032-DPP-NEXT:    buffer_load_dword v2, off, s[64:67], 0 offset:4
 ; GFX1032-DPP-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX1032-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX1032-DPP-NEXT:    s_or_b32 s46, vcc_lo, s46
-; GFX1032-DPP-NEXT:    s_andn2_b32 exec_lo, exec_lo, s46
+; GFX1032-DPP-NEXT:    s_or_b32 s54, vcc_lo, s54
+; GFX1032-DPP-NEXT:    s_andn2_b32 exec_lo, exec_lo, s54
 ; GFX1032-DPP-NEXT:    s_cbranch_execnz .LBB10_2
 ; GFX1032-DPP-NEXT:  .LBB10_3:
 ; GFX1032-DPP-NEXT:    s_endpgm
@@ -7432,7 +7432,7 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_defau
 ; GFX1164-DPP:       ; %bb.0:
 ; GFX1164-DPP-NEXT:    v_mov_b32_e32 v40, v0
 ; GFX1164-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1164-DPP-NEXT:    s_mov_b64 s[40:41], s[0:1]
+; GFX1164-DPP-NEXT:    s_mov_b64 s[48:49], s[0:1]
 ; GFX1164-DPP-NEXT:    s_mov_b32 s32, 32
 ; GFX1164-DPP-NEXT:    s_mov_b64 s[0:1], exec
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
@@ -7440,16 +7440,16 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_defau
 ; GFX1164-DPP-NEXT:    v_cmpx_eq_u32_e32 0, v0
 ; GFX1164-DPP-NEXT:    s_cbranch_execz .LBB10_3
 ; GFX1164-DPP-NEXT:  ; %bb.1:
-; GFX1164-DPP-NEXT:    s_load_b64 s[44:45], s[4:5], 0x24
+; GFX1164-DPP-NEXT:    s_load_b64 s[52:53], s[4:5], 0x24
 ; GFX1164-DPP-NEXT:    s_mov_b32 s33, s10
-; GFX1164-DPP-NEXT:    s_mov_b32 s42, s9
-; GFX1164-DPP-NEXT:    s_mov_b32 s43, s8
+; GFX1164-DPP-NEXT:    s_mov_b32 s50, s9
+; GFX1164-DPP-NEXT:    s_mov_b32 s51, s8
 ; GFX1164-DPP-NEXT:    s_mov_b64 s[34:35], s[6:7]
 ; GFX1164-DPP-NEXT:    s_mov_b64 s[36:37], s[4:5]
 ; GFX1164-DPP-NEXT:    s_mov_b64 s[38:39], s[2:3]
-; GFX1164-DPP-NEXT:    s_mov_b64 s[46:47], 0
+; GFX1164-DPP-NEXT:    s_mov_b64 s[54:55], 0
 ; GFX1164-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT:    s_load_b64 s[0:1], s[44:45], 0x0
+; GFX1164-DPP-NEXT:    s_load_b64 s[0:1], s[52:53], 0x0
 ; GFX1164-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1164-DPP-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX1164-DPP-NEXT:    v_mov_b32_e32 v1, s0
@@ -7471,18 +7471,18 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_defau
 ; GFX1164-DPP-NEXT:    v_mov_b32_e32 v5, 8
 ; GFX1164-DPP-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX1164-DPP-NEXT:    v_mov_b32_e32 v7, 0
-; GFX1164-DPP-NEXT:    s_mov_b64 s[4:5], s[40:41]
+; GFX1164-DPP-NEXT:    s_mov_b64 s[4:5], s[48:49]
 ; GFX1164-DPP-NEXT:    s_mov_b64 s[6:7], s[38:39]
 ; GFX1164-DPP-NEXT:    s_mov_b64 s[10:11], s[34:35]
-; GFX1164-DPP-NEXT:    s_mov_b32 s12, s43
-; GFX1164-DPP-NEXT:    s_mov_b32 s13, s42
+; GFX1164-DPP-NEXT:    s_mov_b32 s12, s51
+; GFX1164-DPP-NEXT:    s_mov_b32 s13, s50
 ; GFX1164-DPP-NEXT:    s_mov_b32 s14, s33
 ; GFX1164-DPP-NEXT:    v_max_f64 v[3:4], v[3:4], 4.0
 ; GFX1164-DPP-NEXT:    scratch_store_b64 off, v[1:2], off
 ; GFX1164-DPP-NEXT:    v_mov_b32_e32 v1, 0
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v2, s44
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v2, s52
 ; GFX1164-DPP-NEXT:    scratch_store_b64 off, v[3:4], off offset:8
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v3, s45
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v3, s53
 ; GFX1164-DPP-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX1164-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1164-DPP-NEXT:    s_swappc_b64 s[30:31], s[0:1]
@@ -7490,8 +7490,8 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_defau
 ; GFX1164-DPP-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
 ; GFX1164-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX1164-DPP-NEXT:    s_or_b64 s[46:47], vcc, s[46:47]
-; GFX1164-DPP-NEXT:    s_and_not1_b64 exec, exec, s[46:47]
+; GFX1164-DPP-NEXT:    s_or_b64 s[54:55], vcc, s[54:55]
+; GFX1164-DPP-NEXT:    s_and_not1_b64 exec, exec, s[54:55]
 ; GFX1164-DPP-NEXT:    s_cbranch_execnz .LBB10_2
 ; GFX1164-DPP-NEXT:  .LBB10_3:
 ; GFX1164-DPP-NEXT:    s_set_inst_prefetch_distance 0x2
@@ -7501,22 +7501,22 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_defau
 ; GFX1132-DPP:       ; %bb.0:
 ; GFX1132-DPP-NEXT:    v_mov_b32_e32 v40, v0
 ; GFX1132-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1132-DPP-NEXT:    s_mov_b64 s[40:41], s[0:1]
-; GFX1132-DPP-NEXT:    s_mov_b32 s46, 0
+; GFX1132-DPP-NEXT:    s_mov_b64 s[48:49], s[0:1]
+; GFX1132-DPP-NEXT:    s_mov_b32 s54, 0
 ; GFX1132-DPP-NEXT:    s_mov_b32 s32, 32
 ; GFX1132-DPP-NEXT:    s_mov_b32 s0, exec_lo
 ; GFX1132-DPP-NEXT:    v_cmpx_eq_u32_e32 0, v0
 ; GFX1132-DPP-NEXT:    s_cbranch_execz .LBB10_3
 ; GFX1132-DPP-NEXT:  ; %bb.1:
-; GFX1132-DPP-NEXT:    s_load_b64 s[44:45], s[4:5], 0x24
+; GFX1132-DPP-NEXT:    s_load_b64 s[52:53], s[4:5], 0x24
 ; GFX1132-DPP-NEXT:    s_mov_b32 s33, s15
-; GFX1132-DPP-NEXT:    s_mov_b32 s42, s14
-; GFX1132-DPP-NEXT:    s_mov_b32 s43, s13
+; GFX1132-DPP-NEXT:    s_mov_b32 s50, s14
+; GFX1132-DPP-NEXT:    s_mov_b32 s51, s13
 ; GFX1132-DPP-NEXT:    s_mov_b64 s[34:35], s[6:7]
 ; GFX1132-DPP-NEXT:    s_mov_b64 s[36:37], s[4:5]
 ; GFX1132-DPP-NEXT:    s_mov_b64 s[38:39], s[2:3]
 ; GFX1132-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT:    s_load_b64 s[0:1], s[44:45], 0x0
+; GFX1132-DPP-NEXT:    s_load_b64 s[0:1], s[52:53], 0x0
 ; GFX1132-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132-DPP-NEXT:    v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v1, s0
 ; GFX1132-DPP-NEXT:    s_set_inst_prefetch_distance 0x1
@@ -7535,26 +7535,26 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_defau
 ; GFX1132-DPP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
 ; GFX1132-DPP-NEXT:    v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0
 ; GFX1132-DPP-NEXT:    v_mov_b32_e32 v7, 0
-; GFX1132-DPP-NEXT:    s_mov_b64 s[4:5], s[40:41]
+; GFX1132-DPP-NEXT:    s_mov_b64 s[4:5], s[48:49]
 ; GFX1132-DPP-NEXT:    s_mov_b64 s[6:7], s[38:39]
 ; GFX1132-DPP-NEXT:    s_mov_b64 s[10:11], s[34:35]
-; GFX1132-DPP-NEXT:    s_mov_b32 s12, s43
-; GFX1132-DPP-NEXT:    s_mov_b32 s13, s42
+; GFX1132-DPP-NEXT:    s_mov_b32 s12, s51
+; GFX1132-DPP-NEXT:    s_mov_b32 s13, s50
 ; GFX1132-DPP-NEXT:    s_mov_b32 s14, s33
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_4)
 ; GFX1132-DPP-NEXT:    v_max_f64 v[3:4], v[3:4], 4.0
 ; GFX1132-DPP-NEXT:    scratch_store_b64 off, v[1:2], off
-; GFX1132-DPP-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s44
+; GFX1132-DPP-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s52
 ; GFX1132-DPP-NEXT:    scratch_store_b64 off, v[3:4], off offset:8
-; GFX1132-DPP-NEXT:    v_dual_mov_b32 v3, s45 :: v_dual_mov_b32 v4, 0
+; GFX1132-DPP-NEXT:    v_dual_mov_b32 v3, s53 :: v_dual_mov_b32 v4, 0
 ; GFX1132-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132-DPP-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX1132-DPP-NEXT:    scratch_load_b64 v[1:2], off, off
 ; GFX1132-DPP-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
 ; GFX1132-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX1132-DPP-NEXT:    s_or_b32 s46, vcc_lo, s46
-; GFX1132-DPP-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s46
+; GFX1132-DPP-NEXT:    s_or_b32 s54, vcc_lo, s54
+; GFX1132-DPP-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s54
 ; GFX1132-DPP-NEXT:    s_cbranch_execnz .LBB10_2
 ; GFX1132-DPP-NEXT:  .LBB10_3:
 ; GFX1132-DPP-NEXT:    s_set_inst_prefetch_distance 0x2
@@ -7567,19 +7567,19 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau
 ; GFX7LESS-LABEL: global_atomic_fmax_double_uni_address_div_value_default_scope_unsafe:
 ; GFX7LESS:       ; %bb.0:
 ; GFX7LESS-NEXT:    s_movk_i32 s32, 0x800
-; GFX7LESS-NEXT:    s_mov_b32 s48, SCRATCH_RSRC_DWORD0
-; GFX7LESS-NEXT:    s_mov_b32 s49, SCRATCH_RSRC_DWORD1
-; GFX7LESS-NEXT:    s_mov_b32 s50, -1
-; GFX7LESS-NEXT:    s_mov_b32 s51, 0xe8f000
-; GFX7LESS-NEXT:    s_add_u32 s48, s48, s11
-; GFX7LESS-NEXT:    s_addc_u32 s49, s49, 0
+; GFX7LESS-NEXT:    s_mov_b32 s64, SCRATCH_RSRC_DWORD0
+; GFX7LESS-NEXT:    s_mov_b32 s65, SCRATCH_RSRC_DWORD1
+; GFX7LESS-NEXT:    s_mov_b32 s66, -1
+; GFX7LESS-NEXT:    s_mov_b32 s67, 0xe8f000
+; GFX7LESS-NEXT:    s_add_u32 s64, s64, s11
+; GFX7LESS-NEXT:    s_addc_u32 s65, s65, 0
 ; GFX7LESS-NEXT:    s_mov_b32 s33, s10
-; GFX7LESS-NEXT:    s_mov_b32 s42, s9
-; GFX7LESS-NEXT:    s_mov_b32 s43, s8
+; GFX7LESS-NEXT:    s_mov_b32 s50, s9
+; GFX7LESS-NEXT:    s_mov_b32 s51, s8
 ; GFX7LESS-NEXT:    s_mov_b64 s[34:35], s[6:7]
 ; GFX7LESS-NEXT:    s_mov_b64 s[36:37], s[4:5]
 ; GFX7LESS-NEXT:    s_mov_b64 s[38:39], s[2:3]
-; GFX7LESS-NEXT:    s_mov_b64 s[40:41], s[0:1]
+; GFX7LESS-NEXT:    s_mov_b64 s[48:49], s[0:1]
 ; GFX7LESS-NEXT:    s_add_u32 s8, s36, 44
 ; GFX7LESS-NEXT:    s_addc_u32 s9, s37, 0
 ; GFX7LESS-NEXT:    s_getpc_b64 s[0:1]
@@ -7590,15 +7590,15 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau
 ; GFX7LESS-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
 ; GFX7LESS-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GFX7LESS-NEXT:    v_or_b32_e32 v40, v0, v2
-; GFX7LESS-NEXT:    s_mov_b64 s[4:5], s[40:41]
+; GFX7LESS-NEXT:    s_mov_b64 s[4:5], s[48:49]
 ; GFX7LESS-NEXT:    s_mov_b64 s[6:7], s[2:3]
 ; GFX7LESS-NEXT:    s_mov_b64 s[10:11], s[34:35]
-; GFX7LESS-NEXT:    s_mov_b32 s12, s43
-; GFX7LESS-NEXT:    s_mov_b32 s13, s42
+; GFX7LESS-NEXT:    s_mov_b32 s12, s51
+; GFX7LESS-NEXT:    s_mov_b32 s13, s50
 ; GFX7LESS-NEXT:    s_mov_b32 s14, s33
 ; GFX7LESS-NEXT:    v_mov_b32_e32 v31, v40
-; GFX7LESS-NEXT:    s_mov_b64 s[0:1], s[48:49]
-; GFX7LESS-NEXT:    s_mov_b64 s[2:3], s[50:51]
+; GFX7LESS-NEXT:    s_mov_b64 s[0:1], s[64:65]
+; GFX7LESS-NEXT:    s_mov_b64 s[2:3], s[66:67]
 ; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7LESS-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GFX7LESS-NEXT:    s_mov_b64 s[0:1], exec
@@ -7625,19 +7625,19 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau
 ; GFX7LESS-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
 ; GFX7LESS-NEXT:    s_cbranch_execz .LBB11_5
 ; GFX7LESS-NEXT:  ; %bb.3:
-; GFX7LESS-NEXT:    s_load_dwordx2 s[44:45], s[36:37], 0x9
-; GFX7LESS-NEXT:    s_mov_b32 s47, 0xf000
-; GFX7LESS-NEXT:    s_mov_b32 s46, -1
+; GFX7LESS-NEXT:    s_load_dwordx2 s[52:53], s[36:37], 0x9
+; GFX7LESS-NEXT:    s_mov_b32 s55, 0xf000
+; GFX7LESS-NEXT:    s_mov_b32 s54, -1
 ; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7LESS-NEXT:    buffer_load_dwordx2 v[0:1], off, s[44:47], 0
-; GFX7LESS-NEXT:    s_mov_b64 s[46:47], 0
+; GFX7LESS-NEXT:    buffer_load_dwordx2 v[0:1], off, s[52:55], 0
+; GFX7LESS-NEXT:    s_mov_b64 s[54:55], 0
 ; GFX7LESS-NEXT:    v_max_f64 v[41:42], v[2:3], v[2:3]
 ; GFX7LESS-NEXT:  .LBB11_4: ; %atomicrmw.start
 ; GFX7LESS-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX7LESS-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7LESS-NEXT:    v_max_f64 v[2:3], v[0:1], v[0:1]
-; GFX7LESS-NEXT:    buffer_store_dword v1, off, s[48:51], 0 offset:4
-; GFX7LESS-NEXT:    buffer_store_dword v0, off, s[48:51], 0
+; GFX7LESS-NEXT:    buffer_store_dword v1, off, s[64:67], 0 offset:4
+; GFX7LESS-NEXT:    buffer_store_dword v0, off, s[64:67], 0
 ; GFX7LESS-NEXT:    s_add_u32 s8, s36, 44
 ; GFX7LESS-NEXT:    s_waitcnt expcnt(0)
 ; GFX7LESS-NEXT:    v_max_f64 v[0:1], v[2:3], v[41:42]
@@ -7645,8 +7645,8 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau
 ; GFX7LESS-NEXT:    s_getpc_b64 s[0:1]
 ; GFX7LESS-NEXT:    s_add_u32 s0, s0, __atomic_compare_exchange at gotpcrel32@lo+4
 ; GFX7LESS-NEXT:    s_addc_u32 s1, s1, __atomic_compare_exchange at gotpcrel32@hi+12
-; GFX7LESS-NEXT:    buffer_store_dword v1, off, s[48:51], 0 offset:12
-; GFX7LESS-NEXT:    buffer_store_dword v0, off, s[48:51], 0 offset:8
+; GFX7LESS-NEXT:    buffer_store_dword v1, off, s[64:67], 0 offset:12
+; GFX7LESS-NEXT:    buffer_store_dword v0, off, s[64:67], 0 offset:8
 ; GFX7LESS-NEXT:    s_load_dwordx2 s[16:17], s[0:1], 0x0
 ; GFX7LESS-NEXT:    s_waitcnt expcnt(0)
 ; GFX7LESS-NEXT:    v_mov_b32_e32 v0, 8
@@ -7655,43 +7655,43 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau
 ; GFX7LESS-NEXT:    v_mov_b32_e32 v5, 8
 ; GFX7LESS-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX7LESS-NEXT:    v_mov_b32_e32 v7, 0
-; GFX7LESS-NEXT:    s_mov_b64 s[4:5], s[40:41]
+; GFX7LESS-NEXT:    s_mov_b64 s[4:5], s[48:49]
 ; GFX7LESS-NEXT:    s_mov_b64 s[6:7], s[38:39]
 ; GFX7LESS-NEXT:    s_mov_b64 s[10:11], s[34:35]
-; GFX7LESS-NEXT:    s_mov_b32 s12, s43
-; GFX7LESS-NEXT:    s_mov_b32 s13, s42
+; GFX7LESS-NEXT:    s_mov_b32 s12, s51
+; GFX7LESS-NEXT:    s_mov_b32 s13, s50
 ; GFX7LESS-NEXT:    s_mov_b32 s14, s33
 ; GFX7LESS-NEXT:    v_mov_b32_e32 v31, v40
-; GFX7LESS-NEXT:    s_mov_b64 s[0:1], s[48:49]
-; GFX7LESS-NEXT:    s_mov_b64 s[2:3], s[50:51]
-; GFX7LESS-NEXT:    v_mov_b32_e32 v2, s44
-; GFX7LESS-NEXT:    v_mov_b32_e32 v3, s45
+; GFX7LESS-NEXT:    s_mov_b64 s[0:1], s[64:65]
+; GFX7LESS-NEXT:    s_mov_b64 s[2:3], s[66:67]
+; GFX7LESS-NEXT:    v_mov_b32_e32 v2, s52
+; GFX7LESS-NEXT:    v_mov_b32_e32 v3, s53
 ; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7LESS-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GFX7LESS-NEXT:    v_and_b32_e32 v2, 1, v0
-; GFX7LESS-NEXT:    buffer_load_dword v0, off, s[48:51], 0
-; GFX7LESS-NEXT:    buffer_load_dword v1, off, s[48:51], 0 offset:4
+; GFX7LESS-NEXT:    buffer_load_dword v0, off, s[64:67], 0
+; GFX7LESS-NEXT:    buffer_load_dword v1, off, s[64:67], 0 offset:4
 ; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v2
-; GFX7LESS-NEXT:    s_or_b64 s[46:47], vcc, s[46:47]
-; GFX7LESS-NEXT:    s_andn2_b64 exec, exec, s[46:47]
+; GFX7LESS-NEXT:    s_or_b64 s[54:55], vcc, s[54:55]
+; GFX7LESS-NEXT:    s_andn2_b64 exec, exec, s[54:55]
 ; GFX7LESS-NEXT:    s_cbranch_execnz .LBB11_4
 ; GFX7LESS-NEXT:  .LBB11_5:
 ; GFX7LESS-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: global_atomic_fmax_double_uni_address_div_value_default_scope_unsafe:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_mov_b32 s48, SCRATCH_RSRC_DWORD0
-; GFX9-NEXT:    s_mov_b32 s49, SCRATCH_RSRC_DWORD1
-; GFX9-NEXT:    s_mov_b32 s50, -1
-; GFX9-NEXT:    s_mov_b32 s51, 0xe00000
-; GFX9-NEXT:    s_add_u32 s48, s48, s11
-; GFX9-NEXT:    s_addc_u32 s49, s49, 0
+; GFX9-NEXT:    s_mov_b32 s64, SCRATCH_RSRC_DWORD0
+; GFX9-NEXT:    s_mov_b32 s65, SCRATCH_RSRC_DWORD1
+; GFX9-NEXT:    s_mov_b32 s66, -1
+; GFX9-NEXT:    s_mov_b32 s67, 0xe00000
+; GFX9-NEXT:    s_add_u32 s64, s64, s11
+; GFX9-NEXT:    s_addc_u32 s65, s65, 0
 ; GFX9-NEXT:    s_mov_b64 s[36:37], s[4:5]
-; GFX9-NEXT:    s_mov_b32 s43, s8
+; GFX9-NEXT:    s_mov_b32 s51, s8
 ; GFX9-NEXT:    s_add_u32 s8, s36, 44
-; GFX9-NEXT:    s_mov_b32 s42, s9
+; GFX9-NEXT:    s_mov_b32 s50, s9
 ; GFX9-NEXT:    s_addc_u32 s9, s37, 0
-; GFX9-NEXT:    s_mov_b64 s[40:41], s[0:1]
+; GFX9-NEXT:    s_mov_b64 s[48:49], s[0:1]
 ; GFX9-NEXT:    s_getpc_b64 s[0:1]
 ; GFX9-NEXT:    s_add_u32 s0, s0, div.double.value at gotpcrel32@lo+4
 ; GFX9-NEXT:    s_addc_u32 s1, s1, div.double.value at gotpcrel32@hi+12
@@ -7703,14 +7703,14 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau
 ; GFX9-NEXT:    s_mov_b64 s[38:39], s[2:3]
 ; GFX9-NEXT:    v_or3_b32 v40, v0, v1, v2
 ; GFX9-NEXT:    s_mov_b64 s[6:7], s[2:3]
-; GFX9-NEXT:    s_mov_b64 s[0:1], s[48:49]
-; GFX9-NEXT:    s_mov_b64 s[4:5], s[40:41]
+; GFX9-NEXT:    s_mov_b64 s[0:1], s[64:65]
+; GFX9-NEXT:    s_mov_b64 s[4:5], s[48:49]
 ; GFX9-NEXT:    s_mov_b64 s[10:11], s[34:35]
-; GFX9-NEXT:    s_mov_b32 s12, s43
-; GFX9-NEXT:    s_mov_b32 s13, s42
+; GFX9-NEXT:    s_mov_b32 s12, s51
+; GFX9-NEXT:    s_mov_b32 s13, s50
 ; GFX9-NEXT:    s_mov_b32 s14, s33
 ; GFX9-NEXT:    v_mov_b32_e32 v31, v40
-; GFX9-NEXT:    s_mov_b64 s[2:3], s[50:51]
+; GFX9-NEXT:    s_mov_b64 s[2:3], s[66:67]
 ; GFX9-NEXT:    s_movk_i32 s32, 0x800
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[16:17]
@@ -7737,12 +7737,12 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau
 ; GFX9-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
 ; GFX9-NEXT:    s_cbranch_execz .LBB11_5
 ; GFX9-NEXT:  ; %bb.3:
-; GFX9-NEXT:    s_load_dwordx2 s[44:45], s[36:37], 0x24
+; GFX9-NEXT:    s_load_dwordx2 s[52:53], s[36:37], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    v_max_f64 v[41:42], v[2:3], v[2:3]
-; GFX9-NEXT:    s_mov_b64 s[46:47], 0
+; GFX9-NEXT:    s_mov_b64 s[54:55], 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dwordx2 v[4:5], v0, s[44:45]
+; GFX9-NEXT:    global_load_dwordx2 v[4:5], v0, s[52:53]
 ; GFX9-NEXT:  .LBB11_4: ; %atomicrmw.start
 ; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
@@ -7753,54 +7753,54 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau
 ; GFX9-NEXT:    s_add_u32 s0, s0, __atomic_compare_exchange at gotpcrel32@lo+4
 ; GFX9-NEXT:    s_addc_u32 s1, s1, __atomic_compare_exchange at gotpcrel32@hi+12
 ; GFX9-NEXT:    s_load_dwordx2 s[16:17], s[0:1], 0x0
-; GFX9-NEXT:    s_mov_b64 s[0:1], s[48:49]
-; GFX9-NEXT:    buffer_store_dword v5, off, s[48:51], 0 offset:4
-; GFX9-NEXT:    buffer_store_dword v4, off, s[48:51], 0
-; GFX9-NEXT:    s_mov_b64 s[4:5], s[40:41]
+; GFX9-NEXT:    s_mov_b64 s[0:1], s[64:65]
+; GFX9-NEXT:    buffer_store_dword v5, off, s[64:67], 0 offset:4
+; GFX9-NEXT:    buffer_store_dword v4, off, s[64:67], 0
+; GFX9-NEXT:    s_mov_b64 s[4:5], s[48:49]
 ; GFX9-NEXT:    v_max_f64 v[0:1], v[0:1], v[41:42]
 ; GFX9-NEXT:    s_mov_b64 s[6:7], s[38:39]
 ; GFX9-NEXT:    s_mov_b64 s[10:11], s[34:35]
-; GFX9-NEXT:    s_mov_b32 s12, s43
-; GFX9-NEXT:    s_mov_b32 s13, s42
+; GFX9-NEXT:    s_mov_b32 s12, s51
+; GFX9-NEXT:    s_mov_b32 s13, s50
 ; GFX9-NEXT:    s_mov_b32 s14, s33
 ; GFX9-NEXT:    v_mov_b32_e32 v31, v40
-; GFX9-NEXT:    s_mov_b64 s[2:3], s[50:51]
-; GFX9-NEXT:    buffer_store_dword v1, off, s[48:51], 0 offset:12
-; GFX9-NEXT:    buffer_store_dword v0, off, s[48:51], 0 offset:8
+; GFX9-NEXT:    s_mov_b64 s[2:3], s[66:67]
+; GFX9-NEXT:    buffer_store_dword v1, off, s[64:67], 0 offset:12
+; GFX9-NEXT:    buffer_store_dword v0, off, s[64:67], 0 offset:8
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 8
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0
-; GFX9-NEXT:    v_mov_b32_e32 v2, s44
-; GFX9-NEXT:    v_mov_b32_e32 v3, s45
+; GFX9-NEXT:    v_mov_b32_e32 v2, s52
+; GFX9-NEXT:    v_mov_b32_e32 v3, s53
 ; GFX9-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v5, 8
 ; GFX9-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v7, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[16:17]
-; GFX9-NEXT:    buffer_load_dword v4, off, s[48:51], 0
-; GFX9-NEXT:    buffer_load_dword v5, off, s[48:51], 0 offset:4
+; GFX9-NEXT:    buffer_load_dword v4, off, s[64:67], 0
+; GFX9-NEXT:    buffer_load_dword v5, off, s[64:67], 0 offset:4
 ; GFX9-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX9-NEXT:    s_or_b64 s[46:47], vcc, s[46:47]
-; GFX9-NEXT:    s_andn2_b64 exec, exec, s[46:47]
+; GFX9-NEXT:    s_or_b64 s[54:55], vcc, s[54:55]
+; GFX9-NEXT:    s_andn2_b64 exec, exec, s[54:55]
 ; GFX9-NEXT:    s_cbranch_execnz .LBB11_4
 ; GFX9-NEXT:  .LBB11_5:
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX1064-LABEL: global_atomic_fmax_double_uni_address_div_value_default_scope_unsafe:
 ; GFX1064:       ; %bb.0:
-; GFX1064-NEXT:    s_mov_b32 s48, SCRATCH_RSRC_DWORD0
-; GFX1064-NEXT:    s_mov_b32 s49, SCRATCH_RSRC_DWORD1
-; GFX1064-NEXT:    s_mov_b32 s50, -1
-; GFX1064-NEXT:    s_mov_b32 s51, 0x31e16000
-; GFX1064-NEXT:    s_add_u32 s48, s48, s11
+; GFX1064-NEXT:    s_mov_b32 s64, SCRATCH_RSRC_DWORD0
+; GFX1064-NEXT:    s_mov_b32 s65, SCRATCH_RSRC_DWORD1
+; GFX1064-NEXT:    s_mov_b32 s66, -1
+; GFX1064-NEXT:    s_mov_b32 s67, 0x31e16000
+; GFX1064-NEXT:    s_add_u32 s64, s64, s11
 ; GFX1064-NEXT:    s_mov_b64 s[34:35], s[4:5]
-; GFX1064-NEXT:    s_addc_u32 s49, s49, 0
-; GFX1064-NEXT:    s_mov_b32 s43, s8
+; GFX1064-NEXT:    s_addc_u32 s65, s65, 0
+; GFX1064-NEXT:    s_mov_b32 s51, s8
 ; GFX1064-NEXT:    s_add_u32 s8, s34, 44
-; GFX1064-NEXT:    s_mov_b32 s42, s9
+; GFX1064-NEXT:    s_mov_b32 s50, s9
 ; GFX1064-NEXT:    s_addc_u32 s9, s35, 0
-; GFX1064-NEXT:    s_mov_b64 s[40:41], s[0:1]
+; GFX1064-NEXT:    s_mov_b64 s[48:49], s[0:1]
 ; GFX1064-NEXT:    s_getpc_b64 s[0:1]
 ; GFX1064-NEXT:    s_add_u32 s0, s0, div.double.value at gotpcrel32@lo+4
 ; GFX1064-NEXT:    s_addc_u32 s1, s1, div.double.value at gotpcrel32@hi+12
@@ -7812,14 +7812,14 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau
 ; GFX1064-NEXT:    s_mov_b64 s[38:39], s[2:3]
 ; GFX1064-NEXT:    s_mov_b64 s[6:7], s[2:3]
 ; GFX1064-NEXT:    v_or3_b32 v40, v0, v1, v2
-; GFX1064-NEXT:    s_mov_b64 s[0:1], s[48:49]
-; GFX1064-NEXT:    s_mov_b64 s[4:5], s[40:41]
+; GFX1064-NEXT:    s_mov_b64 s[0:1], s[64:65]
+; GFX1064-NEXT:    s_mov_b64 s[4:5], s[48:49]
 ; GFX1064-NEXT:    s_mov_b64 s[10:11], s[36:37]
-; GFX1064-NEXT:    s_mov_b32 s12, s43
+; GFX1064-NEXT:    s_mov_b32 s12, s51
 ; GFX1064-NEXT:    v_mov_b32_e32 v31, v40
-; GFX1064-NEXT:    s_mov_b32 s13, s42
+; GFX1064-NEXT:    s_mov_b32 s13, s50
 ; GFX1064-NEXT:    s_mov_b32 s14, s33
-; GFX1064-NEXT:    s_mov_b64 s[2:3], s[50:51]
+; GFX1064-NEXT:    s_mov_b64 s[2:3], s[66:67]
 ; GFX1064-NEXT:    s_movk_i32 s32, 0x800
 ; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1064-NEXT:    s_swappc_b64 s[30:31], s[16:17]
@@ -7846,12 +7846,12 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau
 ; GFX1064-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
 ; GFX1064-NEXT:    s_cbranch_execz .LBB11_5
 ; GFX1064-NEXT:  ; %bb.3:
-; GFX1064-NEXT:    s_load_dwordx2 s[44:45], s[34:35], 0x24
+; GFX1064-NEXT:    s_load_dwordx2 s[52:53], s[34:35], 0x24
 ; GFX1064-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1064-NEXT:    v_max_f64 v[41:42], v[2:3], v[2:3]
-; GFX1064-NEXT:    s_mov_b64 s[46:47], 0
+; GFX1064-NEXT:    s_mov_b64 s[54:55], 0
 ; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT:    global_load_dwordx2 v[4:5], v0, s[44:45]
+; GFX1064-NEXT:    global_load_dwordx2 v[4:5], v0, s[52:53]
 ; GFX1064-NEXT:  .LBB11_4: ; %atomicrmw.start
 ; GFX1064-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1064-NEXT:    s_waitcnt vmcnt(0)
@@ -7861,56 +7861,56 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau
 ; GFX1064-NEXT:    s_getpc_b64 s[0:1]
 ; GFX1064-NEXT:    s_add_u32 s0, s0, __atomic_compare_exchange at gotpcrel32@lo+4
 ; GFX1064-NEXT:    s_addc_u32 s1, s1, __atomic_compare_exchange at gotpcrel32@hi+12
-; GFX1064-NEXT:    buffer_store_dword v5, off, s[48:51], 0 offset:4
-; GFX1064-NEXT:    buffer_store_dword v4, off, s[48:51], 0
+; GFX1064-NEXT:    buffer_store_dword v5, off, s[64:67], 0 offset:4
+; GFX1064-NEXT:    buffer_store_dword v4, off, s[64:67], 0
 ; GFX1064-NEXT:    s_load_dwordx2 s[16:17], s[0:1], 0x0
 ; GFX1064-NEXT:    v_mov_b32_e32 v31, v40
-; GFX1064-NEXT:    v_mov_b32_e32 v2, s44
-; GFX1064-NEXT:    v_mov_b32_e32 v3, s45
+; GFX1064-NEXT:    v_mov_b32_e32 v2, s52
+; GFX1064-NEXT:    v_mov_b32_e32 v3, s53
 ; GFX1064-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX1064-NEXT:    v_mov_b32_e32 v5, 8
 ; GFX1064-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX1064-NEXT:    v_mov_b32_e32 v7, 0
-; GFX1064-NEXT:    s_mov_b64 s[0:1], s[48:49]
-; GFX1064-NEXT:    s_mov_b64 s[4:5], s[40:41]
+; GFX1064-NEXT:    s_mov_b64 s[0:1], s[64:65]
+; GFX1064-NEXT:    s_mov_b64 s[4:5], s[48:49]
 ; GFX1064-NEXT:    s_mov_b64 s[6:7], s[38:39]
 ; GFX1064-NEXT:    s_mov_b64 s[10:11], s[36:37]
-; GFX1064-NEXT:    s_mov_b32 s12, s43
-; GFX1064-NEXT:    s_mov_b32 s13, s42
+; GFX1064-NEXT:    s_mov_b32 s12, s51
+; GFX1064-NEXT:    s_mov_b32 s13, s50
 ; GFX1064-NEXT:    s_mov_b32 s14, s33
-; GFX1064-NEXT:    s_mov_b64 s[2:3], s[50:51]
+; GFX1064-NEXT:    s_mov_b64 s[2:3], s[66:67]
 ; GFX1064-NEXT:    v_max_f64 v[0:1], v[0:1], v[41:42]
-; GFX1064-NEXT:    buffer_store_dword v1, off, s[48:51], 0 offset:12
-; GFX1064-NEXT:    buffer_store_dword v0, off, s[48:51], 0 offset:8
+; GFX1064-NEXT:    buffer_store_dword v1, off, s[64:67], 0 offset:12
+; GFX1064-NEXT:    buffer_store_dword v0, off, s[64:67], 0 offset:8
 ; GFX1064-NEXT:    v_mov_b32_e32 v0, 8
 ; GFX1064-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1064-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GFX1064-NEXT:    s_clause 0x1
-; GFX1064-NEXT:    buffer_load_dword v4, off, s[48:51], 0
-; GFX1064-NEXT:    buffer_load_dword v5, off, s[48:51], 0 offset:4
+; GFX1064-NEXT:    buffer_load_dword v4, off, s[64:67], 0
+; GFX1064-NEXT:    buffer_load_dword v5, off, s[64:67], 0 offset:4
 ; GFX1064-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX1064-NEXT:    s_or_b64 s[46:47], vcc, s[46:47]
-; GFX1064-NEXT:    s_andn2_b64 exec, exec, s[46:47]
+; GFX1064-NEXT:    s_or_b64 s[54:55], vcc, s[54:55]
+; GFX1064-NEXT:    s_andn2_b64 exec, exec, s[54:55]
 ; GFX1064-NEXT:    s_cbranch_execnz .LBB11_4
 ; GFX1064-NEXT:  .LBB11_5:
 ; GFX1064-NEXT:    s_endpgm
 ;
 ; GFX1032-LABEL: global_atomic_fmax_double_uni_address_div_value_default_scope_unsafe:
 ; GFX1032:       ; %bb.0:
-; GFX1032-NEXT:    s_mov_b32 s48, SCRATCH_RSRC_DWORD0
-; GFX1032-NEXT:    s_mov_b32 s49, SCRATCH_RSRC_DWORD1
-; GFX1032-NEXT:    s_mov_b32 s50, -1
-; GFX1032-NEXT:    s_mov_b32 s51, 0x31c16000
-; GFX1032-NEXT:    s_add_u32 s48, s48, s11
+; GFX1032-NEXT:    s_mov_b32 s64, SCRATCH_RSRC_DWORD0
+; GFX1032-NEXT:    s_mov_b32 s65, SCRATCH_RSRC_DWORD1
+; GFX1032-NEXT:    s_mov_b32 s66, -1
+; GFX1032-NEXT:    s_mov_b32 s67, 0x31c16000
+; GFX1032-NEXT:    s_add_u32 s64, s64, s11
 ; GFX1032-NEXT:    s_mov_b64 s[34:35], s[4:5]
-; GFX1032-NEXT:    s_addc_u32 s49, s49, 0
-; GFX1032-NEXT:    s_mov_b32 s43, s8
+; GFX1032-NEXT:    s_addc_u32 s65, s65, 0
+; GFX1032-NEXT:    s_mov_b32 s51, s8
 ; GFX1032-NEXT:    s_add_u32 s8, s34, 44
-; GFX1032-NEXT:    s_mov_b32 s42, s9
+; GFX1032-NEXT:    s_mov_b32 s50, s9
 ; GFX1032-NEXT:    s_addc_u32 s9, s35, 0
-; GFX1032-NEXT:    s_mov_b64 s[40:41], s[0:1]
+; GFX1032-NEXT:    s_mov_b64 s[48:49], s[0:1]
 ; GFX1032-NEXT:    s_getpc_b64 s[0:1]
 ; GFX1032-NEXT:    s_add_u32 s0, s0, div.double.value at gotpcrel32@lo+4
 ; GFX1032-NEXT:    s_addc_u32 s1, s1, div.double.value at gotpcrel32@hi+12
@@ -7922,14 +7922,14 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau
 ; GFX1032-NEXT:    s_mov_b64 s[38:39], s[2:3]
 ; GFX1032-NEXT:    s_mov_b64 s[6:7], s[2:3]
 ; GFX1032-NEXT:    v_or3_b32 v40, v0, v1, v2
-; GFX1032-NEXT:    s_mov_b64 s[0:1], s[48:49]
-; GFX1032-NEXT:    s_mov_b64 s[4:5], s[40:41]
+; GFX1032-NEXT:    s_mov_b64 s[0:1], s[64:65]
+; GFX1032-NEXT:    s_mov_b64 s[4:5], s[48:49]
 ; GFX1032-NEXT:    s_mov_b64 s[10:11], s[36:37]
-; GFX1032-NEXT:    s_mov_b32 s12, s43
+; GFX1032-NEXT:    s_mov_b32 s12, s51
 ; GFX1032-NEXT:    v_mov_b32_e32 v31, v40
-; GFX1032-NEXT:    s_mov_b32 s13, s42
+; GFX1032-NEXT:    s_mov_b32 s13, s50
 ; GFX1032-NEXT:    s_mov_b32 s14, s33
-; GFX1032-NEXT:    s_mov_b64 s[2:3], s[50:51]
+; GFX1032-NEXT:    s_mov_b64 s[2:3], s[66:67]
 ; GFX1032-NEXT:    s_movk_i32 s32, 0x400
 ; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1032-NEXT:    s_swappc_b64 s[30:31], s[16:17]
@@ -7950,17 +7950,17 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau
 ; GFX1032-NEXT:    s_cbranch_scc1 .LBB11_1
 ; GFX1032-NEXT:  ; %bb.2: ; %ComputeEnd
 ; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1032-NEXT:    s_mov_b32 s46, 0
+; GFX1032-NEXT:    s_mov_b32 s54, 0
 ; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX1032-NEXT:    s_and_saveexec_b32 s0, vcc_lo
 ; GFX1032-NEXT:    s_xor_b32 s0, exec_lo, s0
 ; GFX1032-NEXT:    s_cbranch_execz .LBB11_5
 ; GFX1032-NEXT:  ; %bb.3:
-; GFX1032-NEXT:    s_load_dwordx2 s[44:45], s[34:35], 0x24
+; GFX1032-NEXT:    s_load_dwordx2 s[52:53], s[34:35], 0x24
 ; GFX1032-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1032-NEXT:    v_max_f64 v[41:42], v[2:3], v[2:3]
 ; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT:    global_load_dwordx2 v[4:5], v0, s[44:45]
+; GFX1032-NEXT:    global_load_dwordx2 v[4:5], v0, s[52:53]
 ; GFX1032-NEXT:  .LBB11_4: ; %atomicrmw.start
 ; GFX1032-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1032-NEXT:    s_waitcnt vmcnt(0)
@@ -7970,38 +7970,38 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau
 ; GFX1032-NEXT:    s_getpc_b64 s[0:1]
 ; GFX1032-NEXT:    s_add_u32 s0, s0, __atomic_compare_exchange at gotpcrel32@lo+4
 ; GFX1032-NEXT:    s_addc_u32 s1, s1, __atomic_compare_exchange at gotpcrel32@hi+12
-; GFX1032-NEXT:    buffer_store_dword v5, off, s[48:51], 0 offset:4
-; GFX1032-NEXT:    buffer_store_dword v4, off, s[48:51], 0
+; GFX1032-NEXT:    buffer_store_dword v5, off, s[64:67], 0 offset:4
+; GFX1032-NEXT:    buffer_store_dword v4, off, s[64:67], 0
 ; GFX1032-NEXT:    s_load_dwordx2 s[16:17], s[0:1], 0x0
 ; GFX1032-NEXT:    v_mov_b32_e32 v31, v40
-; GFX1032-NEXT:    v_mov_b32_e32 v2, s44
-; GFX1032-NEXT:    v_mov_b32_e32 v3, s45
+; GFX1032-NEXT:    v_mov_b32_e32 v2, s52
+; GFX1032-NEXT:    v_mov_b32_e32 v3, s53
 ; GFX1032-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX1032-NEXT:    v_mov_b32_e32 v5, 8
 ; GFX1032-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX1032-NEXT:    v_mov_b32_e32 v7, 0
-; GFX1032-NEXT:    s_mov_b64 s[0:1], s[48:49]
-; GFX1032-NEXT:    s_mov_b64 s[4:5], s[40:41]
+; GFX1032-NEXT:    s_mov_b64 s[0:1], s[64:65]
+; GFX1032-NEXT:    s_mov_b64 s[4:5], s[48:49]
 ; GFX1032-NEXT:    s_mov_b64 s[6:7], s[38:39]
 ; GFX1032-NEXT:    s_mov_b64 s[10:11], s[36:37]
-; GFX1032-NEXT:    s_mov_b32 s12, s43
-; GFX1032-NEXT:    s_mov_b32 s13, s42
+; GFX1032-NEXT:    s_mov_b32 s12, s51
+; GFX1032-NEXT:    s_mov_b32 s13, s50
 ; GFX1032-NEXT:    s_mov_b32 s14, s33
-; GFX1032-NEXT:    s_mov_b64 s[2:3], s[50:51]
+; GFX1032-NEXT:    s_mov_b64 s[2:3], s[66:67]
 ; GFX1032-NEXT:    v_max_f64 v[0:1], v[0:1], v[41:42]
-; GFX1032-NEXT:    buffer_store_dword v1, off, s[48:51], 0 offset:12
-; GFX1032-NEXT:    buffer_store_dword v0, off, s[48:51], 0 offset:8
+; GFX1032-NEXT:    buffer_store_dword v1, off, s[64:67], 0 offset:12
+; GFX1032-NEXT:    buffer_store_dword v0, off, s[64:67], 0 offset:8
 ; GFX1032-NEXT:    v_mov_b32_e32 v0, 8
 ; GFX1032-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1032-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GFX1032-NEXT:    s_clause 0x1
-; GFX1032-NEXT:    buffer_load_dword v4, off, s[48:51], 0
-; GFX1032-NEXT:    buffer_load_dword v5, off, s[48:51], 0 offset:4
+; GFX1032-NEXT:    buffer_load_dword v4, off, s[64:67], 0
+; GFX1032-NEXT:    buffer_load_dword v5, off, s[64:67], 0 offset:4
 ; GFX1032-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX1032-NEXT:    s_or_b32 s46, vcc_lo, s46
-; GFX1032-NEXT:    s_andn2_b32 exec_lo, exec_lo, s46
+; GFX1032-NEXT:    s_or_b32 s54, vcc_lo, s54
+; GFX1032-NEXT:    s_andn2_b32 exec_lo, exec_lo, s54
 ; GFX1032-NEXT:    s_cbranch_execnz .LBB11_4
 ; GFX1032-NEXT:  .LBB11_5:
 ; GFX1032-NEXT:    s_endpgm
@@ -8009,11 +8009,11 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau
 ; GFX1164-LABEL: global_atomic_fmax_double_uni_address_div_value_default_scope_unsafe:
 ; GFX1164:       ; %bb.0:
 ; GFX1164-NEXT:    s_mov_b64 s[34:35], s[4:5]
-; GFX1164-NEXT:    s_mov_b32 s43, s8
+; GFX1164-NEXT:    s_mov_b32 s51, s8
 ; GFX1164-NEXT:    s_add_u32 s8, s34, 44
-; GFX1164-NEXT:    s_mov_b32 s42, s9
+; GFX1164-NEXT:    s_mov_b32 s50, s9
 ; GFX1164-NEXT:    s_addc_u32 s9, s35, 0
-; GFX1164-NEXT:    s_mov_b64 s[40:41], s[0:1]
+; GFX1164-NEXT:    s_mov_b64 s[48:49], s[0:1]
 ; GFX1164-NEXT:    s_getpc_b64 s[0:1]
 ; GFX1164-NEXT:    s_add_u32 s0, s0, div.double.value at gotpcrel32@lo+4
 ; GFX1164-NEXT:    s_addc_u32 s1, s1, div.double.value at gotpcrel32@hi+12
@@ -8021,11 +8021,11 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau
 ; GFX1164-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
 ; GFX1164-NEXT:    s_mov_b32 s33, s10
 ; GFX1164-NEXT:    s_mov_b64 s[36:37], s[6:7]
-; GFX1164-NEXT:    s_mov_b64 s[4:5], s[40:41]
+; GFX1164-NEXT:    s_mov_b64 s[4:5], s[48:49]
 ; GFX1164-NEXT:    s_mov_b64 s[6:7], s[2:3]
 ; GFX1164-NEXT:    s_mov_b64 s[10:11], s[36:37]
-; GFX1164-NEXT:    s_mov_b32 s12, s43
-; GFX1164-NEXT:    s_mov_b32 s13, s42
+; GFX1164-NEXT:    s_mov_b32 s12, s51
+; GFX1164-NEXT:    s_mov_b32 s13, s50
 ; GFX1164-NEXT:    s_mov_b32 s14, s33
 ; GFX1164-NEXT:    s_mov_b32 s32, 32
 ; GFX1164-NEXT:    v_mov_b32_e32 v40, v0
@@ -8059,12 +8059,12 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau
 ; GFX1164-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
 ; GFX1164-NEXT:    s_cbranch_execz .LBB11_5
 ; GFX1164-NEXT:  ; %bb.3:
-; GFX1164-NEXT:    s_load_b64 s[44:45], s[34:35], 0x24
+; GFX1164-NEXT:    s_load_b64 s[52:53], s[34:35], 0x24
 ; GFX1164-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1164-NEXT:    v_max_f64 v[41:42], v[2:3], v[2:3]
-; GFX1164-NEXT:    s_mov_b64 s[46:47], 0
+; GFX1164-NEXT:    s_mov_b64 s[54:55], 0
 ; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT:    global_load_b64 v[4:5], v0, s[44:45]
+; GFX1164-NEXT:    global_load_b64 v[4:5], v0, s[52:53]
 ; GFX1164-NEXT:    s_set_inst_prefetch_distance 0x1
 ; GFX1164-NEXT:    .p2align 6
 ; GFX1164-NEXT:  .LBB11_4: ; %atomicrmw.start
@@ -8078,15 +8078,15 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau
 ; GFX1164-NEXT:    s_addc_u32 s1, s1, __atomic_compare_exchange at gotpcrel32@hi+12
 ; GFX1164-NEXT:    v_mov_b32_e32 v31, v40
 ; GFX1164-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
-; GFX1164-NEXT:    v_mov_b32_e32 v2, s44
-; GFX1164-NEXT:    v_mov_b32_e32 v3, s45
+; GFX1164-NEXT:    v_mov_b32_e32 v2, s52
+; GFX1164-NEXT:    v_mov_b32_e32 v3, s53
 ; GFX1164-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX1164-NEXT:    v_mov_b32_e32 v7, 0
-; GFX1164-NEXT:    s_mov_b64 s[4:5], s[40:41]
+; GFX1164-NEXT:    s_mov_b64 s[4:5], s[48:49]
 ; GFX1164-NEXT:    s_mov_b64 s[6:7], s[38:39]
 ; GFX1164-NEXT:    s_mov_b64 s[10:11], s[36:37]
-; GFX1164-NEXT:    s_mov_b32 s12, s43
-; GFX1164-NEXT:    s_mov_b32 s13, s42
+; GFX1164-NEXT:    s_mov_b32 s12, s51
+; GFX1164-NEXT:    s_mov_b32 s13, s50
 ; GFX1164-NEXT:    s_mov_b32 s14, s33
 ; GFX1164-NEXT:    v_max_f64 v[0:1], v[0:1], v[41:42]
 ; GFX1164-NEXT:    scratch_store_b64 off, v[4:5], off
@@ -8101,8 +8101,8 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau
 ; GFX1164-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
 ; GFX1164-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX1164-NEXT:    s_or_b64 s[46:47], vcc, s[46:47]
-; GFX1164-NEXT:    s_and_not1_b64 exec, exec, s[46:47]
+; GFX1164-NEXT:    s_or_b64 s[54:55], vcc, s[54:55]
+; GFX1164-NEXT:    s_and_not1_b64 exec, exec, s[54:55]
 ; GFX1164-NEXT:    s_cbranch_execnz .LBB11_4
 ; GFX1164-NEXT:  .LBB11_5:
 ; GFX1164-NEXT:    s_set_inst_prefetch_distance 0x2
@@ -8111,7 +8111,7 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau
 ; GFX1132-LABEL: global_atomic_fmax_double_uni_address_div_value_default_scope_unsafe:
 ; GFX1132:       ; %bb.0:
 ; GFX1132-NEXT:    s_mov_b64 s[34:35], s[4:5]
-; GFX1132-NEXT:    s_mov_b64 s[40:41], s[0:1]
+; GFX1132-NEXT:    s_mov_b64 s[48:49], s[0:1]
 ; GFX1132-NEXT:    s_add_u32 s8, s34, 44
 ; GFX1132-NEXT:    s_addc_u32 s9, s35, 0
 ; GFX1132-NEXT:    s_getpc_b64 s[0:1]
@@ -8120,9 +8120,9 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau
 ; GFX1132-NEXT:    v_mov_b32_e32 v31, v0
 ; GFX1132-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
 ; GFX1132-NEXT:    s_mov_b64 s[36:37], s[6:7]
-; GFX1132-NEXT:    s_mov_b32 s42, s14
-; GFX1132-NEXT:    s_mov_b32 s43, s13
-; GFX1132-NEXT:    s_mov_b64 s[4:5], s[40:41]
+; GFX1132-NEXT:    s_mov_b32 s50, s14
+; GFX1132-NEXT:    s_mov_b32 s51, s13
+; GFX1132-NEXT:    s_mov_b64 s[4:5], s[48:49]
 ; GFX1132-NEXT:    s_mov_b64 s[6:7], s[2:3]
 ; GFX1132-NEXT:    s_mov_b64 s[10:11], s[36:37]
 ; GFX1132-NEXT:    s_mov_b32 s12, s13
@@ -8154,19 +8154,19 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau
 ; GFX1132-NEXT:    s_cbranch_scc1 .LBB11_1
 ; GFX1132-NEXT:  ; %bb.2: ; %ComputeEnd
 ; GFX1132-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1132-NEXT:    s_mov_b32 s46, 0
+; GFX1132-NEXT:    s_mov_b32 s54, 0
 ; GFX1132-NEXT:    s_mov_b32 s0, exec_lo
 ; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1132-NEXT:    v_cmpx_eq_u32_e32 0, v0
 ; GFX1132-NEXT:    s_xor_b32 s0, exec_lo, s0
 ; GFX1132-NEXT:    s_cbranch_execz .LBB11_5
 ; GFX1132-NEXT:  ; %bb.3:
-; GFX1132-NEXT:    s_load_b64 s[44:45], s[34:35], 0x24
+; GFX1132-NEXT:    s_load_b64 s[52:53], s[34:35], 0x24
 ; GFX1132-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_4)
 ; GFX1132-NEXT:    v_max_f64 v[41:42], v[2:3], v[2:3]
 ; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT:    global_load_b64 v[4:5], v0, s[44:45]
+; GFX1132-NEXT:    global_load_b64 v[4:5], v0, s[52:53]
 ; GFX1132-NEXT:    s_set_inst_prefetch_distance 0x1
 ; GFX1132-NEXT:    .p2align 6
 ; GFX1132-NEXT:  .LBB11_4: ; %atomicrmw.start
@@ -8180,16 +8180,16 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau
 ; GFX1132-NEXT:    s_addc_u32 s1, s1, __atomic_compare_exchange at gotpcrel32@hi+12
 ; GFX1132-NEXT:    v_mov_b32_e32 v31, v40
 ; GFX1132-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
-; GFX1132-NEXT:    v_mov_b32_e32 v3, s45
+; GFX1132-NEXT:    v_mov_b32_e32 v3, s53
 ; GFX1132-NEXT:    v_mov_b32_e32 v7, 0
-; GFX1132-NEXT:    s_mov_b64 s[4:5], s[40:41]
+; GFX1132-NEXT:    s_mov_b64 s[4:5], s[48:49]
 ; GFX1132-NEXT:    s_mov_b64 s[6:7], s[38:39]
 ; GFX1132-NEXT:    s_mov_b64 s[10:11], s[36:37]
-; GFX1132-NEXT:    s_mov_b32 s12, s43
-; GFX1132-NEXT:    s_mov_b32 s13, s42
+; GFX1132-NEXT:    s_mov_b32 s12, s51
+; GFX1132-NEXT:    s_mov_b32 s13, s50
 ; GFX1132-NEXT:    s_mov_b32 s14, s33
 ; GFX1132-NEXT:    v_mov_b32_e32 v6, 0
-; GFX1132-NEXT:    v_mov_b32_e32 v2, s44
+; GFX1132-NEXT:    v_mov_b32_e32 v2, s52
 ; GFX1132-NEXT:    v_max_f64 v[0:1], v[0:1], v[41:42]
 ; GFX1132-NEXT:    scratch_store_b64 off, v[4:5], off
 ; GFX1132-NEXT:    v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v5, 8
@@ -8201,8 +8201,8 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau
 ; GFX1132-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
 ; GFX1132-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX1132-NEXT:    s_or_b32 s46, vcc_lo, s46
-; GFX1132-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s46
+; GFX1132-NEXT:    s_or_b32 s54, vcc_lo, s54
+; GFX1132-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s54
 ; GFX1132-NEXT:    s_cbranch_execnz .LBB11_4
 ; GFX1132-NEXT:  .LBB11_5:
 ; GFX1132-NEXT:    s_set_inst_prefetch_distance 0x2
@@ -8211,22 +8211,22 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau
 ; GFX7LESS-DPP-LABEL: global_atomic_fmax_double_uni_address_div_value_default_scope_unsafe:
 ; GFX7LESS-DPP:       ; %bb.0:
 ; GFX7LESS-DPP-NEXT:    s_movk_i32 s32, 0x800
-; GFX7LESS-DPP-NEXT:    s_mov_b32 s48, SCRATCH_RSRC_DWORD0
-; GFX7LESS-DPP-NEXT:    s_mov_b32 s49, SCRATCH_RSRC_DWORD1
-; GFX7LESS-DPP-NEXT:    s_mov_b32 s50, -1
-; GFX7LESS-DPP-NEXT:    s_mov_b32 s51, 0xe8f000
-; GFX7LESS-DPP-NEXT:    s_add_u32 s48, s48, s11
-; GFX7LESS-DPP-NEXT:    s_addc_u32 s49, s49, 0
+; GFX7LESS-DPP-NEXT:    s_mov_b32 s64, SCRATCH_RSRC_DWORD0
+; GFX7LESS-DPP-NEXT:    s_mov_b32 s65, SCRATCH_RSRC_DWORD1
+; GFX7LESS-DPP-NEXT:    s_mov_b32 s66, -1
+; GFX7LESS-DPP-NEXT:    s_mov_b32 s67, 0xe8f000
+; GFX7LESS-DPP-NEXT:    s_add_u32 s64, s64, s11
+; GFX7LESS-DPP-NEXT:    s_addc_u32 s65, s65, 0
 ; GFX7LESS-DPP-NEXT:    s_mov_b32 s33, s10
-; GFX7LESS-DPP-NEXT:    s_mov_b32 s42, s9
-; GFX7LESS-DPP-NEXT:    s_mov_b32 s43, s8
+; GFX7LESS-DPP-NEXT:    s_mov_b32 s50, s9
+; GFX7LESS-DPP-NEXT:    s_mov_b32 s51, s8
 ; GFX7LESS-DPP-NEXT:    s_mov_b64 s[34:35], s[6:7]
 ; GFX7LESS-DPP-NEXT:    s_mov_b64 s[36:37], s[4:5]
 ; GFX7LESS-DPP-NEXT:    s_mov_b64 s[38:39], s[2:3]
-; GFX7LESS-DPP-NEXT:    s_mov_b64 s[40:41], s[0:1]
-; GFX7LESS-DPP-NEXT:    s_load_dwordx2 s[44:45], s[4:5], 0x9
-; GFX7LESS-DPP-NEXT:    s_mov_b32 s47, 0xf000
-; GFX7LESS-DPP-NEXT:    s_mov_b32 s46, -1
+; GFX7LESS-DPP-NEXT:    s_mov_b64 s[48:49], s[0:1]
+; GFX7LESS-DPP-NEXT:    s_load_dwordx2 s[52:53], s[4:5], 0x9
+; GFX7LESS-DPP-NEXT:    s_mov_b32 s55, 0xf000
+; GFX7LESS-DPP-NEXT:    s_mov_b32 s54, -1
 ; GFX7LESS-DPP-NEXT:    s_add_u32 s8, s36, 44
 ; GFX7LESS-DPP-NEXT:    s_addc_u32 s9, s37, 0
 ; GFX7LESS-DPP-NEXT:    s_getpc_b64 s[0:1]
@@ -8237,34 +8237,34 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau
 ; GFX7LESS-DPP-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
 ; GFX7LESS-DPP-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GFX7LESS-DPP-NEXT:    v_or_b32_e32 v40, v0, v2
-; GFX7LESS-DPP-NEXT:    s_mov_b64 s[4:5], s[40:41]
+; GFX7LESS-DPP-NEXT:    s_mov_b64 s[4:5], s[48:49]
 ; GFX7LESS-DPP-NEXT:    s_mov_b64 s[6:7], s[2:3]
 ; GFX7LESS-DPP-NEXT:    s_mov_b64 s[10:11], s[34:35]
-; GFX7LESS-DPP-NEXT:    s_mov_b32 s12, s43
-; GFX7LESS-DPP-NEXT:    s_mov_b32 s13, s42
+; GFX7LESS-DPP-NEXT:    s_mov_b32 s12, s51
+; GFX7LESS-DPP-NEXT:    s_mov_b32 s13, s50
 ; GFX7LESS-DPP-NEXT:    s_mov_b32 s14, s33
 ; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v31, v40
-; GFX7LESS-DPP-NEXT:    s_mov_b64 s[0:1], s[48:49]
-; GFX7LESS-DPP-NEXT:    s_mov_b64 s[2:3], s[50:51]
+; GFX7LESS-DPP-NEXT:    s_mov_b64 s[0:1], s[64:65]
+; GFX7LESS-DPP-NEXT:    s_mov_b64 s[2:3], s[66:67]
 ; GFX7LESS-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7LESS-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
-; GFX7LESS-DPP-NEXT:    buffer_load_dwordx2 v[2:3], off, s[44:47], 0
-; GFX7LESS-DPP-NEXT:    s_mov_b64 s[46:47], 0
+; GFX7LESS-DPP-NEXT:    buffer_load_dwordx2 v[2:3], off, s[52:55], 0
+; GFX7LESS-DPP-NEXT:    s_mov_b64 s[54:55], 0
 ; GFX7LESS-DPP-NEXT:    v_max_f64 v[41:42], v[0:1], v[0:1]
 ; GFX7LESS-DPP-NEXT:  .LBB11_1: ; %atomicrmw.start
 ; GFX7LESS-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX7LESS-DPP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7LESS-DPP-NEXT:    v_max_f64 v[0:1], v[2:3], v[2:3]
-; GFX7LESS-DPP-NEXT:    buffer_store_dword v3, off, s[48:51], 0 offset:4
-; GFX7LESS-DPP-NEXT:    buffer_store_dword v2, off, s[48:51], 0
+; GFX7LESS-DPP-NEXT:    buffer_store_dword v3, off, s[64:67], 0 offset:4
+; GFX7LESS-DPP-NEXT:    buffer_store_dword v2, off, s[64:67], 0
 ; GFX7LESS-DPP-NEXT:    s_add_u32 s8, s36, 44
 ; GFX7LESS-DPP-NEXT:    v_max_f64 v[0:1], v[0:1], v[41:42]
 ; GFX7LESS-DPP-NEXT:    s_addc_u32 s9, s37, 0
 ; GFX7LESS-DPP-NEXT:    s_getpc_b64 s[0:1]
 ; GFX7LESS-DPP-NEXT:    s_add_u32 s0, s0, __atomic_compare_exchange at gotpcrel32@lo+4
 ; GFX7LESS-DPP-NEXT:    s_addc_u32 s1, s1, __atomic_compare_exchange at gotpcrel32@hi+12
-; GFX7LESS-DPP-NEXT:    buffer_store_dword v1, off, s[48:51], 0 offset:12
-; GFX7LESS-DPP-NEXT:    buffer_store_dword v0, off, s[48:51], 0 offset:8
+; GFX7LESS-DPP-NEXT:    buffer_store_dword v1, off, s[64:67], 0 offset:12
+; GFX7LESS-DPP-NEXT:    buffer_store_dword v0, off, s[64:67], 0 offset:8
 ; GFX7LESS-DPP-NEXT:    s_load_dwordx2 s[16:17], s[0:1], 0x0
 ; GFX7LESS-DPP-NEXT:    s_waitcnt expcnt(0)
 ; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v0, 8
@@ -8273,43 +8273,43 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau
 ; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v5, 8
 ; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v7, 0
-; GFX7LESS-DPP-NEXT:    s_mov_b64 s[4:5], s[40:41]
+; GFX7LESS-DPP-NEXT:    s_mov_b64 s[4:5], s[48:49]
 ; GFX7LESS-DPP-NEXT:    s_mov_b64 s[6:7], s[38:39]
 ; GFX7LESS-DPP-NEXT:    s_mov_b64 s[10:11], s[34:35]
-; GFX7LESS-DPP-NEXT:    s_mov_b32 s12, s43
-; GFX7LESS-DPP-NEXT:    s_mov_b32 s13, s42
+; GFX7LESS-DPP-NEXT:    s_mov_b32 s12, s51
+; GFX7LESS-DPP-NEXT:    s_mov_b32 s13, s50
 ; GFX7LESS-DPP-NEXT:    s_mov_b32 s14, s33
 ; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v31, v40
-; GFX7LESS-DPP-NEXT:    s_mov_b64 s[0:1], s[48:49]
-; GFX7LESS-DPP-NEXT:    s_mov_b64 s[2:3], s[50:51]
-; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v2, s44
-; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v3, s45
+; GFX7LESS-DPP-NEXT:    s_mov_b64 s[0:1], s[64:65]
+; GFX7LESS-DPP-NEXT:    s_mov_b64 s[2:3], s[66:67]
+; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v2, s52
+; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v3, s53
 ; GFX7LESS-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7LESS-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GFX7LESS-DPP-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX7LESS-DPP-NEXT:    buffer_load_dword v2, off, s[48:51], 0
-; GFX7LESS-DPP-NEXT:    buffer_load_dword v3, off, s[48:51], 0 offset:4
+; GFX7LESS-DPP-NEXT:    buffer_load_dword v2, off, s[64:67], 0
+; GFX7LESS-DPP-NEXT:    buffer_load_dword v3, off, s[64:67], 0 offset:4
 ; GFX7LESS-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX7LESS-DPP-NEXT:    s_or_b64 s[46:47], vcc, s[46:47]
-; GFX7LESS-DPP-NEXT:    s_andn2_b64 exec, exec, s[46:47]
+; GFX7LESS-DPP-NEXT:    s_or_b64 s[54:55], vcc, s[54:55]
+; GFX7LESS-DPP-NEXT:    s_andn2_b64 exec, exec, s[54:55]
 ; GFX7LESS-DPP-NEXT:    s_cbranch_execnz .LBB11_1
 ; GFX7LESS-DPP-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX7LESS-DPP-NEXT:    s_endpgm
 ;
 ; GFX9-DPP-LABEL: global_atomic_fmax_double_uni_address_div_value_default_scope_unsafe:
 ; GFX9-DPP:       ; %bb.0:
-; GFX9-DPP-NEXT:    s_mov_b32 s52, SCRATCH_RSRC_DWORD0
-; GFX9-DPP-NEXT:    s_mov_b32 s53, SCRATCH_RSRC_DWORD1
-; GFX9-DPP-NEXT:    s_mov_b32 s54, -1
-; GFX9-DPP-NEXT:    s_mov_b32 s55, 0xe00000
-; GFX9-DPP-NEXT:    s_add_u32 s52, s52, s11
-; GFX9-DPP-NEXT:    s_addc_u32 s53, s53, 0
+; GFX9-DPP-NEXT:    s_mov_b32 s68, SCRATCH_RSRC_DWORD0
+; GFX9-DPP-NEXT:    s_mov_b32 s69, SCRATCH_RSRC_DWORD1
+; GFX9-DPP-NEXT:    s_mov_b32 s70, -1
+; GFX9-DPP-NEXT:    s_mov_b32 s71, 0xe00000
+; GFX9-DPP-NEXT:    s_add_u32 s68, s68, s11
+; GFX9-DPP-NEXT:    s_addc_u32 s69, s69, 0
 ; GFX9-DPP-NEXT:    s_mov_b64 s[36:37], s[4:5]
-; GFX9-DPP-NEXT:    s_mov_b32 s43, s8
+; GFX9-DPP-NEXT:    s_mov_b32 s51, s8
 ; GFX9-DPP-NEXT:    s_add_u32 s8, s36, 44
-; GFX9-DPP-NEXT:    s_mov_b32 s42, s9
+; GFX9-DPP-NEXT:    s_mov_b32 s50, s9
 ; GFX9-DPP-NEXT:    s_addc_u32 s9, s37, 0
-; GFX9-DPP-NEXT:    s_mov_b64 s[40:41], s[0:1]
+; GFX9-DPP-NEXT:    s_mov_b64 s[48:49], s[0:1]
 ; GFX9-DPP-NEXT:    s_getpc_b64 s[0:1]
 ; GFX9-DPP-NEXT:    s_add_u32 s0, s0, div.double.value at gotpcrel32@lo+4
 ; GFX9-DPP-NEXT:    s_addc_u32 s1, s1, div.double.value at gotpcrel32@hi+12
@@ -8321,14 +8321,14 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau
 ; GFX9-DPP-NEXT:    s_mov_b64 s[38:39], s[2:3]
 ; GFX9-DPP-NEXT:    v_or3_b32 v40, v0, v1, v2
 ; GFX9-DPP-NEXT:    s_mov_b64 s[6:7], s[2:3]
-; GFX9-DPP-NEXT:    s_mov_b64 s[0:1], s[52:53]
-; GFX9-DPP-NEXT:    s_mov_b64 s[4:5], s[40:41]
+; GFX9-DPP-NEXT:    s_mov_b64 s[0:1], s[68:69]
+; GFX9-DPP-NEXT:    s_mov_b64 s[4:5], s[48:49]
 ; GFX9-DPP-NEXT:    s_mov_b64 s[10:11], s[34:35]
-; GFX9-DPP-NEXT:    s_mov_b32 s12, s43
-; GFX9-DPP-NEXT:    s_mov_b32 s13, s42
+; GFX9-DPP-NEXT:    s_mov_b32 s12, s51
+; GFX9-DPP-NEXT:    s_mov_b32 s13, s50
 ; GFX9-DPP-NEXT:    s_mov_b32 s14, s33
 ; GFX9-DPP-NEXT:    v_mov_b32_e32 v31, v40
-; GFX9-DPP-NEXT:    s_mov_b64 s[2:3], s[54:55]
+; GFX9-DPP-NEXT:    s_mov_b64 s[2:3], s[70:71]
 ; GFX9-DPP-NEXT:    s_movk_i32 s32, 0x800
 ; GFX9-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
@@ -8382,20 +8382,20 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau
 ; GFX9-DPP-NEXT:    v_mbcnt_lo_u32_b32 v1, exec_lo, 0
 ; GFX9-DPP-NEXT:    v_mbcnt_hi_u32_b32 v1, exec_hi, v1
 ; GFX9-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
-; GFX9-DPP-NEXT:    v_readlane_b32 s45, v9, 63
-; GFX9-DPP-NEXT:    v_readlane_b32 s44, v8, 63
+; GFX9-DPP-NEXT:    v_readlane_b32 s53, v9, 63
+; GFX9-DPP-NEXT:    v_readlane_b32 s52, v8, 63
 ; GFX9-DPP-NEXT:    s_mov_b64 exec, s[0:1]
 ; GFX9-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
 ; GFX9-DPP-NEXT:    s_and_saveexec_b64 s[0:1], vcc
 ; GFX9-DPP-NEXT:    s_cbranch_execz .LBB11_3
 ; GFX9-DPP-NEXT:  ; %bb.1:
-; GFX9-DPP-NEXT:    s_load_dwordx2 s[46:47], s[36:37], 0x24
-; GFX9-DPP-NEXT:    s_mov_b64 s[48:49], 0
+; GFX9-DPP-NEXT:    s_load_dwordx2 s[54:55], s[36:37], 0x24
+; GFX9-DPP-NEXT:    s_mov_b64 s[64:65], 0
 ; GFX9-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-DPP-NEXT:    global_load_dwordx2 v[1:2], v0, s[46:47]
+; GFX9-DPP-NEXT:    global_load_dwordx2 v[1:2], v0, s[54:55]
 ; GFX9-DPP-NEXT:  .LBB11_2: ; %atomicrmw.start
 ; GFX9-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX9-DPP-NEXT:    v_max_f64 v[3:4], s[44:45], s[44:45]
+; GFX9-DPP-NEXT:    v_max_f64 v[3:4], s[52:53], s[52:53]
 ; GFX9-DPP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-DPP-NEXT:    v_max_f64 v[5:6], v[1:2], v[1:2]
 ; GFX9-DPP-NEXT:    s_add_u32 s8, s36, 44
@@ -8404,54 +8404,54 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau
 ; GFX9-DPP-NEXT:    s_add_u32 s0, s0, __atomic_compare_exchange at gotpcrel32@lo+4
 ; GFX9-DPP-NEXT:    s_addc_u32 s1, s1, __atomic_compare_exchange at gotpcrel32@hi+12
 ; GFX9-DPP-NEXT:    s_load_dwordx2 s[16:17], s[0:1], 0x0
-; GFX9-DPP-NEXT:    s_mov_b64 s[0:1], s[52:53]
-; GFX9-DPP-NEXT:    buffer_store_dword v2, off, s[52:55], 0 offset:4
-; GFX9-DPP-NEXT:    buffer_store_dword v1, off, s[52:55], 0
-; GFX9-DPP-NEXT:    s_mov_b64 s[4:5], s[40:41]
+; GFX9-DPP-NEXT:    s_mov_b64 s[0:1], s[68:69]
+; GFX9-DPP-NEXT:    buffer_store_dword v2, off, s[68:71], 0 offset:4
+; GFX9-DPP-NEXT:    buffer_store_dword v1, off, s[68:71], 0
+; GFX9-DPP-NEXT:    s_mov_b64 s[4:5], s[48:49]
 ; GFX9-DPP-NEXT:    v_max_f64 v[3:4], v[5:6], v[3:4]
 ; GFX9-DPP-NEXT:    s_mov_b64 s[6:7], s[38:39]
 ; GFX9-DPP-NEXT:    s_mov_b64 s[10:11], s[34:35]
-; GFX9-DPP-NEXT:    s_mov_b32 s12, s43
-; GFX9-DPP-NEXT:    s_mov_b32 s13, s42
+; GFX9-DPP-NEXT:    s_mov_b32 s12, s51
+; GFX9-DPP-NEXT:    s_mov_b32 s13, s50
 ; GFX9-DPP-NEXT:    s_mov_b32 s14, s33
 ; GFX9-DPP-NEXT:    v_mov_b32_e32 v31, v40
-; GFX9-DPP-NEXT:    buffer_store_dword v4, off, s[52:55], 0 offset:12
-; GFX9-DPP-NEXT:    buffer_store_dword v3, off, s[52:55], 0 offset:8
-; GFX9-DPP-NEXT:    s_mov_b64 s[2:3], s[54:55]
+; GFX9-DPP-NEXT:    buffer_store_dword v4, off, s[68:71], 0 offset:12
+; GFX9-DPP-NEXT:    buffer_store_dword v3, off, s[68:71], 0 offset:8
+; GFX9-DPP-NEXT:    s_mov_b64 s[2:3], s[70:71]
 ; GFX9-DPP-NEXT:    v_mov_b32_e32 v0, 8
 ; GFX9-DPP-NEXT:    v_mov_b32_e32 v1, 0
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v2, s46
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v3, s47
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v2, s54
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v3, s55
 ; GFX9-DPP-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX9-DPP-NEXT:    v_mov_b32_e32 v5, 8
 ; GFX9-DPP-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX9-DPP-NEXT:    v_mov_b32_e32 v7, 0
 ; GFX9-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
-; GFX9-DPP-NEXT:    buffer_load_dword v1, off, s[52:55], 0
-; GFX9-DPP-NEXT:    buffer_load_dword v2, off, s[52:55], 0 offset:4
+; GFX9-DPP-NEXT:    buffer_load_dword v1, off, s[68:71], 0
+; GFX9-DPP-NEXT:    buffer_load_dword v2, off, s[68:71], 0 offset:4
 ; GFX9-DPP-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX9-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX9-DPP-NEXT:    s_or_b64 s[48:49], vcc, s[48:49]
-; GFX9-DPP-NEXT:    s_andn2_b64 exec, exec, s[48:49]
+; GFX9-DPP-NEXT:    s_or_b64 s[64:65], vcc, s[64:65]
+; GFX9-DPP-NEXT:    s_andn2_b64 exec, exec, s[64:65]
 ; GFX9-DPP-NEXT:    s_cbranch_execnz .LBB11_2
 ; GFX9-DPP-NEXT:  .LBB11_3:
 ; GFX9-DPP-NEXT:    s_endpgm
 ;
 ; GFX1064-DPP-LABEL: global_atomic_fmax_double_uni_address_div_value_default_scope_unsafe:
 ; GFX1064-DPP:       ; %bb.0:
-; GFX1064-DPP-NEXT:    s_mov_b32 s48, SCRATCH_RSRC_DWORD0
-; GFX1064-DPP-NEXT:    s_mov_b32 s49, SCRATCH_RSRC_DWORD1
-; GFX1064-DPP-NEXT:    s_mov_b32 s50, -1
-; GFX1064-DPP-NEXT:    s_mov_b32 s51, 0x31e16000
-; GFX1064-DPP-NEXT:    s_add_u32 s48, s48, s11
+; GFX1064-DPP-NEXT:    s_mov_b32 s64, SCRATCH_RSRC_DWORD0
+; GFX1064-DPP-NEXT:    s_mov_b32 s65, SCRATCH_RSRC_DWORD1
+; GFX1064-DPP-NEXT:    s_mov_b32 s66, -1
+; GFX1064-DPP-NEXT:    s_mov_b32 s67, 0x31e16000
+; GFX1064-DPP-NEXT:    s_add_u32 s64, s64, s11
 ; GFX1064-DPP-NEXT:    s_mov_b64 s[34:35], s[4:5]
-; GFX1064-DPP-NEXT:    s_addc_u32 s49, s49, 0
-; GFX1064-DPP-NEXT:    s_mov_b32 s43, s8
+; GFX1064-DPP-NEXT:    s_addc_u32 s65, s65, 0
+; GFX1064-DPP-NEXT:    s_mov_b32 s51, s8
 ; GFX1064-DPP-NEXT:    s_add_u32 s8, s34, 44
-; GFX1064-DPP-NEXT:    s_mov_b32 s42, s9
+; GFX1064-DPP-NEXT:    s_mov_b32 s50, s9
 ; GFX1064-DPP-NEXT:    s_addc_u32 s9, s35, 0
-; GFX1064-DPP-NEXT:    s_mov_b64 s[40:41], s[0:1]
+; GFX1064-DPP-NEXT:    s_mov_b64 s[48:49], s[0:1]
 ; GFX1064-DPP-NEXT:    s_getpc_b64 s[0:1]
 ; GFX1064-DPP-NEXT:    s_add_u32 s0, s0, div.double.value at gotpcrel32@lo+4
 ; GFX1064-DPP-NEXT:    s_addc_u32 s1, s1, div.double.value at gotpcrel32@hi+12
@@ -8463,14 +8463,14 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau
 ; GFX1064-DPP-NEXT:    s_mov_b64 s[38:39], s[2:3]
 ; GFX1064-DPP-NEXT:    s_mov_b64 s[6:7], s[2:3]
 ; GFX1064-DPP-NEXT:    v_or3_b32 v40, v0, v1, v2
-; GFX1064-DPP-NEXT:    s_mov_b64 s[0:1], s[48:49]
-; GFX1064-DPP-NEXT:    s_mov_b64 s[4:5], s[40:41]
+; GFX1064-DPP-NEXT:    s_mov_b64 s[0:1], s[64:65]
+; GFX1064-DPP-NEXT:    s_mov_b64 s[4:5], s[48:49]
 ; GFX1064-DPP-NEXT:    s_mov_b64 s[10:11], s[36:37]
-; GFX1064-DPP-NEXT:    s_mov_b32 s12, s43
+; GFX1064-DPP-NEXT:    s_mov_b32 s12, s51
 ; GFX1064-DPP-NEXT:    v_mov_b32_e32 v31, v40
-; GFX1064-DPP-NEXT:    s_mov_b32 s13, s42
+; GFX1064-DPP-NEXT:    s_mov_b32 s13, s50
 ; GFX1064-DPP-NEXT:    s_mov_b32 s14, s33
-; GFX1064-DPP-NEXT:    s_mov_b64 s[2:3], s[50:51]
+; GFX1064-DPP-NEXT:    s_mov_b64 s[2:3], s[66:67]
 ; GFX1064-DPP-NEXT:    s_movk_i32 s32, 0x800
 ; GFX1064-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1064-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
@@ -8523,10 +8523,10 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau
 ; GFX1064-DPP-NEXT:    s_and_saveexec_b64 s[0:1], vcc
 ; GFX1064-DPP-NEXT:    s_cbranch_execz .LBB11_3
 ; GFX1064-DPP-NEXT:  ; %bb.1:
-; GFX1064-DPP-NEXT:    s_load_dwordx2 s[44:45], s[34:35], 0x24
-; GFX1064-DPP-NEXT:    s_mov_b64 s[46:47], 0
+; GFX1064-DPP-NEXT:    s_load_dwordx2 s[52:53], s[34:35], 0x24
+; GFX1064-DPP-NEXT:    s_mov_b64 s[54:55], 0
 ; GFX1064-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1064-DPP-NEXT:    global_load_dwordx2 v[1:2], v0, s[44:45]
+; GFX1064-DPP-NEXT:    global_load_dwordx2 v[1:2], v0, s[52:53]
 ; GFX1064-DPP-NEXT:  .LBB11_2: ; %atomicrmw.start
 ; GFX1064-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1064-DPP-NEXT:    v_max_f64 v[3:4], v[41:42], v[41:42]
@@ -8537,56 +8537,56 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau
 ; GFX1064-DPP-NEXT:    s_getpc_b64 s[0:1]
 ; GFX1064-DPP-NEXT:    s_add_u32 s0, s0, __atomic_compare_exchange at gotpcrel32@lo+4
 ; GFX1064-DPP-NEXT:    s_addc_u32 s1, s1, __atomic_compare_exchange at gotpcrel32@hi+12
-; GFX1064-DPP-NEXT:    buffer_store_dword v2, off, s[48:51], 0 offset:4
-; GFX1064-DPP-NEXT:    buffer_store_dword v1, off, s[48:51], 0
+; GFX1064-DPP-NEXT:    buffer_store_dword v2, off, s[64:67], 0 offset:4
+; GFX1064-DPP-NEXT:    buffer_store_dword v1, off, s[64:67], 0
 ; GFX1064-DPP-NEXT:    s_load_dwordx2 s[16:17], s[0:1], 0x0
 ; GFX1064-DPP-NEXT:    v_mov_b32_e32 v31, v40
 ; GFX1064-DPP-NEXT:    v_mov_b32_e32 v0, 8
 ; GFX1064-DPP-NEXT:    v_mov_b32_e32 v1, 0
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v2, s44
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v2, s52
 ; GFX1064-DPP-NEXT:    v_mov_b32_e32 v7, 0
-; GFX1064-DPP-NEXT:    s_mov_b64 s[0:1], s[48:49]
-; GFX1064-DPP-NEXT:    s_mov_b64 s[4:5], s[40:41]
+; GFX1064-DPP-NEXT:    s_mov_b64 s[0:1], s[64:65]
+; GFX1064-DPP-NEXT:    s_mov_b64 s[4:5], s[48:49]
 ; GFX1064-DPP-NEXT:    s_mov_b64 s[6:7], s[38:39]
 ; GFX1064-DPP-NEXT:    s_mov_b64 s[10:11], s[36:37]
-; GFX1064-DPP-NEXT:    s_mov_b32 s12, s43
-; GFX1064-DPP-NEXT:    s_mov_b32 s13, s42
+; GFX1064-DPP-NEXT:    s_mov_b32 s12, s51
+; GFX1064-DPP-NEXT:    s_mov_b32 s13, s50
 ; GFX1064-DPP-NEXT:    s_mov_b32 s14, s33
-; GFX1064-DPP-NEXT:    s_mov_b64 s[2:3], s[50:51]
+; GFX1064-DPP-NEXT:    s_mov_b64 s[2:3], s[66:67]
 ; GFX1064-DPP-NEXT:    v_max_f64 v[3:4], v[5:6], v[3:4]
 ; GFX1064-DPP-NEXT:    v_mov_b32_e32 v5, 8
 ; GFX1064-DPP-NEXT:    v_mov_b32_e32 v6, 0
-; GFX1064-DPP-NEXT:    buffer_store_dword v4, off, s[48:51], 0 offset:12
-; GFX1064-DPP-NEXT:    buffer_store_dword v3, off, s[48:51], 0 offset:8
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v3, s45
+; GFX1064-DPP-NEXT:    buffer_store_dword v4, off, s[64:67], 0 offset:12
+; GFX1064-DPP-NEXT:    buffer_store_dword v3, off, s[64:67], 0 offset:8
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v3, s53
 ; GFX1064-DPP-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX1064-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1064-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GFX1064-DPP-NEXT:    s_clause 0x1
-; GFX1064-DPP-NEXT:    buffer_load_dword v1, off, s[48:51], 0
-; GFX1064-DPP-NEXT:    buffer_load_dword v2, off, s[48:51], 0 offset:4
+; GFX1064-DPP-NEXT:    buffer_load_dword v1, off, s[64:67], 0
+; GFX1064-DPP-NEXT:    buffer_load_dword v2, off, s[64:67], 0 offset:4
 ; GFX1064-DPP-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX1064-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX1064-DPP-NEXT:    s_or_b64 s[46:47], vcc, s[46:47]
-; GFX1064-DPP-NEXT:    s_andn2_b64 exec, exec, s[46:47]
+; GFX1064-DPP-NEXT:    s_or_b64 s[54:55], vcc, s[54:55]
+; GFX1064-DPP-NEXT:    s_andn2_b64 exec, exec, s[54:55]
 ; GFX1064-DPP-NEXT:    s_cbranch_execnz .LBB11_2
 ; GFX1064-DPP-NEXT:  .LBB11_3:
 ; GFX1064-DPP-NEXT:    s_endpgm
 ;
 ; GFX1032-DPP-LABEL: global_atomic_fmax_double_uni_address_div_value_default_scope_unsafe:
 ; GFX1032-DPP:       ; %bb.0:
-; GFX1032-DPP-NEXT:    s_mov_b32 s48, SCRATCH_RSRC_DWORD0
-; GFX1032-DPP-NEXT:    s_mov_b32 s49, SCRATCH_RSRC_DWORD1
-; GFX1032-DPP-NEXT:    s_mov_b32 s50, -1
-; GFX1032-DPP-NEXT:    s_mov_b32 s51, 0x31c16000
-; GFX1032-DPP-NEXT:    s_add_u32 s48, s48, s11
+; GFX1032-DPP-NEXT:    s_mov_b32 s64, SCRATCH_RSRC_DWORD0
+; GFX1032-DPP-NEXT:    s_mov_b32 s65, SCRATCH_RSRC_DWORD1
+; GFX1032-DPP-NEXT:    s_mov_b32 s66, -1
+; GFX1032-DPP-NEXT:    s_mov_b32 s67, 0x31c16000
+; GFX1032-DPP-NEXT:    s_add_u32 s64, s64, s11
 ; GFX1032-DPP-NEXT:    s_mov_b64 s[34:35], s[4:5]
-; GFX1032-DPP-NEXT:    s_addc_u32 s49, s49, 0
-; GFX1032-DPP-NEXT:    s_mov_b32 s43, s8
+; GFX1032-DPP-NEXT:    s_addc_u32 s65, s65, 0
+; GFX1032-DPP-NEXT:    s_mov_b32 s51, s8
 ; GFX1032-DPP-NEXT:    s_add_u32 s8, s34, 44
-; GFX1032-DPP-NEXT:    s_mov_b32 s42, s9
+; GFX1032-DPP-NEXT:    s_mov_b32 s50, s9
 ; GFX1032-DPP-NEXT:    s_addc_u32 s9, s35, 0
-; GFX1032-DPP-NEXT:    s_mov_b64 s[40:41], s[0:1]
+; GFX1032-DPP-NEXT:    s_mov_b64 s[48:49], s[0:1]
 ; GFX1032-DPP-NEXT:    s_getpc_b64 s[0:1]
 ; GFX1032-DPP-NEXT:    s_add_u32 s0, s0, div.double.value at gotpcrel32@lo+4
 ; GFX1032-DPP-NEXT:    s_addc_u32 s1, s1, div.double.value at gotpcrel32@hi+12
@@ -8598,14 +8598,14 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau
 ; GFX1032-DPP-NEXT:    s_mov_b64 s[38:39], s[2:3]
 ; GFX1032-DPP-NEXT:    s_mov_b64 s[6:7], s[2:3]
 ; GFX1032-DPP-NEXT:    v_or3_b32 v40, v0, v1, v2
-; GFX1032-DPP-NEXT:    s_mov_b64 s[0:1], s[48:49]
-; GFX1032-DPP-NEXT:    s_mov_b64 s[4:5], s[40:41]
+; GFX1032-DPP-NEXT:    s_mov_b64 s[0:1], s[64:65]
+; GFX1032-DPP-NEXT:    s_mov_b64 s[4:5], s[48:49]
 ; GFX1032-DPP-NEXT:    s_mov_b64 s[10:11], s[36:37]
-; GFX1032-DPP-NEXT:    s_mov_b32 s12, s43
+; GFX1032-DPP-NEXT:    s_mov_b32 s12, s51
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v31, v40
-; GFX1032-DPP-NEXT:    s_mov_b32 s13, s42
+; GFX1032-DPP-NEXT:    s_mov_b32 s13, s50
 ; GFX1032-DPP-NEXT:    s_mov_b32 s14, s33
-; GFX1032-DPP-NEXT:    s_mov_b64 s[2:3], s[50:51]
+; GFX1032-DPP-NEXT:    s_mov_b64 s[2:3], s[66:67]
 ; GFX1032-DPP-NEXT:    s_movk_i32 s32, 0x400
 ; GFX1032-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1032-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
@@ -8646,15 +8646,15 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v3, v8
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v4, v9
-; GFX1032-DPP-NEXT:    s_mov_b32 s46, 0
+; GFX1032-DPP-NEXT:    s_mov_b32 s54, 0
 ; GFX1032-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
 ; GFX1032-DPP-NEXT:    s_and_saveexec_b32 s0, vcc_lo
 ; GFX1032-DPP-NEXT:    s_cbranch_execz .LBB11_3
 ; GFX1032-DPP-NEXT:  ; %bb.1:
-; GFX1032-DPP-NEXT:    s_load_dwordx2 s[44:45], s[34:35], 0x24
+; GFX1032-DPP-NEXT:    s_load_dwordx2 s[52:53], s[34:35], 0x24
 ; GFX1032-DPP-NEXT:    v_max_f64 v[41:42], v[3:4], v[3:4]
 ; GFX1032-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT:    global_load_dwordx2 v[1:2], v0, s[44:45]
+; GFX1032-DPP-NEXT:    global_load_dwordx2 v[1:2], v0, s[52:53]
 ; GFX1032-DPP-NEXT:  .LBB11_2: ; %atomicrmw.start
 ; GFX1032-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1032-DPP-NEXT:    s_waitcnt vmcnt(0)
@@ -8664,38 +8664,38 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau
 ; GFX1032-DPP-NEXT:    s_getpc_b64 s[0:1]
 ; GFX1032-DPP-NEXT:    s_add_u32 s0, s0, __atomic_compare_exchange at gotpcrel32@lo+4
 ; GFX1032-DPP-NEXT:    s_addc_u32 s1, s1, __atomic_compare_exchange at gotpcrel32@hi+12
-; GFX1032-DPP-NEXT:    buffer_store_dword v2, off, s[48:51], 0 offset:4
-; GFX1032-DPP-NEXT:    buffer_store_dword v1, off, s[48:51], 0
+; GFX1032-DPP-NEXT:    buffer_store_dword v2, off, s[64:67], 0 offset:4
+; GFX1032-DPP-NEXT:    buffer_store_dword v1, off, s[64:67], 0
 ; GFX1032-DPP-NEXT:    s_load_dwordx2 s[16:17], s[0:1], 0x0
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v31, v40
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v0, 8
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v1, 0
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v2, s44
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v2, s52
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v5, 8
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v7, 0
-; GFX1032-DPP-NEXT:    s_mov_b64 s[0:1], s[48:49]
-; GFX1032-DPP-NEXT:    s_mov_b64 s[4:5], s[40:41]
+; GFX1032-DPP-NEXT:    s_mov_b64 s[0:1], s[64:65]
+; GFX1032-DPP-NEXT:    s_mov_b64 s[4:5], s[48:49]
 ; GFX1032-DPP-NEXT:    s_mov_b64 s[6:7], s[38:39]
 ; GFX1032-DPP-NEXT:    s_mov_b64 s[10:11], s[36:37]
-; GFX1032-DPP-NEXT:    s_mov_b32 s12, s43
-; GFX1032-DPP-NEXT:    s_mov_b32 s13, s42
+; GFX1032-DPP-NEXT:    s_mov_b32 s12, s51
+; GFX1032-DPP-NEXT:    s_mov_b32 s13, s50
 ; GFX1032-DPP-NEXT:    s_mov_b32 s14, s33
-; GFX1032-DPP-NEXT:    s_mov_b64 s[2:3], s[50:51]
+; GFX1032-DPP-NEXT:    s_mov_b64 s[2:3], s[66:67]
 ; GFX1032-DPP-NEXT:    v_max_f64 v[3:4], v[3:4], v[41:42]
-; GFX1032-DPP-NEXT:    buffer_store_dword v4, off, s[48:51], 0 offset:12
-; GFX1032-DPP-NEXT:    buffer_store_dword v3, off, s[48:51], 0 offset:8
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v3, s45
+; GFX1032-DPP-NEXT:    buffer_store_dword v4, off, s[64:67], 0 offset:12
+; GFX1032-DPP-NEXT:    buffer_store_dword v3, off, s[64:67], 0 offset:8
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v3, s53
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX1032-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1032-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GFX1032-DPP-NEXT:    s_clause 0x1
-; GFX1032-DPP-NEXT:    buffer_load_dword v1, off, s[48:51], 0
-; GFX1032-DPP-NEXT:    buffer_load_dword v2, off, s[48:51], 0 offset:4
+; GFX1032-DPP-NEXT:    buffer_load_dword v1, off, s[64:67], 0
+; GFX1032-DPP-NEXT:    buffer_load_dword v2, off, s[64:67], 0 offset:4
 ; GFX1032-DPP-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX1032-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX1032-DPP-NEXT:    s_or_b32 s46, vcc_lo, s46
-; GFX1032-DPP-NEXT:    s_andn2_b32 exec_lo, exec_lo, s46
+; GFX1032-DPP-NEXT:    s_or_b32 s54, vcc_lo, s54
+; GFX1032-DPP-NEXT:    s_andn2_b32 exec_lo, exec_lo, s54
 ; GFX1032-DPP-NEXT:    s_cbranch_execnz .LBB11_2
 ; GFX1032-DPP-NEXT:  .LBB11_3:
 ; GFX1032-DPP-NEXT:    s_endpgm
@@ -8703,11 +8703,11 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau
 ; GFX1164-DPP-LABEL: global_atomic_fmax_double_uni_address_div_value_default_scope_unsafe:
 ; GFX1164-DPP:       ; %bb.0:
 ; GFX1164-DPP-NEXT:    s_mov_b64 s[34:35], s[4:5]
-; GFX1164-DPP-NEXT:    s_mov_b32 s43, s8
+; GFX1164-DPP-NEXT:    s_mov_b32 s51, s8
 ; GFX1164-DPP-NEXT:    s_add_u32 s8, s34, 44
-; GFX1164-DPP-NEXT:    s_mov_b32 s42, s9
+; GFX1164-DPP-NEXT:    s_mov_b32 s50, s9
 ; GFX1164-DPP-NEXT:    s_addc_u32 s9, s35, 0
-; GFX1164-DPP-NEXT:    s_mov_b64 s[40:41], s[0:1]
+; GFX1164-DPP-NEXT:    s_mov_b64 s[48:49], s[0:1]
 ; GFX1164-DPP-NEXT:    s_getpc_b64 s[0:1]
 ; GFX1164-DPP-NEXT:    s_add_u32 s0, s0, div.double.value at gotpcrel32@lo+4
 ; GFX1164-DPP-NEXT:    s_addc_u32 s1, s1, div.double.value at gotpcrel32@hi+12
@@ -8715,11 +8715,11 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau
 ; GFX1164-DPP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
 ; GFX1164-DPP-NEXT:    s_mov_b32 s33, s10
 ; GFX1164-DPP-NEXT:    s_mov_b64 s[36:37], s[6:7]
-; GFX1164-DPP-NEXT:    s_mov_b64 s[4:5], s[40:41]
+; GFX1164-DPP-NEXT:    s_mov_b64 s[4:5], s[48:49]
 ; GFX1164-DPP-NEXT:    s_mov_b64 s[6:7], s[2:3]
 ; GFX1164-DPP-NEXT:    s_mov_b64 s[10:11], s[36:37]
-; GFX1164-DPP-NEXT:    s_mov_b32 s12, s43
-; GFX1164-DPP-NEXT:    s_mov_b32 s13, s42
+; GFX1164-DPP-NEXT:    s_mov_b32 s12, s51
+; GFX1164-DPP-NEXT:    s_mov_b32 s13, s50
 ; GFX1164-DPP-NEXT:    s_mov_b32 s14, s33
 ; GFX1164-DPP-NEXT:    s_mov_b32 s32, 32
 ; GFX1164-DPP-NEXT:    v_mov_b32_e32 v40, v0
@@ -8787,11 +8787,11 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau
 ; GFX1164-DPP-NEXT:    v_cmpx_eq_u32_e32 0, v1
 ; GFX1164-DPP-NEXT:    s_cbranch_execz .LBB11_3
 ; GFX1164-DPP-NEXT:  ; %bb.1:
-; GFX1164-DPP-NEXT:    s_load_b64 s[44:45], s[34:35], 0x24
+; GFX1164-DPP-NEXT:    s_load_b64 s[52:53], s[34:35], 0x24
 ; GFX1164-DPP-NEXT:    v_max_f64 v[41:42], v[3:4], v[3:4]
-; GFX1164-DPP-NEXT:    s_mov_b64 s[46:47], 0
+; GFX1164-DPP-NEXT:    s_mov_b64 s[54:55], 0
 ; GFX1164-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT:    global_load_b64 v[1:2], v0, s[44:45]
+; GFX1164-DPP-NEXT:    global_load_b64 v[1:2], v0, s[52:53]
 ; GFX1164-DPP-NEXT:    s_set_inst_prefetch_distance 0x1
 ; GFX1164-DPP-NEXT:    .p2align 6
 ; GFX1164-DPP-NEXT:  .LBB11_2: ; %atomicrmw.start
@@ -8809,18 +8809,18 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau
 ; GFX1164-DPP-NEXT:    v_mov_b32_e32 v5, 8
 ; GFX1164-DPP-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX1164-DPP-NEXT:    v_mov_b32_e32 v7, 0
-; GFX1164-DPP-NEXT:    s_mov_b64 s[4:5], s[40:41]
+; GFX1164-DPP-NEXT:    s_mov_b64 s[4:5], s[48:49]
 ; GFX1164-DPP-NEXT:    s_mov_b64 s[6:7], s[38:39]
 ; GFX1164-DPP-NEXT:    s_mov_b64 s[10:11], s[36:37]
-; GFX1164-DPP-NEXT:    s_mov_b32 s12, s43
-; GFX1164-DPP-NEXT:    s_mov_b32 s13, s42
+; GFX1164-DPP-NEXT:    s_mov_b32 s12, s51
+; GFX1164-DPP-NEXT:    s_mov_b32 s13, s50
 ; GFX1164-DPP-NEXT:    s_mov_b32 s14, s33
 ; GFX1164-DPP-NEXT:    v_max_f64 v[3:4], v[3:4], v[41:42]
 ; GFX1164-DPP-NEXT:    scratch_store_b64 off, v[1:2], off
 ; GFX1164-DPP-NEXT:    v_mov_b32_e32 v1, 0
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v2, s44
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v2, s52
 ; GFX1164-DPP-NEXT:    scratch_store_b64 off, v[3:4], off offset:8
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v3, s45
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v3, s53
 ; GFX1164-DPP-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX1164-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1164-DPP-NEXT:    s_swappc_b64 s[30:31], s[0:1]
@@ -8828,8 +8828,8 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau
 ; GFX1164-DPP-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
 ; GFX1164-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX1164-DPP-NEXT:    s_or_b64 s[46:47], vcc, s[46:47]
-; GFX1164-DPP-NEXT:    s_and_not1_b64 exec, exec, s[46:47]
+; GFX1164-DPP-NEXT:    s_or_b64 s[54:55], vcc, s[54:55]
+; GFX1164-DPP-NEXT:    s_and_not1_b64 exec, exec, s[54:55]
 ; GFX1164-DPP-NEXT:    s_cbranch_execnz .LBB11_2
 ; GFX1164-DPP-NEXT:  .LBB11_3:
 ; GFX1164-DPP-NEXT:    s_set_inst_prefetch_distance 0x2
@@ -8838,7 +8838,7 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau
 ; GFX1132-DPP-LABEL: global_atomic_fmax_double_uni_address_div_value_default_scope_unsafe:
 ; GFX1132-DPP:       ; %bb.0:
 ; GFX1132-DPP-NEXT:    s_mov_b64 s[34:35], s[4:5]
-; GFX1132-DPP-NEXT:    s_mov_b64 s[40:41], s[0:1]
+; GFX1132-DPP-NEXT:    s_mov_b64 s[48:49], s[0:1]
 ; GFX1132-DPP-NEXT:    s_add_u32 s8, s34, 44
 ; GFX1132-DPP-NEXT:    s_addc_u32 s9, s35, 0
 ; GFX1132-DPP-NEXT:    s_getpc_b64 s[0:1]
@@ -8847,9 +8847,9 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau
 ; GFX1132-DPP-NEXT:    v_mov_b32_e32 v31, v0
 ; GFX1132-DPP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
 ; GFX1132-DPP-NEXT:    s_mov_b64 s[36:37], s[6:7]
-; GFX1132-DPP-NEXT:    s_mov_b32 s42, s14
-; GFX1132-DPP-NEXT:    s_mov_b32 s43, s13
-; GFX1132-DPP-NEXT:    s_mov_b64 s[4:5], s[40:41]
+; GFX1132-DPP-NEXT:    s_mov_b32 s50, s14
+; GFX1132-DPP-NEXT:    s_mov_b32 s51, s13
+; GFX1132-DPP-NEXT:    s_mov_b64 s[4:5], s[48:49]
 ; GFX1132-DPP-NEXT:    s_mov_b64 s[6:7], s[2:3]
 ; GFX1132-DPP-NEXT:    s_mov_b64 s[10:11], s[36:37]
 ; GFX1132-DPP-NEXT:    s_mov_b32 s12, s13
@@ -8904,16 +8904,16 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau
 ; GFX1132-DPP-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v3, v8
 ; GFX1132-DPP-NEXT:    v_mbcnt_lo_u32_b32 v1, exec_lo, 0
 ; GFX1132-DPP-NEXT:    v_mov_b32_e32 v4, v9
-; GFX1132-DPP-NEXT:    s_mov_b32 s46, 0
+; GFX1132-DPP-NEXT:    s_mov_b32 s54, 0
 ; GFX1132-DPP-NEXT:    s_mov_b32 s0, exec_lo
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX1132-DPP-NEXT:    v_cmpx_eq_u32_e32 0, v1
 ; GFX1132-DPP-NEXT:    s_cbranch_execz .LBB11_3
 ; GFX1132-DPP-NEXT:  ; %bb.1:
-; GFX1132-DPP-NEXT:    s_load_b64 s[44:45], s[34:35], 0x24
+; GFX1132-DPP-NEXT:    s_load_b64 s[52:53], s[34:35], 0x24
 ; GFX1132-DPP-NEXT:    v_max_f64 v[41:42], v[3:4], v[3:4]
 ; GFX1132-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT:    global_load_b64 v[1:2], v0, s[44:45]
+; GFX1132-DPP-NEXT:    global_load_b64 v[1:2], v0, s[52:53]
 ; GFX1132-DPP-NEXT:    s_set_inst_prefetch_distance 0x1
 ; GFX1132-DPP-NEXT:    .p2align 6
 ; GFX1132-DPP-NEXT:  .LBB11_2: ; %atomicrmw.start
@@ -8929,26 +8929,26 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau
 ; GFX1132-DPP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
 ; GFX1132-DPP-NEXT:    v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0
 ; GFX1132-DPP-NEXT:    v_mov_b32_e32 v7, 0
-; GFX1132-DPP-NEXT:    s_mov_b64 s[4:5], s[40:41]
+; GFX1132-DPP-NEXT:    s_mov_b64 s[4:5], s[48:49]
 ; GFX1132-DPP-NEXT:    s_mov_b64 s[6:7], s[38:39]
 ; GFX1132-DPP-NEXT:    s_mov_b64 s[10:11], s[36:37]
-; GFX1132-DPP-NEXT:    s_mov_b32 s12, s43
-; GFX1132-DPP-NEXT:    s_mov_b32 s13, s42
+; GFX1132-DPP-NEXT:    s_mov_b32 s12, s51
+; GFX1132-DPP-NEXT:    s_mov_b32 s13, s50
 ; GFX1132-DPP-NEXT:    s_mov_b32 s14, s33
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_4)
 ; GFX1132-DPP-NEXT:    v_max_f64 v[3:4], v[3:4], v[41:42]
 ; GFX1132-DPP-NEXT:    scratch_store_b64 off, v[1:2], off
-; GFX1132-DPP-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s44
+; GFX1132-DPP-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s52
 ; GFX1132-DPP-NEXT:    scratch_store_b64 off, v[3:4], off offset:8
-; GFX1132-DPP-NEXT:    v_dual_mov_b32 v3, s45 :: v_dual_mov_b32 v4, 0
+; GFX1132-DPP-NEXT:    v_dual_mov_b32 v3, s53 :: v_dual_mov_b32 v4, 0
 ; GFX1132-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132-DPP-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX1132-DPP-NEXT:    scratch_load_b64 v[1:2], off, off
 ; GFX1132-DPP-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
 ; GFX1132-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX1132-DPP-NEXT:    s_or_b32 s46, vcc_lo, s46
-; GFX1132-DPP-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s46
+; GFX1132-DPP-NEXT:    s_or_b32 s54, vcc_lo, s54
+; GFX1132-DPP-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s54
 ; GFX1132-DPP-NEXT:    s_cbranch_execnz .LBB11_2
 ; GFX1132-DPP-NEXT:  .LBB11_3:
 ; GFX1132-DPP-NEXT:    s_set_inst_prefetch_distance 0x2

diff  --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll
index 68d7dcc60506c..6b1d5253e178f 100644
--- a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll
+++ b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll
@@ -635,12 +635,12 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_agent_scope_
 ; GFX7LESS-DPP-LABEL: global_atomic_fmin_uni_address_div_value_agent_scope_unsafe:
 ; GFX7LESS-DPP:       ; %bb.0:
 ; GFX7LESS-DPP-NEXT:    s_mov_b32 s32, 0
-; GFX7LESS-DPP-NEXT:    s_mov_b32 s40, SCRATCH_RSRC_DWORD0
-; GFX7LESS-DPP-NEXT:    s_mov_b32 s41, SCRATCH_RSRC_DWORD1
-; GFX7LESS-DPP-NEXT:    s_mov_b32 s42, -1
-; GFX7LESS-DPP-NEXT:    s_mov_b32 s43, 0xe8f000
-; GFX7LESS-DPP-NEXT:    s_add_u32 s40, s40, s11
-; GFX7LESS-DPP-NEXT:    s_addc_u32 s41, s41, 0
+; GFX7LESS-DPP-NEXT:    s_mov_b32 s48, SCRATCH_RSRC_DWORD0
+; GFX7LESS-DPP-NEXT:    s_mov_b32 s49, SCRATCH_RSRC_DWORD1
+; GFX7LESS-DPP-NEXT:    s_mov_b32 s50, -1
+; GFX7LESS-DPP-NEXT:    s_mov_b32 s51, 0xe8f000
+; GFX7LESS-DPP-NEXT:    s_add_u32 s48, s48, s11
+; GFX7LESS-DPP-NEXT:    s_addc_u32 s49, s49, 0
 ; GFX7LESS-DPP-NEXT:    s_mov_b32 s14, s10
 ; GFX7LESS-DPP-NEXT:    s_mov_b32 s13, s9
 ; GFX7LESS-DPP-NEXT:    s_mov_b32 s12, s8
@@ -660,8 +660,8 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_agent_scope_
 ; GFX7LESS-DPP-NEXT:    v_or_b32_e32 v31, v0, v2
 ; GFX7LESS-DPP-NEXT:    s_mov_b64 s[4:5], s[0:1]
 ; GFX7LESS-DPP-NEXT:    s_mov_b64 s[6:7], s[2:3]
-; GFX7LESS-DPP-NEXT:    s_mov_b64 s[0:1], s[40:41]
-; GFX7LESS-DPP-NEXT:    s_mov_b64 s[2:3], s[42:43]
+; GFX7LESS-DPP-NEXT:    s_mov_b64 s[0:1], s[48:49]
+; GFX7LESS-DPP-NEXT:    s_mov_b64 s[2:3], s[50:51]
 ; GFX7LESS-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7LESS-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GFX7LESS-DPP-NEXT:    buffer_load_dword v1, off, s[36:39], 0
@@ -1674,12 +1674,12 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_one_as_scope
 ; GFX7LESS-DPP-LABEL: global_atomic_fmin_uni_address_div_value_one_as_scope_unsafe:
 ; GFX7LESS-DPP:       ; %bb.0:
 ; GFX7LESS-DPP-NEXT:    s_mov_b32 s32, 0
-; GFX7LESS-DPP-NEXT:    s_mov_b32 s40, SCRATCH_RSRC_DWORD0
-; GFX7LESS-DPP-NEXT:    s_mov_b32 s41, SCRATCH_RSRC_DWORD1
-; GFX7LESS-DPP-NEXT:    s_mov_b32 s42, -1
-; GFX7LESS-DPP-NEXT:    s_mov_b32 s43, 0xe8f000
-; GFX7LESS-DPP-NEXT:    s_add_u32 s40, s40, s11
-; GFX7LESS-DPP-NEXT:    s_addc_u32 s41, s41, 0
+; GFX7LESS-DPP-NEXT:    s_mov_b32 s48, SCRATCH_RSRC_DWORD0
+; GFX7LESS-DPP-NEXT:    s_mov_b32 s49, SCRATCH_RSRC_DWORD1
+; GFX7LESS-DPP-NEXT:    s_mov_b32 s50, -1
+; GFX7LESS-DPP-NEXT:    s_mov_b32 s51, 0xe8f000
+; GFX7LESS-DPP-NEXT:    s_add_u32 s48, s48, s11
+; GFX7LESS-DPP-NEXT:    s_addc_u32 s49, s49, 0
 ; GFX7LESS-DPP-NEXT:    s_mov_b32 s14, s10
 ; GFX7LESS-DPP-NEXT:    s_mov_b32 s13, s9
 ; GFX7LESS-DPP-NEXT:    s_mov_b32 s12, s8
@@ -1699,8 +1699,8 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_one_as_scope
 ; GFX7LESS-DPP-NEXT:    v_or_b32_e32 v31, v0, v2
 ; GFX7LESS-DPP-NEXT:    s_mov_b64 s[4:5], s[0:1]
 ; GFX7LESS-DPP-NEXT:    s_mov_b64 s[6:7], s[2:3]
-; GFX7LESS-DPP-NEXT:    s_mov_b64 s[0:1], s[40:41]
-; GFX7LESS-DPP-NEXT:    s_mov_b64 s[2:3], s[42:43]
+; GFX7LESS-DPP-NEXT:    s_mov_b64 s[0:1], s[48:49]
+; GFX7LESS-DPP-NEXT:    s_mov_b64 s[2:3], s[50:51]
 ; GFX7LESS-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7LESS-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GFX7LESS-DPP-NEXT:    buffer_load_dword v1, off, s[36:39], 0
@@ -2713,12 +2713,12 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_default_scop
 ; GFX7LESS-DPP-LABEL: global_atomic_fmin_uni_address_div_value_default_scope_unsafe:
 ; GFX7LESS-DPP:       ; %bb.0:
 ; GFX7LESS-DPP-NEXT:    s_mov_b32 s32, 0
-; GFX7LESS-DPP-NEXT:    s_mov_b32 s40, SCRATCH_RSRC_DWORD0
-; GFX7LESS-DPP-NEXT:    s_mov_b32 s41, SCRATCH_RSRC_DWORD1
-; GFX7LESS-DPP-NEXT:    s_mov_b32 s42, -1
-; GFX7LESS-DPP-NEXT:    s_mov_b32 s43, 0xe8f000
-; GFX7LESS-DPP-NEXT:    s_add_u32 s40, s40, s11
-; GFX7LESS-DPP-NEXT:    s_addc_u32 s41, s41, 0
+; GFX7LESS-DPP-NEXT:    s_mov_b32 s48, SCRATCH_RSRC_DWORD0
+; GFX7LESS-DPP-NEXT:    s_mov_b32 s49, SCRATCH_RSRC_DWORD1
+; GFX7LESS-DPP-NEXT:    s_mov_b32 s50, -1
+; GFX7LESS-DPP-NEXT:    s_mov_b32 s51, 0xe8f000
+; GFX7LESS-DPP-NEXT:    s_add_u32 s48, s48, s11
+; GFX7LESS-DPP-NEXT:    s_addc_u32 s49, s49, 0
 ; GFX7LESS-DPP-NEXT:    s_mov_b32 s14, s10
 ; GFX7LESS-DPP-NEXT:    s_mov_b32 s13, s9
 ; GFX7LESS-DPP-NEXT:    s_mov_b32 s12, s8
@@ -2738,8 +2738,8 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_default_scop
 ; GFX7LESS-DPP-NEXT:    v_or_b32_e32 v31, v0, v2
 ; GFX7LESS-DPP-NEXT:    s_mov_b64 s[4:5], s[0:1]
 ; GFX7LESS-DPP-NEXT:    s_mov_b64 s[6:7], s[2:3]
-; GFX7LESS-DPP-NEXT:    s_mov_b64 s[0:1], s[40:41]
-; GFX7LESS-DPP-NEXT:    s_mov_b64 s[2:3], s[42:43]
+; GFX7LESS-DPP-NEXT:    s_mov_b64 s[0:1], s[48:49]
+; GFX7LESS-DPP-NEXT:    s_mov_b64 s[2:3], s[50:51]
 ; GFX7LESS-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7LESS-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GFX7LESS-DPP-NEXT:    buffer_load_dword v1, off, s[36:39], 0
@@ -3135,13 +3135,13 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_agent
 ; GFX7LESS-LABEL: global_atomic_fmin_double_uni_address_uni_value_agent_scope_unsafe:
 ; GFX7LESS:       ; %bb.0:
 ; GFX7LESS-NEXT:    s_movk_i32 s32, 0x800
-; GFX7LESS-NEXT:    s_mov_b32 s48, SCRATCH_RSRC_DWORD0
-; GFX7LESS-NEXT:    s_mov_b32 s49, SCRATCH_RSRC_DWORD1
-; GFX7LESS-NEXT:    s_mov_b32 s50, -1
-; GFX7LESS-NEXT:    s_mov_b32 s51, 0xe8f000
-; GFX7LESS-NEXT:    s_add_u32 s48, s48, s11
-; GFX7LESS-NEXT:    s_addc_u32 s49, s49, 0
-; GFX7LESS-NEXT:    s_mov_b64 s[40:41], s[0:1]
+; GFX7LESS-NEXT:    s_mov_b32 s64, SCRATCH_RSRC_DWORD0
+; GFX7LESS-NEXT:    s_mov_b32 s65, SCRATCH_RSRC_DWORD1
+; GFX7LESS-NEXT:    s_mov_b32 s66, -1
+; GFX7LESS-NEXT:    s_mov_b32 s67, 0xe8f000
+; GFX7LESS-NEXT:    s_add_u32 s64, s64, s11
+; GFX7LESS-NEXT:    s_addc_u32 s65, s65, 0
+; GFX7LESS-NEXT:    s_mov_b64 s[48:49], s[0:1]
 ; GFX7LESS-NEXT:    v_mbcnt_lo_u32_b32_e64 v3, exec_lo, 0
 ; GFX7LESS-NEXT:    v_mbcnt_hi_u32_b32_e32 v3, exec_hi, v3
 ; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
@@ -3149,15 +3149,15 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_agent
 ; GFX7LESS-NEXT:    s_cbranch_execz .LBB6_3
 ; GFX7LESS-NEXT:  ; %bb.1:
 ; GFX7LESS-NEXT:    s_mov_b32 s33, s10
-; GFX7LESS-NEXT:    s_mov_b32 s42, s9
-; GFX7LESS-NEXT:    s_mov_b32 s43, s8
+; GFX7LESS-NEXT:    s_mov_b32 s50, s9
+; GFX7LESS-NEXT:    s_mov_b32 s51, s8
 ; GFX7LESS-NEXT:    s_mov_b64 s[34:35], s[6:7]
 ; GFX7LESS-NEXT:    s_mov_b64 s[36:37], s[4:5]
 ; GFX7LESS-NEXT:    s_mov_b64 s[38:39], s[2:3]
-; GFX7LESS-NEXT:    s_load_dwordx2 s[44:45], s[4:5], 0x9
+; GFX7LESS-NEXT:    s_load_dwordx2 s[52:53], s[4:5], 0x9
 ; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7LESS-NEXT:    s_load_dwordx2 s[0:1], s[44:45], 0x0
-; GFX7LESS-NEXT:    s_mov_b64 s[46:47], 0
+; GFX7LESS-NEXT:    s_load_dwordx2 s[0:1], s[52:53], 0x0
+; GFX7LESS-NEXT:    s_mov_b64 s[54:55], 0
 ; GFX7LESS-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
 ; GFX7LESS-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
 ; GFX7LESS-NEXT:    v_or_b32_e32 v3, v0, v1
@@ -3169,8 +3169,8 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_agent
 ; GFX7LESS-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX7LESS-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7LESS-NEXT:    v_max_f64 v[2:3], v[0:1], v[0:1]
-; GFX7LESS-NEXT:    buffer_store_dword v1, off, s[48:51], 0 offset:4
-; GFX7LESS-NEXT:    buffer_store_dword v0, off, s[48:51], 0
+; GFX7LESS-NEXT:    buffer_store_dword v1, off, s[64:67], 0 offset:4
+; GFX7LESS-NEXT:    buffer_store_dword v0, off, s[64:67], 0
 ; GFX7LESS-NEXT:    s_add_u32 s8, s36, 44
 ; GFX7LESS-NEXT:    s_waitcnt expcnt(0)
 ; GFX7LESS-NEXT:    v_min_f64 v[0:1], v[2:3], 4.0
@@ -3178,8 +3178,8 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_agent
 ; GFX7LESS-NEXT:    s_getpc_b64 s[0:1]
 ; GFX7LESS-NEXT:    s_add_u32 s0, s0, __atomic_compare_exchange at gotpcrel32@lo+4
 ; GFX7LESS-NEXT:    s_addc_u32 s1, s1, __atomic_compare_exchange at gotpcrel32@hi+12
-; GFX7LESS-NEXT:    buffer_store_dword v1, off, s[48:51], 0 offset:12
-; GFX7LESS-NEXT:    buffer_store_dword v0, off, s[48:51], 0 offset:8
+; GFX7LESS-NEXT:    buffer_store_dword v1, off, s[64:67], 0 offset:12
+; GFX7LESS-NEXT:    buffer_store_dword v0, off, s[64:67], 0 offset:8
 ; GFX7LESS-NEXT:    s_load_dwordx2 s[16:17], s[0:1], 0x0
 ; GFX7LESS-NEXT:    s_waitcnt expcnt(0)
 ; GFX7LESS-NEXT:    v_mov_b32_e32 v0, 8
@@ -3188,59 +3188,59 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_agent
 ; GFX7LESS-NEXT:    v_mov_b32_e32 v5, 8
 ; GFX7LESS-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX7LESS-NEXT:    v_mov_b32_e32 v7, 0
-; GFX7LESS-NEXT:    s_mov_b64 s[4:5], s[40:41]
+; GFX7LESS-NEXT:    s_mov_b64 s[4:5], s[48:49]
 ; GFX7LESS-NEXT:    s_mov_b64 s[6:7], s[38:39]
 ; GFX7LESS-NEXT:    s_mov_b64 s[10:11], s[34:35]
-; GFX7LESS-NEXT:    s_mov_b32 s12, s43
-; GFX7LESS-NEXT:    s_mov_b32 s13, s42
+; GFX7LESS-NEXT:    s_mov_b32 s12, s51
+; GFX7LESS-NEXT:    s_mov_b32 s13, s50
 ; GFX7LESS-NEXT:    s_mov_b32 s14, s33
 ; GFX7LESS-NEXT:    v_mov_b32_e32 v31, v40
-; GFX7LESS-NEXT:    s_mov_b64 s[0:1], s[48:49]
-; GFX7LESS-NEXT:    s_mov_b64 s[2:3], s[50:51]
-; GFX7LESS-NEXT:    v_mov_b32_e32 v2, s44
-; GFX7LESS-NEXT:    v_mov_b32_e32 v3, s45
+; GFX7LESS-NEXT:    s_mov_b64 s[0:1], s[64:65]
+; GFX7LESS-NEXT:    s_mov_b64 s[2:3], s[66:67]
+; GFX7LESS-NEXT:    v_mov_b32_e32 v2, s52
+; GFX7LESS-NEXT:    v_mov_b32_e32 v3, s53
 ; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7LESS-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GFX7LESS-NEXT:    v_and_b32_e32 v2, 1, v0
-; GFX7LESS-NEXT:    buffer_load_dword v0, off, s[48:51], 0
-; GFX7LESS-NEXT:    buffer_load_dword v1, off, s[48:51], 0 offset:4
+; GFX7LESS-NEXT:    buffer_load_dword v0, off, s[64:67], 0
+; GFX7LESS-NEXT:    buffer_load_dword v1, off, s[64:67], 0 offset:4
 ; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v2
-; GFX7LESS-NEXT:    s_or_b64 s[46:47], vcc, s[46:47]
-; GFX7LESS-NEXT:    s_andn2_b64 exec, exec, s[46:47]
+; GFX7LESS-NEXT:    s_or_b64 s[54:55], vcc, s[54:55]
+; GFX7LESS-NEXT:    s_andn2_b64 exec, exec, s[54:55]
 ; GFX7LESS-NEXT:    s_cbranch_execnz .LBB6_2
 ; GFX7LESS-NEXT:  .LBB6_3:
 ; GFX7LESS-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: global_atomic_fmin_double_uni_address_uni_value_agent_scope_unsafe:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_mov_b32 s48, SCRATCH_RSRC_DWORD0
-; GFX9-NEXT:    s_mov_b32 s49, SCRATCH_RSRC_DWORD1
-; GFX9-NEXT:    s_mov_b32 s50, -1
-; GFX9-NEXT:    s_mov_b32 s51, 0xe00000
+; GFX9-NEXT:    s_mov_b32 s64, SCRATCH_RSRC_DWORD0
+; GFX9-NEXT:    s_mov_b32 s65, SCRATCH_RSRC_DWORD1
+; GFX9-NEXT:    s_mov_b32 s66, -1
+; GFX9-NEXT:    s_mov_b32 s67, 0xe00000
 ; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v3, exec_lo, 0
-; GFX9-NEXT:    s_add_u32 s48, s48, s11
+; GFX9-NEXT:    s_add_u32 s64, s64, s11
 ; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v3, exec_hi, v3
-; GFX9-NEXT:    s_addc_u32 s49, s49, 0
-; GFX9-NEXT:    s_mov_b64 s[40:41], s[0:1]
+; GFX9-NEXT:    s_addc_u32 s65, s65, 0
+; GFX9-NEXT:    s_mov_b64 s[48:49], s[0:1]
 ; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
 ; GFX9-NEXT:    s_movk_i32 s32, 0x800
 ; GFX9-NEXT:    s_and_saveexec_b64 s[0:1], vcc
 ; GFX9-NEXT:    s_cbranch_execz .LBB6_3
 ; GFX9-NEXT:  ; %bb.1:
-; GFX9-NEXT:    s_load_dwordx2 s[44:45], s[4:5], 0x24
+; GFX9-NEXT:    s_load_dwordx2 s[52:53], s[4:5], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v3, 20, v2
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 10, v1
 ; GFX9-NEXT:    s_mov_b32 s33, s10
-; GFX9-NEXT:    s_mov_b32 s42, s9
+; GFX9-NEXT:    s_mov_b32 s50, s9
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[44:45], 0x0
-; GFX9-NEXT:    s_mov_b32 s43, s8
+; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[52:53], 0x0
+; GFX9-NEXT:    s_mov_b32 s51, s8
 ; GFX9-NEXT:    s_mov_b64 s[34:35], s[6:7]
 ; GFX9-NEXT:    s_mov_b64 s[36:37], s[4:5]
 ; GFX9-NEXT:    s_mov_b64 s[38:39], s[2:3]
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v2, s1
-; GFX9-NEXT:    s_mov_b64 s[46:47], 0
+; GFX9-NEXT:    s_mov_b64 s[54:55], 0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX9-NEXT:    v_or3_b32 v40, v0, v4, v3
 ; GFX9-NEXT:  .LBB6_2: ; %atomicrmw.start
@@ -3253,36 +3253,36 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_agent
 ; GFX9-NEXT:    s_add_u32 s0, s0, __atomic_compare_exchange at gotpcrel32@lo+4
 ; GFX9-NEXT:    s_addc_u32 s1, s1, __atomic_compare_exchange at gotpcrel32@hi+12
 ; GFX9-NEXT:    s_load_dwordx2 s[16:17], s[0:1], 0x0
-; GFX9-NEXT:    s_mov_b64 s[0:1], s[48:49]
-; GFX9-NEXT:    buffer_store_dword v2, off, s[48:51], 0 offset:4
-; GFX9-NEXT:    buffer_store_dword v1, off, s[48:51], 0
-; GFX9-NEXT:    s_mov_b64 s[4:5], s[40:41]
+; GFX9-NEXT:    s_mov_b64 s[0:1], s[64:65]
+; GFX9-NEXT:    buffer_store_dword v2, off, s[64:67], 0 offset:4
+; GFX9-NEXT:    buffer_store_dword v1, off, s[64:67], 0
+; GFX9-NEXT:    s_mov_b64 s[4:5], s[48:49]
 ; GFX9-NEXT:    v_min_f64 v[3:4], v[3:4], 4.0
 ; GFX9-NEXT:    s_mov_b64 s[6:7], s[38:39]
 ; GFX9-NEXT:    s_mov_b64 s[10:11], s[34:35]
-; GFX9-NEXT:    s_mov_b32 s12, s43
-; GFX9-NEXT:    s_mov_b32 s13, s42
+; GFX9-NEXT:    s_mov_b32 s12, s51
+; GFX9-NEXT:    s_mov_b32 s13, s50
 ; GFX9-NEXT:    s_mov_b32 s14, s33
 ; GFX9-NEXT:    v_mov_b32_e32 v31, v40
-; GFX9-NEXT:    s_mov_b64 s[2:3], s[50:51]
-; GFX9-NEXT:    buffer_store_dword v4, off, s[48:51], 0 offset:12
-; GFX9-NEXT:    buffer_store_dword v3, off, s[48:51], 0 offset:8
+; GFX9-NEXT:    s_mov_b64 s[2:3], s[66:67]
+; GFX9-NEXT:    buffer_store_dword v4, off, s[64:67], 0 offset:12
+; GFX9-NEXT:    buffer_store_dword v3, off, s[64:67], 0 offset:8
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 8
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0
-; GFX9-NEXT:    v_mov_b32_e32 v2, s44
-; GFX9-NEXT:    v_mov_b32_e32 v3, s45
+; GFX9-NEXT:    v_mov_b32_e32 v2, s52
+; GFX9-NEXT:    v_mov_b32_e32 v3, s53
 ; GFX9-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v5, 8
 ; GFX9-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v7, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[16:17]
-; GFX9-NEXT:    buffer_load_dword v1, off, s[48:51], 0
-; GFX9-NEXT:    buffer_load_dword v2, off, s[48:51], 0 offset:4
+; GFX9-NEXT:    buffer_load_dword v1, off, s[64:67], 0
+; GFX9-NEXT:    buffer_load_dword v2, off, s[64:67], 0 offset:4
 ; GFX9-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX9-NEXT:    s_or_b64 s[46:47], vcc, s[46:47]
-; GFX9-NEXT:    s_andn2_b64 exec, exec, s[46:47]
+; GFX9-NEXT:    s_or_b64 s[54:55], vcc, s[54:55]
+; GFX9-NEXT:    s_andn2_b64 exec, exec, s[54:55]
 ; GFX9-NEXT:    s_cbranch_execnz .LBB6_2
 ; GFX9-NEXT:  .LBB6_3:
 ; GFX9-NEXT:    s_endpgm
@@ -3290,32 +3290,32 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_agent
 ; GFX1064-LABEL: global_atomic_fmin_double_uni_address_uni_value_agent_scope_unsafe:
 ; GFX1064:       ; %bb.0:
 ; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v3, exec_lo, 0
-; GFX1064-NEXT:    s_mov_b32 s48, SCRATCH_RSRC_DWORD0
-; GFX1064-NEXT:    s_mov_b32 s49, SCRATCH_RSRC_DWORD1
-; GFX1064-NEXT:    s_mov_b32 s50, -1
-; GFX1064-NEXT:    s_mov_b32 s51, 0x31e16000
+; GFX1064-NEXT:    s_mov_b32 s64, SCRATCH_RSRC_DWORD0
+; GFX1064-NEXT:    s_mov_b32 s65, SCRATCH_RSRC_DWORD1
+; GFX1064-NEXT:    s_mov_b32 s66, -1
+; GFX1064-NEXT:    s_mov_b32 s67, 0x31e16000
 ; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v3, exec_hi, v3
-; GFX1064-NEXT:    s_add_u32 s48, s48, s11
-; GFX1064-NEXT:    s_addc_u32 s49, s49, 0
-; GFX1064-NEXT:    s_mov_b64 s[40:41], s[0:1]
+; GFX1064-NEXT:    s_add_u32 s64, s64, s11
+; GFX1064-NEXT:    s_addc_u32 s65, s65, 0
+; GFX1064-NEXT:    s_mov_b64 s[48:49], s[0:1]
 ; GFX1064-NEXT:    s_movk_i32 s32, 0x800
 ; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
 ; GFX1064-NEXT:    s_and_saveexec_b64 s[0:1], vcc
 ; GFX1064-NEXT:    s_cbranch_execz .LBB6_3
 ; GFX1064-NEXT:  ; %bb.1:
-; GFX1064-NEXT:    s_load_dwordx2 s[44:45], s[4:5], 0x24
+; GFX1064-NEXT:    s_load_dwordx2 s[52:53], s[4:5], 0x24
 ; GFX1064-NEXT:    v_lshlrev_b32_e32 v3, 20, v2
 ; GFX1064-NEXT:    v_lshlrev_b32_e32 v4, 10, v1
 ; GFX1064-NEXT:    s_mov_b32 s33, s10
-; GFX1064-NEXT:    s_mov_b32 s42, s9
-; GFX1064-NEXT:    s_mov_b32 s43, s8
+; GFX1064-NEXT:    s_mov_b32 s50, s9
+; GFX1064-NEXT:    s_mov_b32 s51, s8
 ; GFX1064-NEXT:    s_mov_b64 s[34:35], s[6:7]
 ; GFX1064-NEXT:    v_or3_b32 v40, v0, v4, v3
 ; GFX1064-NEXT:    s_mov_b64 s[36:37], s[4:5]
 ; GFX1064-NEXT:    s_mov_b64 s[38:39], s[2:3]
-; GFX1064-NEXT:    s_mov_b64 s[46:47], 0
+; GFX1064-NEXT:    s_mov_b64 s[54:55], 0
 ; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[44:45], 0x0
+; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[52:53], 0x0
 ; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1064-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX1064-NEXT:    v_mov_b32_e32 v1, s0
@@ -3328,38 +3328,38 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_agent
 ; GFX1064-NEXT:    s_getpc_b64 s[0:1]
 ; GFX1064-NEXT:    s_add_u32 s0, s0, __atomic_compare_exchange at gotpcrel32@lo+4
 ; GFX1064-NEXT:    s_addc_u32 s1, s1, __atomic_compare_exchange at gotpcrel32@hi+12
-; GFX1064-NEXT:    buffer_store_dword v2, off, s[48:51], 0 offset:4
-; GFX1064-NEXT:    buffer_store_dword v1, off, s[48:51], 0
+; GFX1064-NEXT:    buffer_store_dword v2, off, s[64:67], 0 offset:4
+; GFX1064-NEXT:    buffer_store_dword v1, off, s[64:67], 0
 ; GFX1064-NEXT:    s_load_dwordx2 s[16:17], s[0:1], 0x0
 ; GFX1064-NEXT:    v_mov_b32_e32 v31, v40
 ; GFX1064-NEXT:    v_mov_b32_e32 v0, 8
 ; GFX1064-NEXT:    v_mov_b32_e32 v1, 0
-; GFX1064-NEXT:    v_mov_b32_e32 v2, s44
+; GFX1064-NEXT:    v_mov_b32_e32 v2, s52
 ; GFX1064-NEXT:    v_mov_b32_e32 v5, 8
 ; GFX1064-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX1064-NEXT:    v_mov_b32_e32 v7, 0
-; GFX1064-NEXT:    s_mov_b64 s[0:1], s[48:49]
-; GFX1064-NEXT:    s_mov_b64 s[4:5], s[40:41]
+; GFX1064-NEXT:    s_mov_b64 s[0:1], s[64:65]
+; GFX1064-NEXT:    s_mov_b64 s[4:5], s[48:49]
 ; GFX1064-NEXT:    s_mov_b64 s[6:7], s[38:39]
 ; GFX1064-NEXT:    s_mov_b64 s[10:11], s[34:35]
-; GFX1064-NEXT:    s_mov_b32 s12, s43
-; GFX1064-NEXT:    s_mov_b32 s13, s42
+; GFX1064-NEXT:    s_mov_b32 s12, s51
+; GFX1064-NEXT:    s_mov_b32 s13, s50
 ; GFX1064-NEXT:    s_mov_b32 s14, s33
-; GFX1064-NEXT:    s_mov_b64 s[2:3], s[50:51]
+; GFX1064-NEXT:    s_mov_b64 s[2:3], s[66:67]
 ; GFX1064-NEXT:    v_min_f64 v[3:4], v[3:4], 4.0
-; GFX1064-NEXT:    buffer_store_dword v4, off, s[48:51], 0 offset:12
-; GFX1064-NEXT:    buffer_store_dword v3, off, s[48:51], 0 offset:8
-; GFX1064-NEXT:    v_mov_b32_e32 v3, s45
+; GFX1064-NEXT:    buffer_store_dword v4, off, s[64:67], 0 offset:12
+; GFX1064-NEXT:    buffer_store_dword v3, off, s[64:67], 0 offset:8
+; GFX1064-NEXT:    v_mov_b32_e32 v3, s53
 ; GFX1064-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1064-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GFX1064-NEXT:    s_clause 0x1
-; GFX1064-NEXT:    buffer_load_dword v1, off, s[48:51], 0
-; GFX1064-NEXT:    buffer_load_dword v2, off, s[48:51], 0 offset:4
+; GFX1064-NEXT:    buffer_load_dword v1, off, s[64:67], 0
+; GFX1064-NEXT:    buffer_load_dword v2, off, s[64:67], 0 offset:4
 ; GFX1064-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX1064-NEXT:    s_or_b64 s[46:47], vcc, s[46:47]
-; GFX1064-NEXT:    s_andn2_b64 exec, exec, s[46:47]
+; GFX1064-NEXT:    s_or_b64 s[54:55], vcc, s[54:55]
+; GFX1064-NEXT:    s_andn2_b64 exec, exec, s[54:55]
 ; GFX1064-NEXT:    s_cbranch_execnz .LBB6_2
 ; GFX1064-NEXT:  .LBB6_3:
 ; GFX1064-NEXT:    s_endpgm
@@ -3367,31 +3367,31 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_agent
 ; GFX1032-LABEL: global_atomic_fmin_double_uni_address_uni_value_agent_scope_unsafe:
 ; GFX1032:       ; %bb.0:
 ; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v3, exec_lo, 0
-; GFX1032-NEXT:    s_mov_b32 s48, SCRATCH_RSRC_DWORD0
-; GFX1032-NEXT:    s_mov_b32 s49, SCRATCH_RSRC_DWORD1
-; GFX1032-NEXT:    s_mov_b32 s50, -1
-; GFX1032-NEXT:    s_mov_b32 s51, 0x31c16000
+; GFX1032-NEXT:    s_mov_b32 s64, SCRATCH_RSRC_DWORD0
+; GFX1032-NEXT:    s_mov_b32 s65, SCRATCH_RSRC_DWORD1
+; GFX1032-NEXT:    s_mov_b32 s66, -1
+; GFX1032-NEXT:    s_mov_b32 s67, 0x31c16000
 ; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v3
-; GFX1032-NEXT:    s_add_u32 s48, s48, s11
-; GFX1032-NEXT:    s_addc_u32 s49, s49, 0
-; GFX1032-NEXT:    s_mov_b64 s[40:41], s[0:1]
-; GFX1032-NEXT:    s_mov_b32 s46, 0
+; GFX1032-NEXT:    s_add_u32 s64, s64, s11
+; GFX1032-NEXT:    s_addc_u32 s65, s65, 0
+; GFX1032-NEXT:    s_mov_b64 s[48:49], s[0:1]
+; GFX1032-NEXT:    s_mov_b32 s54, 0
 ; GFX1032-NEXT:    s_movk_i32 s32, 0x400
 ; GFX1032-NEXT:    s_and_saveexec_b32 s0, vcc_lo
 ; GFX1032-NEXT:    s_cbranch_execz .LBB6_3
 ; GFX1032-NEXT:  ; %bb.1:
-; GFX1032-NEXT:    s_load_dwordx2 s[44:45], s[4:5], 0x24
+; GFX1032-NEXT:    s_load_dwordx2 s[52:53], s[4:5], 0x24
 ; GFX1032-NEXT:    v_lshlrev_b32_e32 v3, 20, v2
 ; GFX1032-NEXT:    v_lshlrev_b32_e32 v4, 10, v1
 ; GFX1032-NEXT:    s_mov_b32 s33, s10
-; GFX1032-NEXT:    s_mov_b32 s42, s9
-; GFX1032-NEXT:    s_mov_b32 s43, s8
+; GFX1032-NEXT:    s_mov_b32 s50, s9
+; GFX1032-NEXT:    s_mov_b32 s51, s8
 ; GFX1032-NEXT:    s_mov_b64 s[34:35], s[6:7]
 ; GFX1032-NEXT:    v_or3_b32 v40, v0, v4, v3
 ; GFX1032-NEXT:    s_mov_b64 s[36:37], s[4:5]
 ; GFX1032-NEXT:    s_mov_b64 s[38:39], s[2:3]
 ; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[44:45], 0x0
+; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[52:53], 0x0
 ; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1032-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX1032-NEXT:    v_mov_b32_e32 v1, s0
@@ -3404,38 +3404,38 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_agent
 ; GFX1032-NEXT:    s_getpc_b64 s[0:1]
 ; GFX1032-NEXT:    s_add_u32 s0, s0, __atomic_compare_exchange at gotpcrel32@lo+4
 ; GFX1032-NEXT:    s_addc_u32 s1, s1, __atomic_compare_exchange at gotpcrel32@hi+12
-; GFX1032-NEXT:    buffer_store_dword v2, off, s[48:51], 0 offset:4
-; GFX1032-NEXT:    buffer_store_dword v1, off, s[48:51], 0
+; GFX1032-NEXT:    buffer_store_dword v2, off, s[64:67], 0 offset:4
+; GFX1032-NEXT:    buffer_store_dword v1, off, s[64:67], 0
 ; GFX1032-NEXT:    s_load_dwordx2 s[16:17], s[0:1], 0x0
 ; GFX1032-NEXT:    v_mov_b32_e32 v31, v40
 ; GFX1032-NEXT:    v_mov_b32_e32 v0, 8
 ; GFX1032-NEXT:    v_mov_b32_e32 v1, 0
-; GFX1032-NEXT:    v_mov_b32_e32 v2, s44
+; GFX1032-NEXT:    v_mov_b32_e32 v2, s52
 ; GFX1032-NEXT:    v_mov_b32_e32 v5, 8
 ; GFX1032-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX1032-NEXT:    v_mov_b32_e32 v7, 0
-; GFX1032-NEXT:    s_mov_b64 s[0:1], s[48:49]
-; GFX1032-NEXT:    s_mov_b64 s[4:5], s[40:41]
+; GFX1032-NEXT:    s_mov_b64 s[0:1], s[64:65]
+; GFX1032-NEXT:    s_mov_b64 s[4:5], s[48:49]
 ; GFX1032-NEXT:    s_mov_b64 s[6:7], s[38:39]
 ; GFX1032-NEXT:    s_mov_b64 s[10:11], s[34:35]
-; GFX1032-NEXT:    s_mov_b32 s12, s43
-; GFX1032-NEXT:    s_mov_b32 s13, s42
+; GFX1032-NEXT:    s_mov_b32 s12, s51
+; GFX1032-NEXT:    s_mov_b32 s13, s50
 ; GFX1032-NEXT:    s_mov_b32 s14, s33
-; GFX1032-NEXT:    s_mov_b64 s[2:3], s[50:51]
+; GFX1032-NEXT:    s_mov_b64 s[2:3], s[66:67]
 ; GFX1032-NEXT:    v_min_f64 v[3:4], v[3:4], 4.0
-; GFX1032-NEXT:    buffer_store_dword v4, off, s[48:51], 0 offset:12
-; GFX1032-NEXT:    buffer_store_dword v3, off, s[48:51], 0 offset:8
-; GFX1032-NEXT:    v_mov_b32_e32 v3, s45
+; GFX1032-NEXT:    buffer_store_dword v4, off, s[64:67], 0 offset:12
+; GFX1032-NEXT:    buffer_store_dword v3, off, s[64:67], 0 offset:8
+; GFX1032-NEXT:    v_mov_b32_e32 v3, s53
 ; GFX1032-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1032-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GFX1032-NEXT:    s_clause 0x1
-; GFX1032-NEXT:    buffer_load_dword v1, off, s[48:51], 0
-; GFX1032-NEXT:    buffer_load_dword v2, off, s[48:51], 0 offset:4
+; GFX1032-NEXT:    buffer_load_dword v1, off, s[64:67], 0
+; GFX1032-NEXT:    buffer_load_dword v2, off, s[64:67], 0 offset:4
 ; GFX1032-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX1032-NEXT:    s_or_b32 s46, vcc_lo, s46
-; GFX1032-NEXT:    s_andn2_b32 exec_lo, exec_lo, s46
+; GFX1032-NEXT:    s_or_b32 s54, vcc_lo, s54
+; GFX1032-NEXT:    s_andn2_b32 exec_lo, exec_lo, s54
 ; GFX1032-NEXT:    s_cbranch_execnz .LBB6_2
 ; GFX1032-NEXT:  .LBB6_3:
 ; GFX1032-NEXT:    s_endpgm
@@ -3444,7 +3444,7 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_agent
 ; GFX1164:       ; %bb.0:
 ; GFX1164-NEXT:    v_mov_b32_e32 v40, v0
 ; GFX1164-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1164-NEXT:    s_mov_b64 s[40:41], s[0:1]
+; GFX1164-NEXT:    s_mov_b64 s[48:49], s[0:1]
 ; GFX1164-NEXT:    s_mov_b32 s32, 32
 ; GFX1164-NEXT:    s_mov_b64 s[0:1], exec
 ; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
@@ -3452,16 +3452,16 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_agent
 ; GFX1164-NEXT:    v_cmpx_eq_u32_e32 0, v0
 ; GFX1164-NEXT:    s_cbranch_execz .LBB6_3
 ; GFX1164-NEXT:  ; %bb.1:
-; GFX1164-NEXT:    s_load_b64 s[44:45], s[4:5], 0x24
+; GFX1164-NEXT:    s_load_b64 s[52:53], s[4:5], 0x24
 ; GFX1164-NEXT:    s_mov_b32 s33, s10
-; GFX1164-NEXT:    s_mov_b32 s42, s9
-; GFX1164-NEXT:    s_mov_b32 s43, s8
+; GFX1164-NEXT:    s_mov_b32 s50, s9
+; GFX1164-NEXT:    s_mov_b32 s51, s8
 ; GFX1164-NEXT:    s_mov_b64 s[34:35], s[6:7]
 ; GFX1164-NEXT:    s_mov_b64 s[36:37], s[4:5]
 ; GFX1164-NEXT:    s_mov_b64 s[38:39], s[2:3]
-; GFX1164-NEXT:    s_mov_b64 s[46:47], 0
+; GFX1164-NEXT:    s_mov_b64 s[54:55], 0
 ; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT:    s_load_b64 s[0:1], s[44:45], 0x0
+; GFX1164-NEXT:    s_load_b64 s[0:1], s[52:53], 0x0
 ; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1164-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX1164-NEXT:    v_mov_b32_e32 v1, s0
@@ -3483,18 +3483,18 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_agent
 ; GFX1164-NEXT:    v_mov_b32_e32 v5, 8
 ; GFX1164-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX1164-NEXT:    v_mov_b32_e32 v7, 0
-; GFX1164-NEXT:    s_mov_b64 s[4:5], s[40:41]
+; GFX1164-NEXT:    s_mov_b64 s[4:5], s[48:49]
 ; GFX1164-NEXT:    s_mov_b64 s[6:7], s[38:39]
 ; GFX1164-NEXT:    s_mov_b64 s[10:11], s[34:35]
-; GFX1164-NEXT:    s_mov_b32 s12, s43
-; GFX1164-NEXT:    s_mov_b32 s13, s42
+; GFX1164-NEXT:    s_mov_b32 s12, s51
+; GFX1164-NEXT:    s_mov_b32 s13, s50
 ; GFX1164-NEXT:    s_mov_b32 s14, s33
 ; GFX1164-NEXT:    v_min_f64 v[3:4], v[3:4], 4.0
 ; GFX1164-NEXT:    scratch_store_b64 off, v[1:2], off
 ; GFX1164-NEXT:    v_mov_b32_e32 v1, 0
-; GFX1164-NEXT:    v_mov_b32_e32 v2, s44
+; GFX1164-NEXT:    v_mov_b32_e32 v2, s52
 ; GFX1164-NEXT:    scratch_store_b64 off, v[3:4], off offset:8
-; GFX1164-NEXT:    v_mov_b32_e32 v3, s45
+; GFX1164-NEXT:    v_mov_b32_e32 v3, s53
 ; GFX1164-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1164-NEXT:    s_swappc_b64 s[30:31], s[0:1]
@@ -3502,8 +3502,8 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_agent
 ; GFX1164-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
 ; GFX1164-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX1164-NEXT:    s_or_b64 s[46:47], vcc, s[46:47]
-; GFX1164-NEXT:    s_and_not1_b64 exec, exec, s[46:47]
+; GFX1164-NEXT:    s_or_b64 s[54:55], vcc, s[54:55]
+; GFX1164-NEXT:    s_and_not1_b64 exec, exec, s[54:55]
 ; GFX1164-NEXT:    s_cbranch_execnz .LBB6_2
 ; GFX1164-NEXT:  .LBB6_3:
 ; GFX1164-NEXT:    s_set_inst_prefetch_distance 0x2
@@ -3513,22 +3513,22 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_agent
 ; GFX1132:       ; %bb.0:
 ; GFX1132-NEXT:    v_mov_b32_e32 v40, v0
 ; GFX1132-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1132-NEXT:    s_mov_b64 s[40:41], s[0:1]
-; GFX1132-NEXT:    s_mov_b32 s46, 0
+; GFX1132-NEXT:    s_mov_b64 s[48:49], s[0:1]
+; GFX1132-NEXT:    s_mov_b32 s54, 0
 ; GFX1132-NEXT:    s_mov_b32 s32, 32
 ; GFX1132-NEXT:    s_mov_b32 s0, exec_lo
 ; GFX1132-NEXT:    v_cmpx_eq_u32_e32 0, v0
 ; GFX1132-NEXT:    s_cbranch_execz .LBB6_3
 ; GFX1132-NEXT:  ; %bb.1:
-; GFX1132-NEXT:    s_load_b64 s[44:45], s[4:5], 0x24
+; GFX1132-NEXT:    s_load_b64 s[52:53], s[4:5], 0x24
 ; GFX1132-NEXT:    s_mov_b32 s33, s15
-; GFX1132-NEXT:    s_mov_b32 s42, s14
-; GFX1132-NEXT:    s_mov_b32 s43, s13
+; GFX1132-NEXT:    s_mov_b32 s50, s14
+; GFX1132-NEXT:    s_mov_b32 s51, s13
 ; GFX1132-NEXT:    s_mov_b64 s[34:35], s[6:7]
 ; GFX1132-NEXT:    s_mov_b64 s[36:37], s[4:5]
 ; GFX1132-NEXT:    s_mov_b64 s[38:39], s[2:3]
 ; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT:    s_load_b64 s[0:1], s[44:45], 0x0
+; GFX1132-NEXT:    s_load_b64 s[0:1], s[52:53], 0x0
 ; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132-NEXT:    v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v1, s0
 ; GFX1132-NEXT:    s_set_inst_prefetch_distance 0x1
@@ -3547,26 +3547,26 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_agent
 ; GFX1132-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
 ; GFX1132-NEXT:    v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0
 ; GFX1132-NEXT:    v_mov_b32_e32 v7, 0
-; GFX1132-NEXT:    s_mov_b64 s[4:5], s[40:41]
+; GFX1132-NEXT:    s_mov_b64 s[4:5], s[48:49]
 ; GFX1132-NEXT:    s_mov_b64 s[6:7], s[38:39]
 ; GFX1132-NEXT:    s_mov_b64 s[10:11], s[34:35]
-; GFX1132-NEXT:    s_mov_b32 s12, s43
-; GFX1132-NEXT:    s_mov_b32 s13, s42
+; GFX1132-NEXT:    s_mov_b32 s12, s51
+; GFX1132-NEXT:    s_mov_b32 s13, s50
 ; GFX1132-NEXT:    s_mov_b32 s14, s33
 ; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_4)
 ; GFX1132-NEXT:    v_min_f64 v[3:4], v[3:4], 4.0
 ; GFX1132-NEXT:    scratch_store_b64 off, v[1:2], off
-; GFX1132-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s44
+; GFX1132-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s52
 ; GFX1132-NEXT:    scratch_store_b64 off, v[3:4], off offset:8
-; GFX1132-NEXT:    v_dual_mov_b32 v3, s45 :: v_dual_mov_b32 v4, 0
+; GFX1132-NEXT:    v_dual_mov_b32 v3, s53 :: v_dual_mov_b32 v4, 0
 ; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX1132-NEXT:    scratch_load_b64 v[1:2], off, off
 ; GFX1132-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
 ; GFX1132-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX1132-NEXT:    s_or_b32 s46, vcc_lo, s46
-; GFX1132-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s46
+; GFX1132-NEXT:    s_or_b32 s54, vcc_lo, s54
+; GFX1132-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s54
 ; GFX1132-NEXT:    s_cbranch_execnz .LBB6_2
 ; GFX1132-NEXT:  .LBB6_3:
 ; GFX1132-NEXT:    s_set_inst_prefetch_distance 0x2
@@ -3575,13 +3575,13 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_agent
 ; GFX7LESS-DPP-LABEL: global_atomic_fmin_double_uni_address_uni_value_agent_scope_unsafe:
 ; GFX7LESS-DPP:       ; %bb.0:
 ; GFX7LESS-DPP-NEXT:    s_movk_i32 s32, 0x800
-; GFX7LESS-DPP-NEXT:    s_mov_b32 s48, SCRATCH_RSRC_DWORD0
-; GFX7LESS-DPP-NEXT:    s_mov_b32 s49, SCRATCH_RSRC_DWORD1
-; GFX7LESS-DPP-NEXT:    s_mov_b32 s50, -1
-; GFX7LESS-DPP-NEXT:    s_mov_b32 s51, 0xe8f000
-; GFX7LESS-DPP-NEXT:    s_add_u32 s48, s48, s11
-; GFX7LESS-DPP-NEXT:    s_addc_u32 s49, s49, 0
-; GFX7LESS-DPP-NEXT:    s_mov_b64 s[40:41], s[0:1]
+; GFX7LESS-DPP-NEXT:    s_mov_b32 s64, SCRATCH_RSRC_DWORD0
+; GFX7LESS-DPP-NEXT:    s_mov_b32 s65, SCRATCH_RSRC_DWORD1
+; GFX7LESS-DPP-NEXT:    s_mov_b32 s66, -1
+; GFX7LESS-DPP-NEXT:    s_mov_b32 s67, 0xe8f000
+; GFX7LESS-DPP-NEXT:    s_add_u32 s64, s64, s11
+; GFX7LESS-DPP-NEXT:    s_addc_u32 s65, s65, 0
+; GFX7LESS-DPP-NEXT:    s_mov_b64 s[48:49], s[0:1]
 ; GFX7LESS-DPP-NEXT:    v_mbcnt_lo_u32_b32_e64 v3, exec_lo, 0
 ; GFX7LESS-DPP-NEXT:    v_mbcnt_hi_u32_b32_e32 v3, exec_hi, v3
 ; GFX7LESS-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
@@ -3589,15 +3589,15 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_agent
 ; GFX7LESS-DPP-NEXT:    s_cbranch_execz .LBB6_3
 ; GFX7LESS-DPP-NEXT:  ; %bb.1:
 ; GFX7LESS-DPP-NEXT:    s_mov_b32 s33, s10
-; GFX7LESS-DPP-NEXT:    s_mov_b32 s42, s9
-; GFX7LESS-DPP-NEXT:    s_mov_b32 s43, s8
+; GFX7LESS-DPP-NEXT:    s_mov_b32 s50, s9
+; GFX7LESS-DPP-NEXT:    s_mov_b32 s51, s8
 ; GFX7LESS-DPP-NEXT:    s_mov_b64 s[34:35], s[6:7]
 ; GFX7LESS-DPP-NEXT:    s_mov_b64 s[36:37], s[4:5]
 ; GFX7LESS-DPP-NEXT:    s_mov_b64 s[38:39], s[2:3]
-; GFX7LESS-DPP-NEXT:    s_load_dwordx2 s[44:45], s[4:5], 0x9
+; GFX7LESS-DPP-NEXT:    s_load_dwordx2 s[52:53], s[4:5], 0x9
 ; GFX7LESS-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7LESS-DPP-NEXT:    s_load_dwordx2 s[0:1], s[44:45], 0x0
-; GFX7LESS-DPP-NEXT:    s_mov_b64 s[46:47], 0
+; GFX7LESS-DPP-NEXT:    s_load_dwordx2 s[0:1], s[52:53], 0x0
+; GFX7LESS-DPP-NEXT:    s_mov_b64 s[54:55], 0
 ; GFX7LESS-DPP-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
 ; GFX7LESS-DPP-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
 ; GFX7LESS-DPP-NEXT:    v_or_b32_e32 v3, v0, v1
@@ -3609,8 +3609,8 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_agent
 ; GFX7LESS-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX7LESS-DPP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7LESS-DPP-NEXT:    v_max_f64 v[2:3], v[0:1], v[0:1]
-; GFX7LESS-DPP-NEXT:    buffer_store_dword v1, off, s[48:51], 0 offset:4
-; GFX7LESS-DPP-NEXT:    buffer_store_dword v0, off, s[48:51], 0
+; GFX7LESS-DPP-NEXT:    buffer_store_dword v1, off, s[64:67], 0 offset:4
+; GFX7LESS-DPP-NEXT:    buffer_store_dword v0, off, s[64:67], 0
 ; GFX7LESS-DPP-NEXT:    s_add_u32 s8, s36, 44
 ; GFX7LESS-DPP-NEXT:    s_waitcnt expcnt(0)
 ; GFX7LESS-DPP-NEXT:    v_min_f64 v[0:1], v[2:3], 4.0
@@ -3618,8 +3618,8 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_agent
 ; GFX7LESS-DPP-NEXT:    s_getpc_b64 s[0:1]
 ; GFX7LESS-DPP-NEXT:    s_add_u32 s0, s0, __atomic_compare_exchange at gotpcrel32@lo+4
 ; GFX7LESS-DPP-NEXT:    s_addc_u32 s1, s1, __atomic_compare_exchange at gotpcrel32@hi+12
-; GFX7LESS-DPP-NEXT:    buffer_store_dword v1, off, s[48:51], 0 offset:12
-; GFX7LESS-DPP-NEXT:    buffer_store_dword v0, off, s[48:51], 0 offset:8
+; GFX7LESS-DPP-NEXT:    buffer_store_dword v1, off, s[64:67], 0 offset:12
+; GFX7LESS-DPP-NEXT:    buffer_store_dword v0, off, s[64:67], 0 offset:8
 ; GFX7LESS-DPP-NEXT:    s_load_dwordx2 s[16:17], s[0:1], 0x0
 ; GFX7LESS-DPP-NEXT:    s_waitcnt expcnt(0)
 ; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v0, 8
@@ -3628,59 +3628,59 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_agent
 ; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v5, 8
 ; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v7, 0
-; GFX7LESS-DPP-NEXT:    s_mov_b64 s[4:5], s[40:41]
+; GFX7LESS-DPP-NEXT:    s_mov_b64 s[4:5], s[48:49]
 ; GFX7LESS-DPP-NEXT:    s_mov_b64 s[6:7], s[38:39]
 ; GFX7LESS-DPP-NEXT:    s_mov_b64 s[10:11], s[34:35]
-; GFX7LESS-DPP-NEXT:    s_mov_b32 s12, s43
-; GFX7LESS-DPP-NEXT:    s_mov_b32 s13, s42
+; GFX7LESS-DPP-NEXT:    s_mov_b32 s12, s51
+; GFX7LESS-DPP-NEXT:    s_mov_b32 s13, s50
 ; GFX7LESS-DPP-NEXT:    s_mov_b32 s14, s33
 ; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v31, v40
-; GFX7LESS-DPP-NEXT:    s_mov_b64 s[0:1], s[48:49]
-; GFX7LESS-DPP-NEXT:    s_mov_b64 s[2:3], s[50:51]
-; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v2, s44
-; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v3, s45
+; GFX7LESS-DPP-NEXT:    s_mov_b64 s[0:1], s[64:65]
+; GFX7LESS-DPP-NEXT:    s_mov_b64 s[2:3], s[66:67]
+; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v2, s52
+; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v3, s53
 ; GFX7LESS-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7LESS-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GFX7LESS-DPP-NEXT:    v_and_b32_e32 v2, 1, v0
-; GFX7LESS-DPP-NEXT:    buffer_load_dword v0, off, s[48:51], 0
-; GFX7LESS-DPP-NEXT:    buffer_load_dword v1, off, s[48:51], 0 offset:4
+; GFX7LESS-DPP-NEXT:    buffer_load_dword v0, off, s[64:67], 0
+; GFX7LESS-DPP-NEXT:    buffer_load_dword v1, off, s[64:67], 0 offset:4
 ; GFX7LESS-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v2
-; GFX7LESS-DPP-NEXT:    s_or_b64 s[46:47], vcc, s[46:47]
-; GFX7LESS-DPP-NEXT:    s_andn2_b64 exec, exec, s[46:47]
+; GFX7LESS-DPP-NEXT:    s_or_b64 s[54:55], vcc, s[54:55]
+; GFX7LESS-DPP-NEXT:    s_andn2_b64 exec, exec, s[54:55]
 ; GFX7LESS-DPP-NEXT:    s_cbranch_execnz .LBB6_2
 ; GFX7LESS-DPP-NEXT:  .LBB6_3:
 ; GFX7LESS-DPP-NEXT:    s_endpgm
 ;
 ; GFX9-DPP-LABEL: global_atomic_fmin_double_uni_address_uni_value_agent_scope_unsafe:
 ; GFX9-DPP:       ; %bb.0:
-; GFX9-DPP-NEXT:    s_mov_b32 s48, SCRATCH_RSRC_DWORD0
-; GFX9-DPP-NEXT:    s_mov_b32 s49, SCRATCH_RSRC_DWORD1
-; GFX9-DPP-NEXT:    s_mov_b32 s50, -1
-; GFX9-DPP-NEXT:    s_mov_b32 s51, 0xe00000
+; GFX9-DPP-NEXT:    s_mov_b32 s64, SCRATCH_RSRC_DWORD0
+; GFX9-DPP-NEXT:    s_mov_b32 s65, SCRATCH_RSRC_DWORD1
+; GFX9-DPP-NEXT:    s_mov_b32 s66, -1
+; GFX9-DPP-NEXT:    s_mov_b32 s67, 0xe00000
 ; GFX9-DPP-NEXT:    v_mbcnt_lo_u32_b32 v3, exec_lo, 0
-; GFX9-DPP-NEXT:    s_add_u32 s48, s48, s11
+; GFX9-DPP-NEXT:    s_add_u32 s64, s64, s11
 ; GFX9-DPP-NEXT:    v_mbcnt_hi_u32_b32 v3, exec_hi, v3
-; GFX9-DPP-NEXT:    s_addc_u32 s49, s49, 0
-; GFX9-DPP-NEXT:    s_mov_b64 s[40:41], s[0:1]
+; GFX9-DPP-NEXT:    s_addc_u32 s65, s65, 0
+; GFX9-DPP-NEXT:    s_mov_b64 s[48:49], s[0:1]
 ; GFX9-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
 ; GFX9-DPP-NEXT:    s_movk_i32 s32, 0x800
 ; GFX9-DPP-NEXT:    s_and_saveexec_b64 s[0:1], vcc
 ; GFX9-DPP-NEXT:    s_cbranch_execz .LBB6_3
 ; GFX9-DPP-NEXT:  ; %bb.1:
-; GFX9-DPP-NEXT:    s_load_dwordx2 s[44:45], s[4:5], 0x24
+; GFX9-DPP-NEXT:    s_load_dwordx2 s[52:53], s[4:5], 0x24
 ; GFX9-DPP-NEXT:    v_lshlrev_b32_e32 v3, 20, v2
 ; GFX9-DPP-NEXT:    v_lshlrev_b32_e32 v4, 10, v1
 ; GFX9-DPP-NEXT:    s_mov_b32 s33, s10
-; GFX9-DPP-NEXT:    s_mov_b32 s42, s9
+; GFX9-DPP-NEXT:    s_mov_b32 s50, s9
 ; GFX9-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-DPP-NEXT:    s_load_dwordx2 s[0:1], s[44:45], 0x0
-; GFX9-DPP-NEXT:    s_mov_b32 s43, s8
+; GFX9-DPP-NEXT:    s_load_dwordx2 s[0:1], s[52:53], 0x0
+; GFX9-DPP-NEXT:    s_mov_b32 s51, s8
 ; GFX9-DPP-NEXT:    s_mov_b64 s[34:35], s[6:7]
 ; GFX9-DPP-NEXT:    s_mov_b64 s[36:37], s[4:5]
 ; GFX9-DPP-NEXT:    s_mov_b64 s[38:39], s[2:3]
 ; GFX9-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-DPP-NEXT:    v_mov_b32_e32 v2, s1
-; GFX9-DPP-NEXT:    s_mov_b64 s[46:47], 0
+; GFX9-DPP-NEXT:    s_mov_b64 s[54:55], 0
 ; GFX9-DPP-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX9-DPP-NEXT:    v_or3_b32 v40, v0, v4, v3
 ; GFX9-DPP-NEXT:  .LBB6_2: ; %atomicrmw.start
@@ -3693,36 +3693,36 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_agent
 ; GFX9-DPP-NEXT:    s_add_u32 s0, s0, __atomic_compare_exchange at gotpcrel32@lo+4
 ; GFX9-DPP-NEXT:    s_addc_u32 s1, s1, __atomic_compare_exchange at gotpcrel32@hi+12
 ; GFX9-DPP-NEXT:    s_load_dwordx2 s[16:17], s[0:1], 0x0
-; GFX9-DPP-NEXT:    s_mov_b64 s[0:1], s[48:49]
-; GFX9-DPP-NEXT:    buffer_store_dword v2, off, s[48:51], 0 offset:4
-; GFX9-DPP-NEXT:    buffer_store_dword v1, off, s[48:51], 0
-; GFX9-DPP-NEXT:    s_mov_b64 s[4:5], s[40:41]
+; GFX9-DPP-NEXT:    s_mov_b64 s[0:1], s[64:65]
+; GFX9-DPP-NEXT:    buffer_store_dword v2, off, s[64:67], 0 offset:4
+; GFX9-DPP-NEXT:    buffer_store_dword v1, off, s[64:67], 0
+; GFX9-DPP-NEXT:    s_mov_b64 s[4:5], s[48:49]
 ; GFX9-DPP-NEXT:    v_min_f64 v[3:4], v[3:4], 4.0
 ; GFX9-DPP-NEXT:    s_mov_b64 s[6:7], s[38:39]
 ; GFX9-DPP-NEXT:    s_mov_b64 s[10:11], s[34:35]
-; GFX9-DPP-NEXT:    s_mov_b32 s12, s43
-; GFX9-DPP-NEXT:    s_mov_b32 s13, s42
+; GFX9-DPP-NEXT:    s_mov_b32 s12, s51
+; GFX9-DPP-NEXT:    s_mov_b32 s13, s50
 ; GFX9-DPP-NEXT:    s_mov_b32 s14, s33
 ; GFX9-DPP-NEXT:    v_mov_b32_e32 v31, v40
-; GFX9-DPP-NEXT:    s_mov_b64 s[2:3], s[50:51]
-; GFX9-DPP-NEXT:    buffer_store_dword v4, off, s[48:51], 0 offset:12
-; GFX9-DPP-NEXT:    buffer_store_dword v3, off, s[48:51], 0 offset:8
+; GFX9-DPP-NEXT:    s_mov_b64 s[2:3], s[66:67]
+; GFX9-DPP-NEXT:    buffer_store_dword v4, off, s[64:67], 0 offset:12
+; GFX9-DPP-NEXT:    buffer_store_dword v3, off, s[64:67], 0 offset:8
 ; GFX9-DPP-NEXT:    v_mov_b32_e32 v0, 8
 ; GFX9-DPP-NEXT:    v_mov_b32_e32 v1, 0
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v2, s44
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v3, s45
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v2, s52
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v3, s53
 ; GFX9-DPP-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX9-DPP-NEXT:    v_mov_b32_e32 v5, 8
 ; GFX9-DPP-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX9-DPP-NEXT:    v_mov_b32_e32 v7, 0
 ; GFX9-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
-; GFX9-DPP-NEXT:    buffer_load_dword v1, off, s[48:51], 0
-; GFX9-DPP-NEXT:    buffer_load_dword v2, off, s[48:51], 0 offset:4
+; GFX9-DPP-NEXT:    buffer_load_dword v1, off, s[64:67], 0
+; GFX9-DPP-NEXT:    buffer_load_dword v2, off, s[64:67], 0 offset:4
 ; GFX9-DPP-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX9-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX9-DPP-NEXT:    s_or_b64 s[46:47], vcc, s[46:47]
-; GFX9-DPP-NEXT:    s_andn2_b64 exec, exec, s[46:47]
+; GFX9-DPP-NEXT:    s_or_b64 s[54:55], vcc, s[54:55]
+; GFX9-DPP-NEXT:    s_andn2_b64 exec, exec, s[54:55]
 ; GFX9-DPP-NEXT:    s_cbranch_execnz .LBB6_2
 ; GFX9-DPP-NEXT:  .LBB6_3:
 ; GFX9-DPP-NEXT:    s_endpgm
@@ -3730,32 +3730,32 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_agent
 ; GFX1064-DPP-LABEL: global_atomic_fmin_double_uni_address_uni_value_agent_scope_unsafe:
 ; GFX1064-DPP:       ; %bb.0:
 ; GFX1064-DPP-NEXT:    v_mbcnt_lo_u32_b32 v3, exec_lo, 0
-; GFX1064-DPP-NEXT:    s_mov_b32 s48, SCRATCH_RSRC_DWORD0
-; GFX1064-DPP-NEXT:    s_mov_b32 s49, SCRATCH_RSRC_DWORD1
-; GFX1064-DPP-NEXT:    s_mov_b32 s50, -1
-; GFX1064-DPP-NEXT:    s_mov_b32 s51, 0x31e16000
+; GFX1064-DPP-NEXT:    s_mov_b32 s64, SCRATCH_RSRC_DWORD0
+; GFX1064-DPP-NEXT:    s_mov_b32 s65, SCRATCH_RSRC_DWORD1
+; GFX1064-DPP-NEXT:    s_mov_b32 s66, -1
+; GFX1064-DPP-NEXT:    s_mov_b32 s67, 0x31e16000
 ; GFX1064-DPP-NEXT:    v_mbcnt_hi_u32_b32 v3, exec_hi, v3
-; GFX1064-DPP-NEXT:    s_add_u32 s48, s48, s11
-; GFX1064-DPP-NEXT:    s_addc_u32 s49, s49, 0
-; GFX1064-DPP-NEXT:    s_mov_b64 s[40:41], s[0:1]
+; GFX1064-DPP-NEXT:    s_add_u32 s64, s64, s11
+; GFX1064-DPP-NEXT:    s_addc_u32 s65, s65, 0
+; GFX1064-DPP-NEXT:    s_mov_b64 s[48:49], s[0:1]
 ; GFX1064-DPP-NEXT:    s_movk_i32 s32, 0x800
 ; GFX1064-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
 ; GFX1064-DPP-NEXT:    s_and_saveexec_b64 s[0:1], vcc
 ; GFX1064-DPP-NEXT:    s_cbranch_execz .LBB6_3
 ; GFX1064-DPP-NEXT:  ; %bb.1:
-; GFX1064-DPP-NEXT:    s_load_dwordx2 s[44:45], s[4:5], 0x24
+; GFX1064-DPP-NEXT:    s_load_dwordx2 s[52:53], s[4:5], 0x24
 ; GFX1064-DPP-NEXT:    v_lshlrev_b32_e32 v3, 20, v2
 ; GFX1064-DPP-NEXT:    v_lshlrev_b32_e32 v4, 10, v1
 ; GFX1064-DPP-NEXT:    s_mov_b32 s33, s10
-; GFX1064-DPP-NEXT:    s_mov_b32 s42, s9
-; GFX1064-DPP-NEXT:    s_mov_b32 s43, s8
+; GFX1064-DPP-NEXT:    s_mov_b32 s50, s9
+; GFX1064-DPP-NEXT:    s_mov_b32 s51, s8
 ; GFX1064-DPP-NEXT:    s_mov_b64 s[34:35], s[6:7]
 ; GFX1064-DPP-NEXT:    v_or3_b32 v40, v0, v4, v3
 ; GFX1064-DPP-NEXT:    s_mov_b64 s[36:37], s[4:5]
 ; GFX1064-DPP-NEXT:    s_mov_b64 s[38:39], s[2:3]
-; GFX1064-DPP-NEXT:    s_mov_b64 s[46:47], 0
+; GFX1064-DPP-NEXT:    s_mov_b64 s[54:55], 0
 ; GFX1064-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1064-DPP-NEXT:    s_load_dwordx2 s[0:1], s[44:45], 0x0
+; GFX1064-DPP-NEXT:    s_load_dwordx2 s[0:1], s[52:53], 0x0
 ; GFX1064-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1064-DPP-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX1064-DPP-NEXT:    v_mov_b32_e32 v1, s0
@@ -3768,38 +3768,38 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_agent
 ; GFX1064-DPP-NEXT:    s_getpc_b64 s[0:1]
 ; GFX1064-DPP-NEXT:    s_add_u32 s0, s0, __atomic_compare_exchange at gotpcrel32@lo+4
 ; GFX1064-DPP-NEXT:    s_addc_u32 s1, s1, __atomic_compare_exchange at gotpcrel32@hi+12
-; GFX1064-DPP-NEXT:    buffer_store_dword v2, off, s[48:51], 0 offset:4
-; GFX1064-DPP-NEXT:    buffer_store_dword v1, off, s[48:51], 0
+; GFX1064-DPP-NEXT:    buffer_store_dword v2, off, s[64:67], 0 offset:4
+; GFX1064-DPP-NEXT:    buffer_store_dword v1, off, s[64:67], 0
 ; GFX1064-DPP-NEXT:    s_load_dwordx2 s[16:17], s[0:1], 0x0
 ; GFX1064-DPP-NEXT:    v_mov_b32_e32 v31, v40
 ; GFX1064-DPP-NEXT:    v_mov_b32_e32 v0, 8
 ; GFX1064-DPP-NEXT:    v_mov_b32_e32 v1, 0
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v2, s44
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v2, s52
 ; GFX1064-DPP-NEXT:    v_mov_b32_e32 v5, 8
 ; GFX1064-DPP-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX1064-DPP-NEXT:    v_mov_b32_e32 v7, 0
-; GFX1064-DPP-NEXT:    s_mov_b64 s[0:1], s[48:49]
-; GFX1064-DPP-NEXT:    s_mov_b64 s[4:5], s[40:41]
+; GFX1064-DPP-NEXT:    s_mov_b64 s[0:1], s[64:65]
+; GFX1064-DPP-NEXT:    s_mov_b64 s[4:5], s[48:49]
 ; GFX1064-DPP-NEXT:    s_mov_b64 s[6:7], s[38:39]
 ; GFX1064-DPP-NEXT:    s_mov_b64 s[10:11], s[34:35]
-; GFX1064-DPP-NEXT:    s_mov_b32 s12, s43
-; GFX1064-DPP-NEXT:    s_mov_b32 s13, s42
+; GFX1064-DPP-NEXT:    s_mov_b32 s12, s51
+; GFX1064-DPP-NEXT:    s_mov_b32 s13, s50
 ; GFX1064-DPP-NEXT:    s_mov_b32 s14, s33
-; GFX1064-DPP-NEXT:    s_mov_b64 s[2:3], s[50:51]
+; GFX1064-DPP-NEXT:    s_mov_b64 s[2:3], s[66:67]
 ; GFX1064-DPP-NEXT:    v_min_f64 v[3:4], v[3:4], 4.0
-; GFX1064-DPP-NEXT:    buffer_store_dword v4, off, s[48:51], 0 offset:12
-; GFX1064-DPP-NEXT:    buffer_store_dword v3, off, s[48:51], 0 offset:8
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v3, s45
+; GFX1064-DPP-NEXT:    buffer_store_dword v4, off, s[64:67], 0 offset:12
+; GFX1064-DPP-NEXT:    buffer_store_dword v3, off, s[64:67], 0 offset:8
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v3, s53
 ; GFX1064-DPP-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX1064-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1064-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GFX1064-DPP-NEXT:    s_clause 0x1
-; GFX1064-DPP-NEXT:    buffer_load_dword v1, off, s[48:51], 0
-; GFX1064-DPP-NEXT:    buffer_load_dword v2, off, s[48:51], 0 offset:4
+; GFX1064-DPP-NEXT:    buffer_load_dword v1, off, s[64:67], 0
+; GFX1064-DPP-NEXT:    buffer_load_dword v2, off, s[64:67], 0 offset:4
 ; GFX1064-DPP-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX1064-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX1064-DPP-NEXT:    s_or_b64 s[46:47], vcc, s[46:47]
-; GFX1064-DPP-NEXT:    s_andn2_b64 exec, exec, s[46:47]
+; GFX1064-DPP-NEXT:    s_or_b64 s[54:55], vcc, s[54:55]
+; GFX1064-DPP-NEXT:    s_andn2_b64 exec, exec, s[54:55]
 ; GFX1064-DPP-NEXT:    s_cbranch_execnz .LBB6_2
 ; GFX1064-DPP-NEXT:  .LBB6_3:
 ; GFX1064-DPP-NEXT:    s_endpgm
@@ -3807,31 +3807,31 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_agent
 ; GFX1032-DPP-LABEL: global_atomic_fmin_double_uni_address_uni_value_agent_scope_unsafe:
 ; GFX1032-DPP:       ; %bb.0:
 ; GFX1032-DPP-NEXT:    v_mbcnt_lo_u32_b32 v3, exec_lo, 0
-; GFX1032-DPP-NEXT:    s_mov_b32 s48, SCRATCH_RSRC_DWORD0
-; GFX1032-DPP-NEXT:    s_mov_b32 s49, SCRATCH_RSRC_DWORD1
-; GFX1032-DPP-NEXT:    s_mov_b32 s50, -1
-; GFX1032-DPP-NEXT:    s_mov_b32 s51, 0x31c16000
+; GFX1032-DPP-NEXT:    s_mov_b32 s64, SCRATCH_RSRC_DWORD0
+; GFX1032-DPP-NEXT:    s_mov_b32 s65, SCRATCH_RSRC_DWORD1
+; GFX1032-DPP-NEXT:    s_mov_b32 s66, -1
+; GFX1032-DPP-NEXT:    s_mov_b32 s67, 0x31c16000
 ; GFX1032-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v3
-; GFX1032-DPP-NEXT:    s_add_u32 s48, s48, s11
-; GFX1032-DPP-NEXT:    s_addc_u32 s49, s49, 0
-; GFX1032-DPP-NEXT:    s_mov_b64 s[40:41], s[0:1]
-; GFX1032-DPP-NEXT:    s_mov_b32 s46, 0
+; GFX1032-DPP-NEXT:    s_add_u32 s64, s64, s11
+; GFX1032-DPP-NEXT:    s_addc_u32 s65, s65, 0
+; GFX1032-DPP-NEXT:    s_mov_b64 s[48:49], s[0:1]
+; GFX1032-DPP-NEXT:    s_mov_b32 s54, 0
 ; GFX1032-DPP-NEXT:    s_movk_i32 s32, 0x400
 ; GFX1032-DPP-NEXT:    s_and_saveexec_b32 s0, vcc_lo
 ; GFX1032-DPP-NEXT:    s_cbranch_execz .LBB6_3
 ; GFX1032-DPP-NEXT:  ; %bb.1:
-; GFX1032-DPP-NEXT:    s_load_dwordx2 s[44:45], s[4:5], 0x24
+; GFX1032-DPP-NEXT:    s_load_dwordx2 s[52:53], s[4:5], 0x24
 ; GFX1032-DPP-NEXT:    v_lshlrev_b32_e32 v3, 20, v2
 ; GFX1032-DPP-NEXT:    v_lshlrev_b32_e32 v4, 10, v1
 ; GFX1032-DPP-NEXT:    s_mov_b32 s33, s10
-; GFX1032-DPP-NEXT:    s_mov_b32 s42, s9
-; GFX1032-DPP-NEXT:    s_mov_b32 s43, s8
+; GFX1032-DPP-NEXT:    s_mov_b32 s50, s9
+; GFX1032-DPP-NEXT:    s_mov_b32 s51, s8
 ; GFX1032-DPP-NEXT:    s_mov_b64 s[34:35], s[6:7]
 ; GFX1032-DPP-NEXT:    v_or3_b32 v40, v0, v4, v3
 ; GFX1032-DPP-NEXT:    s_mov_b64 s[36:37], s[4:5]
 ; GFX1032-DPP-NEXT:    s_mov_b64 s[38:39], s[2:3]
 ; GFX1032-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT:    s_load_dwordx2 s[0:1], s[44:45], 0x0
+; GFX1032-DPP-NEXT:    s_load_dwordx2 s[0:1], s[52:53], 0x0
 ; GFX1032-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v1, s0
@@ -3844,38 +3844,38 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_agent
 ; GFX1032-DPP-NEXT:    s_getpc_b64 s[0:1]
 ; GFX1032-DPP-NEXT:    s_add_u32 s0, s0, __atomic_compare_exchange at gotpcrel32@lo+4
 ; GFX1032-DPP-NEXT:    s_addc_u32 s1, s1, __atomic_compare_exchange at gotpcrel32@hi+12
-; GFX1032-DPP-NEXT:    buffer_store_dword v2, off, s[48:51], 0 offset:4
-; GFX1032-DPP-NEXT:    buffer_store_dword v1, off, s[48:51], 0
+; GFX1032-DPP-NEXT:    buffer_store_dword v2, off, s[64:67], 0 offset:4
+; GFX1032-DPP-NEXT:    buffer_store_dword v1, off, s[64:67], 0
 ; GFX1032-DPP-NEXT:    s_load_dwordx2 s[16:17], s[0:1], 0x0
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v31, v40
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v0, 8
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v1, 0
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v2, s44
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v2, s52
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v5, 8
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v7, 0
-; GFX1032-DPP-NEXT:    s_mov_b64 s[0:1], s[48:49]
-; GFX1032-DPP-NEXT:    s_mov_b64 s[4:5], s[40:41]
+; GFX1032-DPP-NEXT:    s_mov_b64 s[0:1], s[64:65]
+; GFX1032-DPP-NEXT:    s_mov_b64 s[4:5], s[48:49]
 ; GFX1032-DPP-NEXT:    s_mov_b64 s[6:7], s[38:39]
 ; GFX1032-DPP-NEXT:    s_mov_b64 s[10:11], s[34:35]
-; GFX1032-DPP-NEXT:    s_mov_b32 s12, s43
-; GFX1032-DPP-NEXT:    s_mov_b32 s13, s42
+; GFX1032-DPP-NEXT:    s_mov_b32 s12, s51
+; GFX1032-DPP-NEXT:    s_mov_b32 s13, s50
 ; GFX1032-DPP-NEXT:    s_mov_b32 s14, s33
-; GFX1032-DPP-NEXT:    s_mov_b64 s[2:3], s[50:51]
+; GFX1032-DPP-NEXT:    s_mov_b64 s[2:3], s[66:67]
 ; GFX1032-DPP-NEXT:    v_min_f64 v[3:4], v[3:4], 4.0
-; GFX1032-DPP-NEXT:    buffer_store_dword v4, off, s[48:51], 0 offset:12
-; GFX1032-DPP-NEXT:    buffer_store_dword v3, off, s[48:51], 0 offset:8
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v3, s45
+; GFX1032-DPP-NEXT:    buffer_store_dword v4, off, s[64:67], 0 offset:12
+; GFX1032-DPP-NEXT:    buffer_store_dword v3, off, s[64:67], 0 offset:8
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v3, s53
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX1032-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1032-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GFX1032-DPP-NEXT:    s_clause 0x1
-; GFX1032-DPP-NEXT:    buffer_load_dword v1, off, s[48:51], 0
-; GFX1032-DPP-NEXT:    buffer_load_dword v2, off, s[48:51], 0 offset:4
+; GFX1032-DPP-NEXT:    buffer_load_dword v1, off, s[64:67], 0
+; GFX1032-DPP-NEXT:    buffer_load_dword v2, off, s[64:67], 0 offset:4
 ; GFX1032-DPP-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX1032-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX1032-DPP-NEXT:    s_or_b32 s46, vcc_lo, s46
-; GFX1032-DPP-NEXT:    s_andn2_b32 exec_lo, exec_lo, s46
+; GFX1032-DPP-NEXT:    s_or_b32 s54, vcc_lo, s54
+; GFX1032-DPP-NEXT:    s_andn2_b32 exec_lo, exec_lo, s54
 ; GFX1032-DPP-NEXT:    s_cbranch_execnz .LBB6_2
 ; GFX1032-DPP-NEXT:  .LBB6_3:
 ; GFX1032-DPP-NEXT:    s_endpgm
@@ -3884,7 +3884,7 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_agent
 ; GFX1164-DPP:       ; %bb.0:
 ; GFX1164-DPP-NEXT:    v_mov_b32_e32 v40, v0
 ; GFX1164-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1164-DPP-NEXT:    s_mov_b64 s[40:41], s[0:1]
+; GFX1164-DPP-NEXT:    s_mov_b64 s[48:49], s[0:1]
 ; GFX1164-DPP-NEXT:    s_mov_b32 s32, 32
 ; GFX1164-DPP-NEXT:    s_mov_b64 s[0:1], exec
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
@@ -3892,16 +3892,16 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_agent
 ; GFX1164-DPP-NEXT:    v_cmpx_eq_u32_e32 0, v0
 ; GFX1164-DPP-NEXT:    s_cbranch_execz .LBB6_3
 ; GFX1164-DPP-NEXT:  ; %bb.1:
-; GFX1164-DPP-NEXT:    s_load_b64 s[44:45], s[4:5], 0x24
+; GFX1164-DPP-NEXT:    s_load_b64 s[52:53], s[4:5], 0x24
 ; GFX1164-DPP-NEXT:    s_mov_b32 s33, s10
-; GFX1164-DPP-NEXT:    s_mov_b32 s42, s9
-; GFX1164-DPP-NEXT:    s_mov_b32 s43, s8
+; GFX1164-DPP-NEXT:    s_mov_b32 s50, s9
+; GFX1164-DPP-NEXT:    s_mov_b32 s51, s8
 ; GFX1164-DPP-NEXT:    s_mov_b64 s[34:35], s[6:7]
 ; GFX1164-DPP-NEXT:    s_mov_b64 s[36:37], s[4:5]
 ; GFX1164-DPP-NEXT:    s_mov_b64 s[38:39], s[2:3]
-; GFX1164-DPP-NEXT:    s_mov_b64 s[46:47], 0
+; GFX1164-DPP-NEXT:    s_mov_b64 s[54:55], 0
 ; GFX1164-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT:    s_load_b64 s[0:1], s[44:45], 0x0
+; GFX1164-DPP-NEXT:    s_load_b64 s[0:1], s[52:53], 0x0
 ; GFX1164-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1164-DPP-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX1164-DPP-NEXT:    v_mov_b32_e32 v1, s0
@@ -3923,18 +3923,18 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_agent
 ; GFX1164-DPP-NEXT:    v_mov_b32_e32 v5, 8
 ; GFX1164-DPP-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX1164-DPP-NEXT:    v_mov_b32_e32 v7, 0
-; GFX1164-DPP-NEXT:    s_mov_b64 s[4:5], s[40:41]
+; GFX1164-DPP-NEXT:    s_mov_b64 s[4:5], s[48:49]
 ; GFX1164-DPP-NEXT:    s_mov_b64 s[6:7], s[38:39]
 ; GFX1164-DPP-NEXT:    s_mov_b64 s[10:11], s[34:35]
-; GFX1164-DPP-NEXT:    s_mov_b32 s12, s43
-; GFX1164-DPP-NEXT:    s_mov_b32 s13, s42
+; GFX1164-DPP-NEXT:    s_mov_b32 s12, s51
+; GFX1164-DPP-NEXT:    s_mov_b32 s13, s50
 ; GFX1164-DPP-NEXT:    s_mov_b32 s14, s33
 ; GFX1164-DPP-NEXT:    v_min_f64 v[3:4], v[3:4], 4.0
 ; GFX1164-DPP-NEXT:    scratch_store_b64 off, v[1:2], off
 ; GFX1164-DPP-NEXT:    v_mov_b32_e32 v1, 0
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v2, s44
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v2, s52
 ; GFX1164-DPP-NEXT:    scratch_store_b64 off, v[3:4], off offset:8
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v3, s45
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v3, s53
 ; GFX1164-DPP-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX1164-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1164-DPP-NEXT:    s_swappc_b64 s[30:31], s[0:1]
@@ -3942,8 +3942,8 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_agent
 ; GFX1164-DPP-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
 ; GFX1164-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX1164-DPP-NEXT:    s_or_b64 s[46:47], vcc, s[46:47]
-; GFX1164-DPP-NEXT:    s_and_not1_b64 exec, exec, s[46:47]
+; GFX1164-DPP-NEXT:    s_or_b64 s[54:55], vcc, s[54:55]
+; GFX1164-DPP-NEXT:    s_and_not1_b64 exec, exec, s[54:55]
 ; GFX1164-DPP-NEXT:    s_cbranch_execnz .LBB6_2
 ; GFX1164-DPP-NEXT:  .LBB6_3:
 ; GFX1164-DPP-NEXT:    s_set_inst_prefetch_distance 0x2
@@ -3953,22 +3953,22 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_agent
 ; GFX1132-DPP:       ; %bb.0:
 ; GFX1132-DPP-NEXT:    v_mov_b32_e32 v40, v0
 ; GFX1132-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1132-DPP-NEXT:    s_mov_b64 s[40:41], s[0:1]
-; GFX1132-DPP-NEXT:    s_mov_b32 s46, 0
+; GFX1132-DPP-NEXT:    s_mov_b64 s[48:49], s[0:1]
+; GFX1132-DPP-NEXT:    s_mov_b32 s54, 0
 ; GFX1132-DPP-NEXT:    s_mov_b32 s32, 32
 ; GFX1132-DPP-NEXT:    s_mov_b32 s0, exec_lo
 ; GFX1132-DPP-NEXT:    v_cmpx_eq_u32_e32 0, v0
 ; GFX1132-DPP-NEXT:    s_cbranch_execz .LBB6_3
 ; GFX1132-DPP-NEXT:  ; %bb.1:
-; GFX1132-DPP-NEXT:    s_load_b64 s[44:45], s[4:5], 0x24
+; GFX1132-DPP-NEXT:    s_load_b64 s[52:53], s[4:5], 0x24
 ; GFX1132-DPP-NEXT:    s_mov_b32 s33, s15
-; GFX1132-DPP-NEXT:    s_mov_b32 s42, s14
-; GFX1132-DPP-NEXT:    s_mov_b32 s43, s13
+; GFX1132-DPP-NEXT:    s_mov_b32 s50, s14
+; GFX1132-DPP-NEXT:    s_mov_b32 s51, s13
 ; GFX1132-DPP-NEXT:    s_mov_b64 s[34:35], s[6:7]
 ; GFX1132-DPP-NEXT:    s_mov_b64 s[36:37], s[4:5]
 ; GFX1132-DPP-NEXT:    s_mov_b64 s[38:39], s[2:3]
 ; GFX1132-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT:    s_load_b64 s[0:1], s[44:45], 0x0
+; GFX1132-DPP-NEXT:    s_load_b64 s[0:1], s[52:53], 0x0
 ; GFX1132-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132-DPP-NEXT:    v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v1, s0
 ; GFX1132-DPP-NEXT:    s_set_inst_prefetch_distance 0x1
@@ -3987,26 +3987,26 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_agent
 ; GFX1132-DPP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
 ; GFX1132-DPP-NEXT:    v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0
 ; GFX1132-DPP-NEXT:    v_mov_b32_e32 v7, 0
-; GFX1132-DPP-NEXT:    s_mov_b64 s[4:5], s[40:41]
+; GFX1132-DPP-NEXT:    s_mov_b64 s[4:5], s[48:49]
 ; GFX1132-DPP-NEXT:    s_mov_b64 s[6:7], s[38:39]
 ; GFX1132-DPP-NEXT:    s_mov_b64 s[10:11], s[34:35]
-; GFX1132-DPP-NEXT:    s_mov_b32 s12, s43
-; GFX1132-DPP-NEXT:    s_mov_b32 s13, s42
+; GFX1132-DPP-NEXT:    s_mov_b32 s12, s51
+; GFX1132-DPP-NEXT:    s_mov_b32 s13, s50
 ; GFX1132-DPP-NEXT:    s_mov_b32 s14, s33
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_4)
 ; GFX1132-DPP-NEXT:    v_min_f64 v[3:4], v[3:4], 4.0
 ; GFX1132-DPP-NEXT:    scratch_store_b64 off, v[1:2], off
-; GFX1132-DPP-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s44
+; GFX1132-DPP-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s52
 ; GFX1132-DPP-NEXT:    scratch_store_b64 off, v[3:4], off offset:8
-; GFX1132-DPP-NEXT:    v_dual_mov_b32 v3, s45 :: v_dual_mov_b32 v4, 0
+; GFX1132-DPP-NEXT:    v_dual_mov_b32 v3, s53 :: v_dual_mov_b32 v4, 0
 ; GFX1132-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132-DPP-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX1132-DPP-NEXT:    scratch_load_b64 v[1:2], off, off
 ; GFX1132-DPP-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
 ; GFX1132-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX1132-DPP-NEXT:    s_or_b32 s46, vcc_lo, s46
-; GFX1132-DPP-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s46
+; GFX1132-DPP-NEXT:    s_or_b32 s54, vcc_lo, s54
+; GFX1132-DPP-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s54
 ; GFX1132-DPP-NEXT:    s_cbranch_execnz .LBB6_2
 ; GFX1132-DPP-NEXT:  .LBB6_3:
 ; GFX1132-DPP-NEXT:    s_set_inst_prefetch_distance 0x2
@@ -4019,19 +4019,19 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent
 ; GFX7LESS-LABEL: global_atomic_fmin_double_uni_address_div_value_agent_scope_unsafe:
 ; GFX7LESS:       ; %bb.0:
 ; GFX7LESS-NEXT:    s_movk_i32 s32, 0x800
-; GFX7LESS-NEXT:    s_mov_b32 s48, SCRATCH_RSRC_DWORD0
-; GFX7LESS-NEXT:    s_mov_b32 s49, SCRATCH_RSRC_DWORD1
-; GFX7LESS-NEXT:    s_mov_b32 s50, -1
-; GFX7LESS-NEXT:    s_mov_b32 s51, 0xe8f000
-; GFX7LESS-NEXT:    s_add_u32 s48, s48, s11
-; GFX7LESS-NEXT:    s_addc_u32 s49, s49, 0
+; GFX7LESS-NEXT:    s_mov_b32 s64, SCRATCH_RSRC_DWORD0
+; GFX7LESS-NEXT:    s_mov_b32 s65, SCRATCH_RSRC_DWORD1
+; GFX7LESS-NEXT:    s_mov_b32 s66, -1
+; GFX7LESS-NEXT:    s_mov_b32 s67, 0xe8f000
+; GFX7LESS-NEXT:    s_add_u32 s64, s64, s11
+; GFX7LESS-NEXT:    s_addc_u32 s65, s65, 0
 ; GFX7LESS-NEXT:    s_mov_b32 s33, s10
-; GFX7LESS-NEXT:    s_mov_b32 s42, s9
-; GFX7LESS-NEXT:    s_mov_b32 s43, s8
+; GFX7LESS-NEXT:    s_mov_b32 s50, s9
+; GFX7LESS-NEXT:    s_mov_b32 s51, s8
 ; GFX7LESS-NEXT:    s_mov_b64 s[34:35], s[6:7]
 ; GFX7LESS-NEXT:    s_mov_b64 s[36:37], s[4:5]
 ; GFX7LESS-NEXT:    s_mov_b64 s[38:39], s[2:3]
-; GFX7LESS-NEXT:    s_mov_b64 s[40:41], s[0:1]
+; GFX7LESS-NEXT:    s_mov_b64 s[48:49], s[0:1]
 ; GFX7LESS-NEXT:    s_add_u32 s8, s36, 44
 ; GFX7LESS-NEXT:    s_addc_u32 s9, s37, 0
 ; GFX7LESS-NEXT:    s_getpc_b64 s[0:1]
@@ -4042,15 +4042,15 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent
 ; GFX7LESS-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
 ; GFX7LESS-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GFX7LESS-NEXT:    v_or_b32_e32 v40, v0, v2
-; GFX7LESS-NEXT:    s_mov_b64 s[4:5], s[40:41]
+; GFX7LESS-NEXT:    s_mov_b64 s[4:5], s[48:49]
 ; GFX7LESS-NEXT:    s_mov_b64 s[6:7], s[2:3]
 ; GFX7LESS-NEXT:    s_mov_b64 s[10:11], s[34:35]
-; GFX7LESS-NEXT:    s_mov_b32 s12, s43
-; GFX7LESS-NEXT:    s_mov_b32 s13, s42
+; GFX7LESS-NEXT:    s_mov_b32 s12, s51
+; GFX7LESS-NEXT:    s_mov_b32 s13, s50
 ; GFX7LESS-NEXT:    s_mov_b32 s14, s33
 ; GFX7LESS-NEXT:    v_mov_b32_e32 v31, v40
-; GFX7LESS-NEXT:    s_mov_b64 s[0:1], s[48:49]
-; GFX7LESS-NEXT:    s_mov_b64 s[2:3], s[50:51]
+; GFX7LESS-NEXT:    s_mov_b64 s[0:1], s[64:65]
+; GFX7LESS-NEXT:    s_mov_b64 s[2:3], s[66:67]
 ; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7LESS-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GFX7LESS-NEXT:    s_mov_b64 s[0:1], exec
@@ -4077,19 +4077,19 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent
 ; GFX7LESS-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
 ; GFX7LESS-NEXT:    s_cbranch_execz .LBB7_5
 ; GFX7LESS-NEXT:  ; %bb.3:
-; GFX7LESS-NEXT:    s_load_dwordx2 s[44:45], s[36:37], 0x9
-; GFX7LESS-NEXT:    s_mov_b32 s47, 0xf000
-; GFX7LESS-NEXT:    s_mov_b32 s46, -1
+; GFX7LESS-NEXT:    s_load_dwordx2 s[52:53], s[36:37], 0x9
+; GFX7LESS-NEXT:    s_mov_b32 s55, 0xf000
+; GFX7LESS-NEXT:    s_mov_b32 s54, -1
 ; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7LESS-NEXT:    buffer_load_dwordx2 v[0:1], off, s[44:47], 0
-; GFX7LESS-NEXT:    s_mov_b64 s[46:47], 0
+; GFX7LESS-NEXT:    buffer_load_dwordx2 v[0:1], off, s[52:55], 0
+; GFX7LESS-NEXT:    s_mov_b64 s[54:55], 0
 ; GFX7LESS-NEXT:    v_max_f64 v[41:42], v[2:3], v[2:3]
 ; GFX7LESS-NEXT:  .LBB7_4: ; %atomicrmw.start
 ; GFX7LESS-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX7LESS-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7LESS-NEXT:    v_max_f64 v[2:3], v[0:1], v[0:1]
-; GFX7LESS-NEXT:    buffer_store_dword v1, off, s[48:51], 0 offset:4
-; GFX7LESS-NEXT:    buffer_store_dword v0, off, s[48:51], 0
+; GFX7LESS-NEXT:    buffer_store_dword v1, off, s[64:67], 0 offset:4
+; GFX7LESS-NEXT:    buffer_store_dword v0, off, s[64:67], 0
 ; GFX7LESS-NEXT:    s_add_u32 s8, s36, 44
 ; GFX7LESS-NEXT:    s_waitcnt expcnt(0)
 ; GFX7LESS-NEXT:    v_min_f64 v[0:1], v[2:3], v[41:42]
@@ -4097,8 +4097,8 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent
 ; GFX7LESS-NEXT:    s_getpc_b64 s[0:1]
 ; GFX7LESS-NEXT:    s_add_u32 s0, s0, __atomic_compare_exchange at gotpcrel32@lo+4
 ; GFX7LESS-NEXT:    s_addc_u32 s1, s1, __atomic_compare_exchange at gotpcrel32@hi+12
-; GFX7LESS-NEXT:    buffer_store_dword v1, off, s[48:51], 0 offset:12
-; GFX7LESS-NEXT:    buffer_store_dword v0, off, s[48:51], 0 offset:8
+; GFX7LESS-NEXT:    buffer_store_dword v1, off, s[64:67], 0 offset:12
+; GFX7LESS-NEXT:    buffer_store_dword v0, off, s[64:67], 0 offset:8
 ; GFX7LESS-NEXT:    s_load_dwordx2 s[16:17], s[0:1], 0x0
 ; GFX7LESS-NEXT:    s_waitcnt expcnt(0)
 ; GFX7LESS-NEXT:    v_mov_b32_e32 v0, 8
@@ -4107,43 +4107,43 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent
 ; GFX7LESS-NEXT:    v_mov_b32_e32 v5, 8
 ; GFX7LESS-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX7LESS-NEXT:    v_mov_b32_e32 v7, 0
-; GFX7LESS-NEXT:    s_mov_b64 s[4:5], s[40:41]
+; GFX7LESS-NEXT:    s_mov_b64 s[4:5], s[48:49]
 ; GFX7LESS-NEXT:    s_mov_b64 s[6:7], s[38:39]
 ; GFX7LESS-NEXT:    s_mov_b64 s[10:11], s[34:35]
-; GFX7LESS-NEXT:    s_mov_b32 s12, s43
-; GFX7LESS-NEXT:    s_mov_b32 s13, s42
+; GFX7LESS-NEXT:    s_mov_b32 s12, s51
+; GFX7LESS-NEXT:    s_mov_b32 s13, s50
 ; GFX7LESS-NEXT:    s_mov_b32 s14, s33
 ; GFX7LESS-NEXT:    v_mov_b32_e32 v31, v40
-; GFX7LESS-NEXT:    s_mov_b64 s[0:1], s[48:49]
-; GFX7LESS-NEXT:    s_mov_b64 s[2:3], s[50:51]
-; GFX7LESS-NEXT:    v_mov_b32_e32 v2, s44
-; GFX7LESS-NEXT:    v_mov_b32_e32 v3, s45
+; GFX7LESS-NEXT:    s_mov_b64 s[0:1], s[64:65]
+; GFX7LESS-NEXT:    s_mov_b64 s[2:3], s[66:67]
+; GFX7LESS-NEXT:    v_mov_b32_e32 v2, s52
+; GFX7LESS-NEXT:    v_mov_b32_e32 v3, s53
 ; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7LESS-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GFX7LESS-NEXT:    v_and_b32_e32 v2, 1, v0
-; GFX7LESS-NEXT:    buffer_load_dword v0, off, s[48:51], 0
-; GFX7LESS-NEXT:    buffer_load_dword v1, off, s[48:51], 0 offset:4
+; GFX7LESS-NEXT:    buffer_load_dword v0, off, s[64:67], 0
+; GFX7LESS-NEXT:    buffer_load_dword v1, off, s[64:67], 0 offset:4
 ; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v2
-; GFX7LESS-NEXT:    s_or_b64 s[46:47], vcc, s[46:47]
-; GFX7LESS-NEXT:    s_andn2_b64 exec, exec, s[46:47]
+; GFX7LESS-NEXT:    s_or_b64 s[54:55], vcc, s[54:55]
+; GFX7LESS-NEXT:    s_andn2_b64 exec, exec, s[54:55]
 ; GFX7LESS-NEXT:    s_cbranch_execnz .LBB7_4
 ; GFX7LESS-NEXT:  .LBB7_5:
 ; GFX7LESS-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: global_atomic_fmin_double_uni_address_div_value_agent_scope_unsafe:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_mov_b32 s48, SCRATCH_RSRC_DWORD0
-; GFX9-NEXT:    s_mov_b32 s49, SCRATCH_RSRC_DWORD1
-; GFX9-NEXT:    s_mov_b32 s50, -1
-; GFX9-NEXT:    s_mov_b32 s51, 0xe00000
-; GFX9-NEXT:    s_add_u32 s48, s48, s11
-; GFX9-NEXT:    s_addc_u32 s49, s49, 0
+; GFX9-NEXT:    s_mov_b32 s64, SCRATCH_RSRC_DWORD0
+; GFX9-NEXT:    s_mov_b32 s65, SCRATCH_RSRC_DWORD1
+; GFX9-NEXT:    s_mov_b32 s66, -1
+; GFX9-NEXT:    s_mov_b32 s67, 0xe00000
+; GFX9-NEXT:    s_add_u32 s64, s64, s11
+; GFX9-NEXT:    s_addc_u32 s65, s65, 0
 ; GFX9-NEXT:    s_mov_b64 s[36:37], s[4:5]
-; GFX9-NEXT:    s_mov_b32 s43, s8
+; GFX9-NEXT:    s_mov_b32 s51, s8
 ; GFX9-NEXT:    s_add_u32 s8, s36, 44
-; GFX9-NEXT:    s_mov_b32 s42, s9
+; GFX9-NEXT:    s_mov_b32 s50, s9
 ; GFX9-NEXT:    s_addc_u32 s9, s37, 0
-; GFX9-NEXT:    s_mov_b64 s[40:41], s[0:1]
+; GFX9-NEXT:    s_mov_b64 s[48:49], s[0:1]
 ; GFX9-NEXT:    s_getpc_b64 s[0:1]
 ; GFX9-NEXT:    s_add_u32 s0, s0, div.double.value at gotpcrel32@lo+4
 ; GFX9-NEXT:    s_addc_u32 s1, s1, div.double.value at gotpcrel32@hi+12
@@ -4155,14 +4155,14 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent
 ; GFX9-NEXT:    s_mov_b64 s[38:39], s[2:3]
 ; GFX9-NEXT:    v_or3_b32 v40, v0, v1, v2
 ; GFX9-NEXT:    s_mov_b64 s[6:7], s[2:3]
-; GFX9-NEXT:    s_mov_b64 s[0:1], s[48:49]
-; GFX9-NEXT:    s_mov_b64 s[4:5], s[40:41]
+; GFX9-NEXT:    s_mov_b64 s[0:1], s[64:65]
+; GFX9-NEXT:    s_mov_b64 s[4:5], s[48:49]
 ; GFX9-NEXT:    s_mov_b64 s[10:11], s[34:35]
-; GFX9-NEXT:    s_mov_b32 s12, s43
-; GFX9-NEXT:    s_mov_b32 s13, s42
+; GFX9-NEXT:    s_mov_b32 s12, s51
+; GFX9-NEXT:    s_mov_b32 s13, s50
 ; GFX9-NEXT:    s_mov_b32 s14, s33
 ; GFX9-NEXT:    v_mov_b32_e32 v31, v40
-; GFX9-NEXT:    s_mov_b64 s[2:3], s[50:51]
+; GFX9-NEXT:    s_mov_b64 s[2:3], s[66:67]
 ; GFX9-NEXT:    s_movk_i32 s32, 0x800
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[16:17]
@@ -4189,12 +4189,12 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent
 ; GFX9-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
 ; GFX9-NEXT:    s_cbranch_execz .LBB7_5
 ; GFX9-NEXT:  ; %bb.3:
-; GFX9-NEXT:    s_load_dwordx2 s[44:45], s[36:37], 0x24
+; GFX9-NEXT:    s_load_dwordx2 s[52:53], s[36:37], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    v_max_f64 v[41:42], v[2:3], v[2:3]
-; GFX9-NEXT:    s_mov_b64 s[46:47], 0
+; GFX9-NEXT:    s_mov_b64 s[54:55], 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dwordx2 v[4:5], v0, s[44:45]
+; GFX9-NEXT:    global_load_dwordx2 v[4:5], v0, s[52:53]
 ; GFX9-NEXT:  .LBB7_4: ; %atomicrmw.start
 ; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
@@ -4205,54 +4205,54 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent
 ; GFX9-NEXT:    s_add_u32 s0, s0, __atomic_compare_exchange at gotpcrel32@lo+4
 ; GFX9-NEXT:    s_addc_u32 s1, s1, __atomic_compare_exchange at gotpcrel32@hi+12
 ; GFX9-NEXT:    s_load_dwordx2 s[16:17], s[0:1], 0x0
-; GFX9-NEXT:    s_mov_b64 s[0:1], s[48:49]
-; GFX9-NEXT:    buffer_store_dword v5, off, s[48:51], 0 offset:4
-; GFX9-NEXT:    buffer_store_dword v4, off, s[48:51], 0
-; GFX9-NEXT:    s_mov_b64 s[4:5], s[40:41]
+; GFX9-NEXT:    s_mov_b64 s[0:1], s[64:65]
+; GFX9-NEXT:    buffer_store_dword v5, off, s[64:67], 0 offset:4
+; GFX9-NEXT:    buffer_store_dword v4, off, s[64:67], 0
+; GFX9-NEXT:    s_mov_b64 s[4:5], s[48:49]
 ; GFX9-NEXT:    v_min_f64 v[0:1], v[0:1], v[41:42]
 ; GFX9-NEXT:    s_mov_b64 s[6:7], s[38:39]
 ; GFX9-NEXT:    s_mov_b64 s[10:11], s[34:35]
-; GFX9-NEXT:    s_mov_b32 s12, s43
-; GFX9-NEXT:    s_mov_b32 s13, s42
+; GFX9-NEXT:    s_mov_b32 s12, s51
+; GFX9-NEXT:    s_mov_b32 s13, s50
 ; GFX9-NEXT:    s_mov_b32 s14, s33
 ; GFX9-NEXT:    v_mov_b32_e32 v31, v40
-; GFX9-NEXT:    s_mov_b64 s[2:3], s[50:51]
-; GFX9-NEXT:    buffer_store_dword v1, off, s[48:51], 0 offset:12
-; GFX9-NEXT:    buffer_store_dword v0, off, s[48:51], 0 offset:8
+; GFX9-NEXT:    s_mov_b64 s[2:3], s[66:67]
+; GFX9-NEXT:    buffer_store_dword v1, off, s[64:67], 0 offset:12
+; GFX9-NEXT:    buffer_store_dword v0, off, s[64:67], 0 offset:8
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 8
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0
-; GFX9-NEXT:    v_mov_b32_e32 v2, s44
-; GFX9-NEXT:    v_mov_b32_e32 v3, s45
+; GFX9-NEXT:    v_mov_b32_e32 v2, s52
+; GFX9-NEXT:    v_mov_b32_e32 v3, s53
 ; GFX9-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v5, 8
 ; GFX9-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v7, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[16:17]
-; GFX9-NEXT:    buffer_load_dword v4, off, s[48:51], 0
-; GFX9-NEXT:    buffer_load_dword v5, off, s[48:51], 0 offset:4
+; GFX9-NEXT:    buffer_load_dword v4, off, s[64:67], 0
+; GFX9-NEXT:    buffer_load_dword v5, off, s[64:67], 0 offset:4
 ; GFX9-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX9-NEXT:    s_or_b64 s[46:47], vcc, s[46:47]
-; GFX9-NEXT:    s_andn2_b64 exec, exec, s[46:47]
+; GFX9-NEXT:    s_or_b64 s[54:55], vcc, s[54:55]
+; GFX9-NEXT:    s_andn2_b64 exec, exec, s[54:55]
 ; GFX9-NEXT:    s_cbranch_execnz .LBB7_4
 ; GFX9-NEXT:  .LBB7_5:
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX1064-LABEL: global_atomic_fmin_double_uni_address_div_value_agent_scope_unsafe:
 ; GFX1064:       ; %bb.0:
-; GFX1064-NEXT:    s_mov_b32 s48, SCRATCH_RSRC_DWORD0
-; GFX1064-NEXT:    s_mov_b32 s49, SCRATCH_RSRC_DWORD1
-; GFX1064-NEXT:    s_mov_b32 s50, -1
-; GFX1064-NEXT:    s_mov_b32 s51, 0x31e16000
-; GFX1064-NEXT:    s_add_u32 s48, s48, s11
+; GFX1064-NEXT:    s_mov_b32 s64, SCRATCH_RSRC_DWORD0
+; GFX1064-NEXT:    s_mov_b32 s65, SCRATCH_RSRC_DWORD1
+; GFX1064-NEXT:    s_mov_b32 s66, -1
+; GFX1064-NEXT:    s_mov_b32 s67, 0x31e16000
+; GFX1064-NEXT:    s_add_u32 s64, s64, s11
 ; GFX1064-NEXT:    s_mov_b64 s[34:35], s[4:5]
-; GFX1064-NEXT:    s_addc_u32 s49, s49, 0
-; GFX1064-NEXT:    s_mov_b32 s43, s8
+; GFX1064-NEXT:    s_addc_u32 s65, s65, 0
+; GFX1064-NEXT:    s_mov_b32 s51, s8
 ; GFX1064-NEXT:    s_add_u32 s8, s34, 44
-; GFX1064-NEXT:    s_mov_b32 s42, s9
+; GFX1064-NEXT:    s_mov_b32 s50, s9
 ; GFX1064-NEXT:    s_addc_u32 s9, s35, 0
-; GFX1064-NEXT:    s_mov_b64 s[40:41], s[0:1]
+; GFX1064-NEXT:    s_mov_b64 s[48:49], s[0:1]
 ; GFX1064-NEXT:    s_getpc_b64 s[0:1]
 ; GFX1064-NEXT:    s_add_u32 s0, s0, div.double.value at gotpcrel32@lo+4
 ; GFX1064-NEXT:    s_addc_u32 s1, s1, div.double.value at gotpcrel32@hi+12
@@ -4264,14 +4264,14 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent
 ; GFX1064-NEXT:    s_mov_b64 s[38:39], s[2:3]
 ; GFX1064-NEXT:    s_mov_b64 s[6:7], s[2:3]
 ; GFX1064-NEXT:    v_or3_b32 v40, v0, v1, v2
-; GFX1064-NEXT:    s_mov_b64 s[0:1], s[48:49]
-; GFX1064-NEXT:    s_mov_b64 s[4:5], s[40:41]
+; GFX1064-NEXT:    s_mov_b64 s[0:1], s[64:65]
+; GFX1064-NEXT:    s_mov_b64 s[4:5], s[48:49]
 ; GFX1064-NEXT:    s_mov_b64 s[10:11], s[36:37]
-; GFX1064-NEXT:    s_mov_b32 s12, s43
+; GFX1064-NEXT:    s_mov_b32 s12, s51
 ; GFX1064-NEXT:    v_mov_b32_e32 v31, v40
-; GFX1064-NEXT:    s_mov_b32 s13, s42
+; GFX1064-NEXT:    s_mov_b32 s13, s50
 ; GFX1064-NEXT:    s_mov_b32 s14, s33
-; GFX1064-NEXT:    s_mov_b64 s[2:3], s[50:51]
+; GFX1064-NEXT:    s_mov_b64 s[2:3], s[66:67]
 ; GFX1064-NEXT:    s_movk_i32 s32, 0x800
 ; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1064-NEXT:    s_swappc_b64 s[30:31], s[16:17]
@@ -4298,12 +4298,12 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent
 ; GFX1064-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
 ; GFX1064-NEXT:    s_cbranch_execz .LBB7_5
 ; GFX1064-NEXT:  ; %bb.3:
-; GFX1064-NEXT:    s_load_dwordx2 s[44:45], s[34:35], 0x24
+; GFX1064-NEXT:    s_load_dwordx2 s[52:53], s[34:35], 0x24
 ; GFX1064-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1064-NEXT:    v_max_f64 v[41:42], v[2:3], v[2:3]
-; GFX1064-NEXT:    s_mov_b64 s[46:47], 0
+; GFX1064-NEXT:    s_mov_b64 s[54:55], 0
 ; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT:    global_load_dwordx2 v[4:5], v0, s[44:45]
+; GFX1064-NEXT:    global_load_dwordx2 v[4:5], v0, s[52:53]
 ; GFX1064-NEXT:  .LBB7_4: ; %atomicrmw.start
 ; GFX1064-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1064-NEXT:    s_waitcnt vmcnt(0)
@@ -4313,56 +4313,56 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent
 ; GFX1064-NEXT:    s_getpc_b64 s[0:1]
 ; GFX1064-NEXT:    s_add_u32 s0, s0, __atomic_compare_exchange at gotpcrel32@lo+4
 ; GFX1064-NEXT:    s_addc_u32 s1, s1, __atomic_compare_exchange at gotpcrel32@hi+12
-; GFX1064-NEXT:    buffer_store_dword v5, off, s[48:51], 0 offset:4
-; GFX1064-NEXT:    buffer_store_dword v4, off, s[48:51], 0
+; GFX1064-NEXT:    buffer_store_dword v5, off, s[64:67], 0 offset:4
+; GFX1064-NEXT:    buffer_store_dword v4, off, s[64:67], 0
 ; GFX1064-NEXT:    s_load_dwordx2 s[16:17], s[0:1], 0x0
 ; GFX1064-NEXT:    v_mov_b32_e32 v31, v40
-; GFX1064-NEXT:    v_mov_b32_e32 v2, s44
-; GFX1064-NEXT:    v_mov_b32_e32 v3, s45
+; GFX1064-NEXT:    v_mov_b32_e32 v2, s52
+; GFX1064-NEXT:    v_mov_b32_e32 v3, s53
 ; GFX1064-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX1064-NEXT:    v_mov_b32_e32 v5, 8
 ; GFX1064-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX1064-NEXT:    v_mov_b32_e32 v7, 0
-; GFX1064-NEXT:    s_mov_b64 s[0:1], s[48:49]
-; GFX1064-NEXT:    s_mov_b64 s[4:5], s[40:41]
+; GFX1064-NEXT:    s_mov_b64 s[0:1], s[64:65]
+; GFX1064-NEXT:    s_mov_b64 s[4:5], s[48:49]
 ; GFX1064-NEXT:    s_mov_b64 s[6:7], s[38:39]
 ; GFX1064-NEXT:    s_mov_b64 s[10:11], s[36:37]
-; GFX1064-NEXT:    s_mov_b32 s12, s43
-; GFX1064-NEXT:    s_mov_b32 s13, s42
+; GFX1064-NEXT:    s_mov_b32 s12, s51
+; GFX1064-NEXT:    s_mov_b32 s13, s50
 ; GFX1064-NEXT:    s_mov_b32 s14, s33
-; GFX1064-NEXT:    s_mov_b64 s[2:3], s[50:51]
+; GFX1064-NEXT:    s_mov_b64 s[2:3], s[66:67]
 ; GFX1064-NEXT:    v_min_f64 v[0:1], v[0:1], v[41:42]
-; GFX1064-NEXT:    buffer_store_dword v1, off, s[48:51], 0 offset:12
-; GFX1064-NEXT:    buffer_store_dword v0, off, s[48:51], 0 offset:8
+; GFX1064-NEXT:    buffer_store_dword v1, off, s[64:67], 0 offset:12
+; GFX1064-NEXT:    buffer_store_dword v0, off, s[64:67], 0 offset:8
 ; GFX1064-NEXT:    v_mov_b32_e32 v0, 8
 ; GFX1064-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1064-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GFX1064-NEXT:    s_clause 0x1
-; GFX1064-NEXT:    buffer_load_dword v4, off, s[48:51], 0
-; GFX1064-NEXT:    buffer_load_dword v5, off, s[48:51], 0 offset:4
+; GFX1064-NEXT:    buffer_load_dword v4, off, s[64:67], 0
+; GFX1064-NEXT:    buffer_load_dword v5, off, s[64:67], 0 offset:4
 ; GFX1064-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX1064-NEXT:    s_or_b64 s[46:47], vcc, s[46:47]
-; GFX1064-NEXT:    s_andn2_b64 exec, exec, s[46:47]
+; GFX1064-NEXT:    s_or_b64 s[54:55], vcc, s[54:55]
+; GFX1064-NEXT:    s_andn2_b64 exec, exec, s[54:55]
 ; GFX1064-NEXT:    s_cbranch_execnz .LBB7_4
 ; GFX1064-NEXT:  .LBB7_5:
 ; GFX1064-NEXT:    s_endpgm
 ;
 ; GFX1032-LABEL: global_atomic_fmin_double_uni_address_div_value_agent_scope_unsafe:
 ; GFX1032:       ; %bb.0:
-; GFX1032-NEXT:    s_mov_b32 s48, SCRATCH_RSRC_DWORD0
-; GFX1032-NEXT:    s_mov_b32 s49, SCRATCH_RSRC_DWORD1
-; GFX1032-NEXT:    s_mov_b32 s50, -1
-; GFX1032-NEXT:    s_mov_b32 s51, 0x31c16000
-; GFX1032-NEXT:    s_add_u32 s48, s48, s11
+; GFX1032-NEXT:    s_mov_b32 s64, SCRATCH_RSRC_DWORD0
+; GFX1032-NEXT:    s_mov_b32 s65, SCRATCH_RSRC_DWORD1
+; GFX1032-NEXT:    s_mov_b32 s66, -1
+; GFX1032-NEXT:    s_mov_b32 s67, 0x31c16000
+; GFX1032-NEXT:    s_add_u32 s64, s64, s11
 ; GFX1032-NEXT:    s_mov_b64 s[34:35], s[4:5]
-; GFX1032-NEXT:    s_addc_u32 s49, s49, 0
-; GFX1032-NEXT:    s_mov_b32 s43, s8
+; GFX1032-NEXT:    s_addc_u32 s65, s65, 0
+; GFX1032-NEXT:    s_mov_b32 s51, s8
 ; GFX1032-NEXT:    s_add_u32 s8, s34, 44
-; GFX1032-NEXT:    s_mov_b32 s42, s9
+; GFX1032-NEXT:    s_mov_b32 s50, s9
 ; GFX1032-NEXT:    s_addc_u32 s9, s35, 0
-; GFX1032-NEXT:    s_mov_b64 s[40:41], s[0:1]
+; GFX1032-NEXT:    s_mov_b64 s[48:49], s[0:1]
 ; GFX1032-NEXT:    s_getpc_b64 s[0:1]
 ; GFX1032-NEXT:    s_add_u32 s0, s0, div.double.value at gotpcrel32@lo+4
 ; GFX1032-NEXT:    s_addc_u32 s1, s1, div.double.value at gotpcrel32@hi+12
@@ -4374,14 +4374,14 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent
 ; GFX1032-NEXT:    s_mov_b64 s[38:39], s[2:3]
 ; GFX1032-NEXT:    s_mov_b64 s[6:7], s[2:3]
 ; GFX1032-NEXT:    v_or3_b32 v40, v0, v1, v2
-; GFX1032-NEXT:    s_mov_b64 s[0:1], s[48:49]
-; GFX1032-NEXT:    s_mov_b64 s[4:5], s[40:41]
+; GFX1032-NEXT:    s_mov_b64 s[0:1], s[64:65]
+; GFX1032-NEXT:    s_mov_b64 s[4:5], s[48:49]
 ; GFX1032-NEXT:    s_mov_b64 s[10:11], s[36:37]
-; GFX1032-NEXT:    s_mov_b32 s12, s43
+; GFX1032-NEXT:    s_mov_b32 s12, s51
 ; GFX1032-NEXT:    v_mov_b32_e32 v31, v40
-; GFX1032-NEXT:    s_mov_b32 s13, s42
+; GFX1032-NEXT:    s_mov_b32 s13, s50
 ; GFX1032-NEXT:    s_mov_b32 s14, s33
-; GFX1032-NEXT:    s_mov_b64 s[2:3], s[50:51]
+; GFX1032-NEXT:    s_mov_b64 s[2:3], s[66:67]
 ; GFX1032-NEXT:    s_movk_i32 s32, 0x400
 ; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1032-NEXT:    s_swappc_b64 s[30:31], s[16:17]
@@ -4402,17 +4402,17 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent
 ; GFX1032-NEXT:    s_cbranch_scc1 .LBB7_1
 ; GFX1032-NEXT:  ; %bb.2: ; %ComputeEnd
 ; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1032-NEXT:    s_mov_b32 s46, 0
+; GFX1032-NEXT:    s_mov_b32 s54, 0
 ; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX1032-NEXT:    s_and_saveexec_b32 s0, vcc_lo
 ; GFX1032-NEXT:    s_xor_b32 s0, exec_lo, s0
 ; GFX1032-NEXT:    s_cbranch_execz .LBB7_5
 ; GFX1032-NEXT:  ; %bb.3:
-; GFX1032-NEXT:    s_load_dwordx2 s[44:45], s[34:35], 0x24
+; GFX1032-NEXT:    s_load_dwordx2 s[52:53], s[34:35], 0x24
 ; GFX1032-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1032-NEXT:    v_max_f64 v[41:42], v[2:3], v[2:3]
 ; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT:    global_load_dwordx2 v[4:5], v0, s[44:45]
+; GFX1032-NEXT:    global_load_dwordx2 v[4:5], v0, s[52:53]
 ; GFX1032-NEXT:  .LBB7_4: ; %atomicrmw.start
 ; GFX1032-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1032-NEXT:    s_waitcnt vmcnt(0)
@@ -4422,38 +4422,38 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent
 ; GFX1032-NEXT:    s_getpc_b64 s[0:1]
 ; GFX1032-NEXT:    s_add_u32 s0, s0, __atomic_compare_exchange at gotpcrel32@lo+4
 ; GFX1032-NEXT:    s_addc_u32 s1, s1, __atomic_compare_exchange at gotpcrel32@hi+12
-; GFX1032-NEXT:    buffer_store_dword v5, off, s[48:51], 0 offset:4
-; GFX1032-NEXT:    buffer_store_dword v4, off, s[48:51], 0
+; GFX1032-NEXT:    buffer_store_dword v5, off, s[64:67], 0 offset:4
+; GFX1032-NEXT:    buffer_store_dword v4, off, s[64:67], 0
 ; GFX1032-NEXT:    s_load_dwordx2 s[16:17], s[0:1], 0x0
 ; GFX1032-NEXT:    v_mov_b32_e32 v31, v40
-; GFX1032-NEXT:    v_mov_b32_e32 v2, s44
-; GFX1032-NEXT:    v_mov_b32_e32 v3, s45
+; GFX1032-NEXT:    v_mov_b32_e32 v2, s52
+; GFX1032-NEXT:    v_mov_b32_e32 v3, s53
 ; GFX1032-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX1032-NEXT:    v_mov_b32_e32 v5, 8
 ; GFX1032-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX1032-NEXT:    v_mov_b32_e32 v7, 0
-; GFX1032-NEXT:    s_mov_b64 s[0:1], s[48:49]
-; GFX1032-NEXT:    s_mov_b64 s[4:5], s[40:41]
+; GFX1032-NEXT:    s_mov_b64 s[0:1], s[64:65]
+; GFX1032-NEXT:    s_mov_b64 s[4:5], s[48:49]
 ; GFX1032-NEXT:    s_mov_b64 s[6:7], s[38:39]
 ; GFX1032-NEXT:    s_mov_b64 s[10:11], s[36:37]
-; GFX1032-NEXT:    s_mov_b32 s12, s43
-; GFX1032-NEXT:    s_mov_b32 s13, s42
+; GFX1032-NEXT:    s_mov_b32 s12, s51
+; GFX1032-NEXT:    s_mov_b32 s13, s50
 ; GFX1032-NEXT:    s_mov_b32 s14, s33
-; GFX1032-NEXT:    s_mov_b64 s[2:3], s[50:51]
+; GFX1032-NEXT:    s_mov_b64 s[2:3], s[66:67]
 ; GFX1032-NEXT:    v_min_f64 v[0:1], v[0:1], v[41:42]
-; GFX1032-NEXT:    buffer_store_dword v1, off, s[48:51], 0 offset:12
-; GFX1032-NEXT:    buffer_store_dword v0, off, s[48:51], 0 offset:8
+; GFX1032-NEXT:    buffer_store_dword v1, off, s[64:67], 0 offset:12
+; GFX1032-NEXT:    buffer_store_dword v0, off, s[64:67], 0 offset:8
 ; GFX1032-NEXT:    v_mov_b32_e32 v0, 8
 ; GFX1032-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1032-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GFX1032-NEXT:    s_clause 0x1
-; GFX1032-NEXT:    buffer_load_dword v4, off, s[48:51], 0
-; GFX1032-NEXT:    buffer_load_dword v5, off, s[48:51], 0 offset:4
+; GFX1032-NEXT:    buffer_load_dword v4, off, s[64:67], 0
+; GFX1032-NEXT:    buffer_load_dword v5, off, s[64:67], 0 offset:4
 ; GFX1032-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX1032-NEXT:    s_or_b32 s46, vcc_lo, s46
-; GFX1032-NEXT:    s_andn2_b32 exec_lo, exec_lo, s46
+; GFX1032-NEXT:    s_or_b32 s54, vcc_lo, s54
+; GFX1032-NEXT:    s_andn2_b32 exec_lo, exec_lo, s54
 ; GFX1032-NEXT:    s_cbranch_execnz .LBB7_4
 ; GFX1032-NEXT:  .LBB7_5:
 ; GFX1032-NEXT:    s_endpgm
@@ -4461,11 +4461,11 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent
 ; GFX1164-LABEL: global_atomic_fmin_double_uni_address_div_value_agent_scope_unsafe:
 ; GFX1164:       ; %bb.0:
 ; GFX1164-NEXT:    s_mov_b64 s[34:35], s[4:5]
-; GFX1164-NEXT:    s_mov_b32 s43, s8
+; GFX1164-NEXT:    s_mov_b32 s51, s8
 ; GFX1164-NEXT:    s_add_u32 s8, s34, 44
-; GFX1164-NEXT:    s_mov_b32 s42, s9
+; GFX1164-NEXT:    s_mov_b32 s50, s9
 ; GFX1164-NEXT:    s_addc_u32 s9, s35, 0
-; GFX1164-NEXT:    s_mov_b64 s[40:41], s[0:1]
+; GFX1164-NEXT:    s_mov_b64 s[48:49], s[0:1]
 ; GFX1164-NEXT:    s_getpc_b64 s[0:1]
 ; GFX1164-NEXT:    s_add_u32 s0, s0, div.double.value at gotpcrel32@lo+4
 ; GFX1164-NEXT:    s_addc_u32 s1, s1, div.double.value at gotpcrel32@hi+12
@@ -4473,11 +4473,11 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent
 ; GFX1164-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
 ; GFX1164-NEXT:    s_mov_b32 s33, s10
 ; GFX1164-NEXT:    s_mov_b64 s[36:37], s[6:7]
-; GFX1164-NEXT:    s_mov_b64 s[4:5], s[40:41]
+; GFX1164-NEXT:    s_mov_b64 s[4:5], s[48:49]
 ; GFX1164-NEXT:    s_mov_b64 s[6:7], s[2:3]
 ; GFX1164-NEXT:    s_mov_b64 s[10:11], s[36:37]
-; GFX1164-NEXT:    s_mov_b32 s12, s43
-; GFX1164-NEXT:    s_mov_b32 s13, s42
+; GFX1164-NEXT:    s_mov_b32 s12, s51
+; GFX1164-NEXT:    s_mov_b32 s13, s50
 ; GFX1164-NEXT:    s_mov_b32 s14, s33
 ; GFX1164-NEXT:    s_mov_b32 s32, 32
 ; GFX1164-NEXT:    v_mov_b32_e32 v40, v0
@@ -4511,12 +4511,12 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent
 ; GFX1164-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
 ; GFX1164-NEXT:    s_cbranch_execz .LBB7_5
 ; GFX1164-NEXT:  ; %bb.3:
-; GFX1164-NEXT:    s_load_b64 s[44:45], s[34:35], 0x24
+; GFX1164-NEXT:    s_load_b64 s[52:53], s[34:35], 0x24
 ; GFX1164-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1164-NEXT:    v_max_f64 v[41:42], v[2:3], v[2:3]
-; GFX1164-NEXT:    s_mov_b64 s[46:47], 0
+; GFX1164-NEXT:    s_mov_b64 s[54:55], 0
 ; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT:    global_load_b64 v[4:5], v0, s[44:45]
+; GFX1164-NEXT:    global_load_b64 v[4:5], v0, s[52:53]
 ; GFX1164-NEXT:    s_set_inst_prefetch_distance 0x1
 ; GFX1164-NEXT:    .p2align 6
 ; GFX1164-NEXT:  .LBB7_4: ; %atomicrmw.start
@@ -4530,15 +4530,15 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent
 ; GFX1164-NEXT:    s_addc_u32 s1, s1, __atomic_compare_exchange at gotpcrel32@hi+12
 ; GFX1164-NEXT:    v_mov_b32_e32 v31, v40
 ; GFX1164-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
-; GFX1164-NEXT:    v_mov_b32_e32 v2, s44
-; GFX1164-NEXT:    v_mov_b32_e32 v3, s45
+; GFX1164-NEXT:    v_mov_b32_e32 v2, s52
+; GFX1164-NEXT:    v_mov_b32_e32 v3, s53
 ; GFX1164-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX1164-NEXT:    v_mov_b32_e32 v7, 0
-; GFX1164-NEXT:    s_mov_b64 s[4:5], s[40:41]
+; GFX1164-NEXT:    s_mov_b64 s[4:5], s[48:49]
 ; GFX1164-NEXT:    s_mov_b64 s[6:7], s[38:39]
 ; GFX1164-NEXT:    s_mov_b64 s[10:11], s[36:37]
-; GFX1164-NEXT:    s_mov_b32 s12, s43
-; GFX1164-NEXT:    s_mov_b32 s13, s42
+; GFX1164-NEXT:    s_mov_b32 s12, s51
+; GFX1164-NEXT:    s_mov_b32 s13, s50
 ; GFX1164-NEXT:    s_mov_b32 s14, s33
 ; GFX1164-NEXT:    v_min_f64 v[0:1], v[0:1], v[41:42]
 ; GFX1164-NEXT:    scratch_store_b64 off, v[4:5], off
@@ -4553,8 +4553,8 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent
 ; GFX1164-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
 ; GFX1164-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX1164-NEXT:    s_or_b64 s[46:47], vcc, s[46:47]
-; GFX1164-NEXT:    s_and_not1_b64 exec, exec, s[46:47]
+; GFX1164-NEXT:    s_or_b64 s[54:55], vcc, s[54:55]
+; GFX1164-NEXT:    s_and_not1_b64 exec, exec, s[54:55]
 ; GFX1164-NEXT:    s_cbranch_execnz .LBB7_4
 ; GFX1164-NEXT:  .LBB7_5:
 ; GFX1164-NEXT:    s_set_inst_prefetch_distance 0x2
@@ -4563,7 +4563,7 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent
 ; GFX1132-LABEL: global_atomic_fmin_double_uni_address_div_value_agent_scope_unsafe:
 ; GFX1132:       ; %bb.0:
 ; GFX1132-NEXT:    s_mov_b64 s[34:35], s[4:5]
-; GFX1132-NEXT:    s_mov_b64 s[40:41], s[0:1]
+; GFX1132-NEXT:    s_mov_b64 s[48:49], s[0:1]
 ; GFX1132-NEXT:    s_add_u32 s8, s34, 44
 ; GFX1132-NEXT:    s_addc_u32 s9, s35, 0
 ; GFX1132-NEXT:    s_getpc_b64 s[0:1]
@@ -4572,9 +4572,9 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent
 ; GFX1132-NEXT:    v_mov_b32_e32 v31, v0
 ; GFX1132-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
 ; GFX1132-NEXT:    s_mov_b64 s[36:37], s[6:7]
-; GFX1132-NEXT:    s_mov_b32 s42, s14
-; GFX1132-NEXT:    s_mov_b32 s43, s13
-; GFX1132-NEXT:    s_mov_b64 s[4:5], s[40:41]
+; GFX1132-NEXT:    s_mov_b32 s50, s14
+; GFX1132-NEXT:    s_mov_b32 s51, s13
+; GFX1132-NEXT:    s_mov_b64 s[4:5], s[48:49]
 ; GFX1132-NEXT:    s_mov_b64 s[6:7], s[2:3]
 ; GFX1132-NEXT:    s_mov_b64 s[10:11], s[36:37]
 ; GFX1132-NEXT:    s_mov_b32 s12, s13
@@ -4606,19 +4606,19 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent
 ; GFX1132-NEXT:    s_cbranch_scc1 .LBB7_1
 ; GFX1132-NEXT:  ; %bb.2: ; %ComputeEnd
 ; GFX1132-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1132-NEXT:    s_mov_b32 s46, 0
+; GFX1132-NEXT:    s_mov_b32 s54, 0
 ; GFX1132-NEXT:    s_mov_b32 s0, exec_lo
 ; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1132-NEXT:    v_cmpx_eq_u32_e32 0, v0
 ; GFX1132-NEXT:    s_xor_b32 s0, exec_lo, s0
 ; GFX1132-NEXT:    s_cbranch_execz .LBB7_5
 ; GFX1132-NEXT:  ; %bb.3:
-; GFX1132-NEXT:    s_load_b64 s[44:45], s[34:35], 0x24
+; GFX1132-NEXT:    s_load_b64 s[52:53], s[34:35], 0x24
 ; GFX1132-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_4)
 ; GFX1132-NEXT:    v_max_f64 v[41:42], v[2:3], v[2:3]
 ; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT:    global_load_b64 v[4:5], v0, s[44:45]
+; GFX1132-NEXT:    global_load_b64 v[4:5], v0, s[52:53]
 ; GFX1132-NEXT:    s_set_inst_prefetch_distance 0x1
 ; GFX1132-NEXT:    .p2align 6
 ; GFX1132-NEXT:  .LBB7_4: ; %atomicrmw.start
@@ -4632,16 +4632,16 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent
 ; GFX1132-NEXT:    s_addc_u32 s1, s1, __atomic_compare_exchange at gotpcrel32@hi+12
 ; GFX1132-NEXT:    v_mov_b32_e32 v31, v40
 ; GFX1132-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
-; GFX1132-NEXT:    v_mov_b32_e32 v3, s45
+; GFX1132-NEXT:    v_mov_b32_e32 v3, s53
 ; GFX1132-NEXT:    v_mov_b32_e32 v7, 0
-; GFX1132-NEXT:    s_mov_b64 s[4:5], s[40:41]
+; GFX1132-NEXT:    s_mov_b64 s[4:5], s[48:49]
 ; GFX1132-NEXT:    s_mov_b64 s[6:7], s[38:39]
 ; GFX1132-NEXT:    s_mov_b64 s[10:11], s[36:37]
-; GFX1132-NEXT:    s_mov_b32 s12, s43
-; GFX1132-NEXT:    s_mov_b32 s13, s42
+; GFX1132-NEXT:    s_mov_b32 s12, s51
+; GFX1132-NEXT:    s_mov_b32 s13, s50
 ; GFX1132-NEXT:    s_mov_b32 s14, s33
 ; GFX1132-NEXT:    v_mov_b32_e32 v6, 0
-; GFX1132-NEXT:    v_mov_b32_e32 v2, s44
+; GFX1132-NEXT:    v_mov_b32_e32 v2, s52
 ; GFX1132-NEXT:    v_min_f64 v[0:1], v[0:1], v[41:42]
 ; GFX1132-NEXT:    scratch_store_b64 off, v[4:5], off
 ; GFX1132-NEXT:    v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v5, 8
@@ -4653,8 +4653,8 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent
 ; GFX1132-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
 ; GFX1132-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX1132-NEXT:    s_or_b32 s46, vcc_lo, s46
-; GFX1132-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s46
+; GFX1132-NEXT:    s_or_b32 s54, vcc_lo, s54
+; GFX1132-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s54
 ; GFX1132-NEXT:    s_cbranch_execnz .LBB7_4
 ; GFX1132-NEXT:  .LBB7_5:
 ; GFX1132-NEXT:    s_set_inst_prefetch_distance 0x2
@@ -4663,22 +4663,22 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent
 ; GFX7LESS-DPP-LABEL: global_atomic_fmin_double_uni_address_div_value_agent_scope_unsafe:
 ; GFX7LESS-DPP:       ; %bb.0:
 ; GFX7LESS-DPP-NEXT:    s_movk_i32 s32, 0x800
-; GFX7LESS-DPP-NEXT:    s_mov_b32 s48, SCRATCH_RSRC_DWORD0
-; GFX7LESS-DPP-NEXT:    s_mov_b32 s49, SCRATCH_RSRC_DWORD1
-; GFX7LESS-DPP-NEXT:    s_mov_b32 s50, -1
-; GFX7LESS-DPP-NEXT:    s_mov_b32 s51, 0xe8f000
-; GFX7LESS-DPP-NEXT:    s_add_u32 s48, s48, s11
-; GFX7LESS-DPP-NEXT:    s_addc_u32 s49, s49, 0
+; GFX7LESS-DPP-NEXT:    s_mov_b32 s64, SCRATCH_RSRC_DWORD0
+; GFX7LESS-DPP-NEXT:    s_mov_b32 s65, SCRATCH_RSRC_DWORD1
+; GFX7LESS-DPP-NEXT:    s_mov_b32 s66, -1
+; GFX7LESS-DPP-NEXT:    s_mov_b32 s67, 0xe8f000
+; GFX7LESS-DPP-NEXT:    s_add_u32 s64, s64, s11
+; GFX7LESS-DPP-NEXT:    s_addc_u32 s65, s65, 0
 ; GFX7LESS-DPP-NEXT:    s_mov_b32 s33, s10
-; GFX7LESS-DPP-NEXT:    s_mov_b32 s42, s9
-; GFX7LESS-DPP-NEXT:    s_mov_b32 s43, s8
+; GFX7LESS-DPP-NEXT:    s_mov_b32 s50, s9
+; GFX7LESS-DPP-NEXT:    s_mov_b32 s51, s8
 ; GFX7LESS-DPP-NEXT:    s_mov_b64 s[34:35], s[6:7]
 ; GFX7LESS-DPP-NEXT:    s_mov_b64 s[36:37], s[4:5]
 ; GFX7LESS-DPP-NEXT:    s_mov_b64 s[38:39], s[2:3]
-; GFX7LESS-DPP-NEXT:    s_mov_b64 s[40:41], s[0:1]
-; GFX7LESS-DPP-NEXT:    s_load_dwordx2 s[44:45], s[4:5], 0x9
-; GFX7LESS-DPP-NEXT:    s_mov_b32 s47, 0xf000
-; GFX7LESS-DPP-NEXT:    s_mov_b32 s46, -1
+; GFX7LESS-DPP-NEXT:    s_mov_b64 s[48:49], s[0:1]
+; GFX7LESS-DPP-NEXT:    s_load_dwordx2 s[52:53], s[4:5], 0x9
+; GFX7LESS-DPP-NEXT:    s_mov_b32 s55, 0xf000
+; GFX7LESS-DPP-NEXT:    s_mov_b32 s54, -1
 ; GFX7LESS-DPP-NEXT:    s_add_u32 s8, s36, 44
 ; GFX7LESS-DPP-NEXT:    s_addc_u32 s9, s37, 0
 ; GFX7LESS-DPP-NEXT:    s_getpc_b64 s[0:1]
@@ -4689,34 +4689,34 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent
 ; GFX7LESS-DPP-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
 ; GFX7LESS-DPP-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GFX7LESS-DPP-NEXT:    v_or_b32_e32 v40, v0, v2
-; GFX7LESS-DPP-NEXT:    s_mov_b64 s[4:5], s[40:41]
+; GFX7LESS-DPP-NEXT:    s_mov_b64 s[4:5], s[48:49]
 ; GFX7LESS-DPP-NEXT:    s_mov_b64 s[6:7], s[2:3]
 ; GFX7LESS-DPP-NEXT:    s_mov_b64 s[10:11], s[34:35]
-; GFX7LESS-DPP-NEXT:    s_mov_b32 s12, s43
-; GFX7LESS-DPP-NEXT:    s_mov_b32 s13, s42
+; GFX7LESS-DPP-NEXT:    s_mov_b32 s12, s51
+; GFX7LESS-DPP-NEXT:    s_mov_b32 s13, s50
 ; GFX7LESS-DPP-NEXT:    s_mov_b32 s14, s33
 ; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v31, v40
-; GFX7LESS-DPP-NEXT:    s_mov_b64 s[0:1], s[48:49]
-; GFX7LESS-DPP-NEXT:    s_mov_b64 s[2:3], s[50:51]
+; GFX7LESS-DPP-NEXT:    s_mov_b64 s[0:1], s[64:65]
+; GFX7LESS-DPP-NEXT:    s_mov_b64 s[2:3], s[66:67]
 ; GFX7LESS-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7LESS-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
-; GFX7LESS-DPP-NEXT:    buffer_load_dwordx2 v[2:3], off, s[44:47], 0
-; GFX7LESS-DPP-NEXT:    s_mov_b64 s[46:47], 0
+; GFX7LESS-DPP-NEXT:    buffer_load_dwordx2 v[2:3], off, s[52:55], 0
+; GFX7LESS-DPP-NEXT:    s_mov_b64 s[54:55], 0
 ; GFX7LESS-DPP-NEXT:    v_max_f64 v[41:42], v[0:1], v[0:1]
 ; GFX7LESS-DPP-NEXT:  .LBB7_1: ; %atomicrmw.start
 ; GFX7LESS-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX7LESS-DPP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7LESS-DPP-NEXT:    v_max_f64 v[0:1], v[2:3], v[2:3]
-; GFX7LESS-DPP-NEXT:    buffer_store_dword v3, off, s[48:51], 0 offset:4
-; GFX7LESS-DPP-NEXT:    buffer_store_dword v2, off, s[48:51], 0
+; GFX7LESS-DPP-NEXT:    buffer_store_dword v3, off, s[64:67], 0 offset:4
+; GFX7LESS-DPP-NEXT:    buffer_store_dword v2, off, s[64:67], 0
 ; GFX7LESS-DPP-NEXT:    s_add_u32 s8, s36, 44
 ; GFX7LESS-DPP-NEXT:    v_min_f64 v[0:1], v[0:1], v[41:42]
 ; GFX7LESS-DPP-NEXT:    s_addc_u32 s9, s37, 0
 ; GFX7LESS-DPP-NEXT:    s_getpc_b64 s[0:1]
 ; GFX7LESS-DPP-NEXT:    s_add_u32 s0, s0, __atomic_compare_exchange at gotpcrel32@lo+4
 ; GFX7LESS-DPP-NEXT:    s_addc_u32 s1, s1, __atomic_compare_exchange at gotpcrel32@hi+12
-; GFX7LESS-DPP-NEXT:    buffer_store_dword v1, off, s[48:51], 0 offset:12
-; GFX7LESS-DPP-NEXT:    buffer_store_dword v0, off, s[48:51], 0 offset:8
+; GFX7LESS-DPP-NEXT:    buffer_store_dword v1, off, s[64:67], 0 offset:12
+; GFX7LESS-DPP-NEXT:    buffer_store_dword v0, off, s[64:67], 0 offset:8
 ; GFX7LESS-DPP-NEXT:    s_load_dwordx2 s[16:17], s[0:1], 0x0
 ; GFX7LESS-DPP-NEXT:    s_waitcnt expcnt(0)
 ; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v0, 8
@@ -4725,43 +4725,43 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent
 ; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v5, 8
 ; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v7, 0
-; GFX7LESS-DPP-NEXT:    s_mov_b64 s[4:5], s[40:41]
+; GFX7LESS-DPP-NEXT:    s_mov_b64 s[4:5], s[48:49]
 ; GFX7LESS-DPP-NEXT:    s_mov_b64 s[6:7], s[38:39]
 ; GFX7LESS-DPP-NEXT:    s_mov_b64 s[10:11], s[34:35]
-; GFX7LESS-DPP-NEXT:    s_mov_b32 s12, s43
-; GFX7LESS-DPP-NEXT:    s_mov_b32 s13, s42
+; GFX7LESS-DPP-NEXT:    s_mov_b32 s12, s51
+; GFX7LESS-DPP-NEXT:    s_mov_b32 s13, s50
 ; GFX7LESS-DPP-NEXT:    s_mov_b32 s14, s33
 ; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v31, v40
-; GFX7LESS-DPP-NEXT:    s_mov_b64 s[0:1], s[48:49]
-; GFX7LESS-DPP-NEXT:    s_mov_b64 s[2:3], s[50:51]
-; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v2, s44
-; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v3, s45
+; GFX7LESS-DPP-NEXT:    s_mov_b64 s[0:1], s[64:65]
+; GFX7LESS-DPP-NEXT:    s_mov_b64 s[2:3], s[66:67]
+; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v2, s52
+; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v3, s53
 ; GFX7LESS-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7LESS-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GFX7LESS-DPP-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX7LESS-DPP-NEXT:    buffer_load_dword v2, off, s[48:51], 0
-; GFX7LESS-DPP-NEXT:    buffer_load_dword v3, off, s[48:51], 0 offset:4
+; GFX7LESS-DPP-NEXT:    buffer_load_dword v2, off, s[64:67], 0
+; GFX7LESS-DPP-NEXT:    buffer_load_dword v3, off, s[64:67], 0 offset:4
 ; GFX7LESS-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX7LESS-DPP-NEXT:    s_or_b64 s[46:47], vcc, s[46:47]
-; GFX7LESS-DPP-NEXT:    s_andn2_b64 exec, exec, s[46:47]
+; GFX7LESS-DPP-NEXT:    s_or_b64 s[54:55], vcc, s[54:55]
+; GFX7LESS-DPP-NEXT:    s_andn2_b64 exec, exec, s[54:55]
 ; GFX7LESS-DPP-NEXT:    s_cbranch_execnz .LBB7_1
 ; GFX7LESS-DPP-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX7LESS-DPP-NEXT:    s_endpgm
 ;
 ; GFX9-DPP-LABEL: global_atomic_fmin_double_uni_address_div_value_agent_scope_unsafe:
 ; GFX9-DPP:       ; %bb.0:
-; GFX9-DPP-NEXT:    s_mov_b32 s52, SCRATCH_RSRC_DWORD0
-; GFX9-DPP-NEXT:    s_mov_b32 s53, SCRATCH_RSRC_DWORD1
-; GFX9-DPP-NEXT:    s_mov_b32 s54, -1
-; GFX9-DPP-NEXT:    s_mov_b32 s55, 0xe00000
-; GFX9-DPP-NEXT:    s_add_u32 s52, s52, s11
-; GFX9-DPP-NEXT:    s_addc_u32 s53, s53, 0
+; GFX9-DPP-NEXT:    s_mov_b32 s68, SCRATCH_RSRC_DWORD0
+; GFX9-DPP-NEXT:    s_mov_b32 s69, SCRATCH_RSRC_DWORD1
+; GFX9-DPP-NEXT:    s_mov_b32 s70, -1
+; GFX9-DPP-NEXT:    s_mov_b32 s71, 0xe00000
+; GFX9-DPP-NEXT:    s_add_u32 s68, s68, s11
+; GFX9-DPP-NEXT:    s_addc_u32 s69, s69, 0
 ; GFX9-DPP-NEXT:    s_mov_b64 s[36:37], s[4:5]
-; GFX9-DPP-NEXT:    s_mov_b32 s43, s8
+; GFX9-DPP-NEXT:    s_mov_b32 s51, s8
 ; GFX9-DPP-NEXT:    s_add_u32 s8, s36, 44
-; GFX9-DPP-NEXT:    s_mov_b32 s42, s9
+; GFX9-DPP-NEXT:    s_mov_b32 s50, s9
 ; GFX9-DPP-NEXT:    s_addc_u32 s9, s37, 0
-; GFX9-DPP-NEXT:    s_mov_b64 s[40:41], s[0:1]
+; GFX9-DPP-NEXT:    s_mov_b64 s[48:49], s[0:1]
 ; GFX9-DPP-NEXT:    s_getpc_b64 s[0:1]
 ; GFX9-DPP-NEXT:    s_add_u32 s0, s0, div.double.value at gotpcrel32@lo+4
 ; GFX9-DPP-NEXT:    s_addc_u32 s1, s1, div.double.value at gotpcrel32@hi+12
@@ -4773,14 +4773,14 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent
 ; GFX9-DPP-NEXT:    s_mov_b64 s[38:39], s[2:3]
 ; GFX9-DPP-NEXT:    v_or3_b32 v40, v0, v1, v2
 ; GFX9-DPP-NEXT:    s_mov_b64 s[6:7], s[2:3]
-; GFX9-DPP-NEXT:    s_mov_b64 s[0:1], s[52:53]
-; GFX9-DPP-NEXT:    s_mov_b64 s[4:5], s[40:41]
+; GFX9-DPP-NEXT:    s_mov_b64 s[0:1], s[68:69]
+; GFX9-DPP-NEXT:    s_mov_b64 s[4:5], s[48:49]
 ; GFX9-DPP-NEXT:    s_mov_b64 s[10:11], s[34:35]
-; GFX9-DPP-NEXT:    s_mov_b32 s12, s43
-; GFX9-DPP-NEXT:    s_mov_b32 s13, s42
+; GFX9-DPP-NEXT:    s_mov_b32 s12, s51
+; GFX9-DPP-NEXT:    s_mov_b32 s13, s50
 ; GFX9-DPP-NEXT:    s_mov_b32 s14, s33
 ; GFX9-DPP-NEXT:    v_mov_b32_e32 v31, v40
-; GFX9-DPP-NEXT:    s_mov_b64 s[2:3], s[54:55]
+; GFX9-DPP-NEXT:    s_mov_b64 s[2:3], s[70:71]
 ; GFX9-DPP-NEXT:    s_movk_i32 s32, 0x800
 ; GFX9-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
@@ -4834,20 +4834,20 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent
 ; GFX9-DPP-NEXT:    v_mbcnt_lo_u32_b32 v1, exec_lo, 0
 ; GFX9-DPP-NEXT:    v_mbcnt_hi_u32_b32 v1, exec_hi, v1
 ; GFX9-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
-; GFX9-DPP-NEXT:    v_readlane_b32 s45, v9, 63
-; GFX9-DPP-NEXT:    v_readlane_b32 s44, v8, 63
+; GFX9-DPP-NEXT:    v_readlane_b32 s53, v9, 63
+; GFX9-DPP-NEXT:    v_readlane_b32 s52, v8, 63
 ; GFX9-DPP-NEXT:    s_mov_b64 exec, s[0:1]
 ; GFX9-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
 ; GFX9-DPP-NEXT:    s_and_saveexec_b64 s[0:1], vcc
 ; GFX9-DPP-NEXT:    s_cbranch_execz .LBB7_3
 ; GFX9-DPP-NEXT:  ; %bb.1:
-; GFX9-DPP-NEXT:    s_load_dwordx2 s[46:47], s[36:37], 0x24
-; GFX9-DPP-NEXT:    s_mov_b64 s[48:49], 0
+; GFX9-DPP-NEXT:    s_load_dwordx2 s[54:55], s[36:37], 0x24
+; GFX9-DPP-NEXT:    s_mov_b64 s[64:65], 0
 ; GFX9-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-DPP-NEXT:    global_load_dwordx2 v[1:2], v0, s[46:47]
+; GFX9-DPP-NEXT:    global_load_dwordx2 v[1:2], v0, s[54:55]
 ; GFX9-DPP-NEXT:  .LBB7_2: ; %atomicrmw.start
 ; GFX9-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX9-DPP-NEXT:    v_max_f64 v[3:4], s[44:45], s[44:45]
+; GFX9-DPP-NEXT:    v_max_f64 v[3:4], s[52:53], s[52:53]
 ; GFX9-DPP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-DPP-NEXT:    v_max_f64 v[5:6], v[1:2], v[1:2]
 ; GFX9-DPP-NEXT:    s_add_u32 s8, s36, 44
@@ -4856,54 +4856,54 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent
 ; GFX9-DPP-NEXT:    s_add_u32 s0, s0, __atomic_compare_exchange at gotpcrel32@lo+4
 ; GFX9-DPP-NEXT:    s_addc_u32 s1, s1, __atomic_compare_exchange at gotpcrel32@hi+12
 ; GFX9-DPP-NEXT:    s_load_dwordx2 s[16:17], s[0:1], 0x0
-; GFX9-DPP-NEXT:    s_mov_b64 s[0:1], s[52:53]
-; GFX9-DPP-NEXT:    buffer_store_dword v2, off, s[52:55], 0 offset:4
-; GFX9-DPP-NEXT:    buffer_store_dword v1, off, s[52:55], 0
-; GFX9-DPP-NEXT:    s_mov_b64 s[4:5], s[40:41]
+; GFX9-DPP-NEXT:    s_mov_b64 s[0:1], s[68:69]
+; GFX9-DPP-NEXT:    buffer_store_dword v2, off, s[68:71], 0 offset:4
+; GFX9-DPP-NEXT:    buffer_store_dword v1, off, s[68:71], 0
+; GFX9-DPP-NEXT:    s_mov_b64 s[4:5], s[48:49]
 ; GFX9-DPP-NEXT:    v_min_f64 v[3:4], v[5:6], v[3:4]
 ; GFX9-DPP-NEXT:    s_mov_b64 s[6:7], s[38:39]
 ; GFX9-DPP-NEXT:    s_mov_b64 s[10:11], s[34:35]
-; GFX9-DPP-NEXT:    s_mov_b32 s12, s43
-; GFX9-DPP-NEXT:    s_mov_b32 s13, s42
+; GFX9-DPP-NEXT:    s_mov_b32 s12, s51
+; GFX9-DPP-NEXT:    s_mov_b32 s13, s50
 ; GFX9-DPP-NEXT:    s_mov_b32 s14, s33
 ; GFX9-DPP-NEXT:    v_mov_b32_e32 v31, v40
-; GFX9-DPP-NEXT:    buffer_store_dword v4, off, s[52:55], 0 offset:12
-; GFX9-DPP-NEXT:    buffer_store_dword v3, off, s[52:55], 0 offset:8
-; GFX9-DPP-NEXT:    s_mov_b64 s[2:3], s[54:55]
+; GFX9-DPP-NEXT:    buffer_store_dword v4, off, s[68:71], 0 offset:12
+; GFX9-DPP-NEXT:    buffer_store_dword v3, off, s[68:71], 0 offset:8
+; GFX9-DPP-NEXT:    s_mov_b64 s[2:3], s[70:71]
 ; GFX9-DPP-NEXT:    v_mov_b32_e32 v0, 8
 ; GFX9-DPP-NEXT:    v_mov_b32_e32 v1, 0
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v2, s46
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v3, s47
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v2, s54
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v3, s55
 ; GFX9-DPP-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX9-DPP-NEXT:    v_mov_b32_e32 v5, 8
 ; GFX9-DPP-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX9-DPP-NEXT:    v_mov_b32_e32 v7, 0
 ; GFX9-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
-; GFX9-DPP-NEXT:    buffer_load_dword v1, off, s[52:55], 0
-; GFX9-DPP-NEXT:    buffer_load_dword v2, off, s[52:55], 0 offset:4
+; GFX9-DPP-NEXT:    buffer_load_dword v1, off, s[68:71], 0
+; GFX9-DPP-NEXT:    buffer_load_dword v2, off, s[68:71], 0 offset:4
 ; GFX9-DPP-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX9-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX9-DPP-NEXT:    s_or_b64 s[48:49], vcc, s[48:49]
-; GFX9-DPP-NEXT:    s_andn2_b64 exec, exec, s[48:49]
+; GFX9-DPP-NEXT:    s_or_b64 s[64:65], vcc, s[64:65]
+; GFX9-DPP-NEXT:    s_andn2_b64 exec, exec, s[64:65]
 ; GFX9-DPP-NEXT:    s_cbranch_execnz .LBB7_2
 ; GFX9-DPP-NEXT:  .LBB7_3:
 ; GFX9-DPP-NEXT:    s_endpgm
 ;
 ; GFX1064-DPP-LABEL: global_atomic_fmin_double_uni_address_div_value_agent_scope_unsafe:
 ; GFX1064-DPP:       ; %bb.0:
-; GFX1064-DPP-NEXT:    s_mov_b32 s48, SCRATCH_RSRC_DWORD0
-; GFX1064-DPP-NEXT:    s_mov_b32 s49, SCRATCH_RSRC_DWORD1
-; GFX1064-DPP-NEXT:    s_mov_b32 s50, -1
-; GFX1064-DPP-NEXT:    s_mov_b32 s51, 0x31e16000
-; GFX1064-DPP-NEXT:    s_add_u32 s48, s48, s11
+; GFX1064-DPP-NEXT:    s_mov_b32 s64, SCRATCH_RSRC_DWORD0
+; GFX1064-DPP-NEXT:    s_mov_b32 s65, SCRATCH_RSRC_DWORD1
+; GFX1064-DPP-NEXT:    s_mov_b32 s66, -1
+; GFX1064-DPP-NEXT:    s_mov_b32 s67, 0x31e16000
+; GFX1064-DPP-NEXT:    s_add_u32 s64, s64, s11
 ; GFX1064-DPP-NEXT:    s_mov_b64 s[34:35], s[4:5]
-; GFX1064-DPP-NEXT:    s_addc_u32 s49, s49, 0
-; GFX1064-DPP-NEXT:    s_mov_b32 s43, s8
+; GFX1064-DPP-NEXT:    s_addc_u32 s65, s65, 0
+; GFX1064-DPP-NEXT:    s_mov_b32 s51, s8
 ; GFX1064-DPP-NEXT:    s_add_u32 s8, s34, 44
-; GFX1064-DPP-NEXT:    s_mov_b32 s42, s9
+; GFX1064-DPP-NEXT:    s_mov_b32 s50, s9
 ; GFX1064-DPP-NEXT:    s_addc_u32 s9, s35, 0
-; GFX1064-DPP-NEXT:    s_mov_b64 s[40:41], s[0:1]
+; GFX1064-DPP-NEXT:    s_mov_b64 s[48:49], s[0:1]
 ; GFX1064-DPP-NEXT:    s_getpc_b64 s[0:1]
 ; GFX1064-DPP-NEXT:    s_add_u32 s0, s0, div.double.value at gotpcrel32@lo+4
 ; GFX1064-DPP-NEXT:    s_addc_u32 s1, s1, div.double.value at gotpcrel32@hi+12
@@ -4915,14 +4915,14 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent
 ; GFX1064-DPP-NEXT:    s_mov_b64 s[38:39], s[2:3]
 ; GFX1064-DPP-NEXT:    s_mov_b64 s[6:7], s[2:3]
 ; GFX1064-DPP-NEXT:    v_or3_b32 v40, v0, v1, v2
-; GFX1064-DPP-NEXT:    s_mov_b64 s[0:1], s[48:49]
-; GFX1064-DPP-NEXT:    s_mov_b64 s[4:5], s[40:41]
+; GFX1064-DPP-NEXT:    s_mov_b64 s[0:1], s[64:65]
+; GFX1064-DPP-NEXT:    s_mov_b64 s[4:5], s[48:49]
 ; GFX1064-DPP-NEXT:    s_mov_b64 s[10:11], s[36:37]
-; GFX1064-DPP-NEXT:    s_mov_b32 s12, s43
+; GFX1064-DPP-NEXT:    s_mov_b32 s12, s51
 ; GFX1064-DPP-NEXT:    v_mov_b32_e32 v31, v40
-; GFX1064-DPP-NEXT:    s_mov_b32 s13, s42
+; GFX1064-DPP-NEXT:    s_mov_b32 s13, s50
 ; GFX1064-DPP-NEXT:    s_mov_b32 s14, s33
-; GFX1064-DPP-NEXT:    s_mov_b64 s[2:3], s[50:51]
+; GFX1064-DPP-NEXT:    s_mov_b64 s[2:3], s[66:67]
 ; GFX1064-DPP-NEXT:    s_movk_i32 s32, 0x800
 ; GFX1064-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1064-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
@@ -4975,10 +4975,10 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent
 ; GFX1064-DPP-NEXT:    s_and_saveexec_b64 s[0:1], vcc
 ; GFX1064-DPP-NEXT:    s_cbranch_execz .LBB7_3
 ; GFX1064-DPP-NEXT:  ; %bb.1:
-; GFX1064-DPP-NEXT:    s_load_dwordx2 s[44:45], s[34:35], 0x24
-; GFX1064-DPP-NEXT:    s_mov_b64 s[46:47], 0
+; GFX1064-DPP-NEXT:    s_load_dwordx2 s[52:53], s[34:35], 0x24
+; GFX1064-DPP-NEXT:    s_mov_b64 s[54:55], 0
 ; GFX1064-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1064-DPP-NEXT:    global_load_dwordx2 v[1:2], v0, s[44:45]
+; GFX1064-DPP-NEXT:    global_load_dwordx2 v[1:2], v0, s[52:53]
 ; GFX1064-DPP-NEXT:  .LBB7_2: ; %atomicrmw.start
 ; GFX1064-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1064-DPP-NEXT:    v_max_f64 v[3:4], v[41:42], v[41:42]
@@ -4989,56 +4989,56 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent
 ; GFX1064-DPP-NEXT:    s_getpc_b64 s[0:1]
 ; GFX1064-DPP-NEXT:    s_add_u32 s0, s0, __atomic_compare_exchange at gotpcrel32@lo+4
 ; GFX1064-DPP-NEXT:    s_addc_u32 s1, s1, __atomic_compare_exchange at gotpcrel32@hi+12
-; GFX1064-DPP-NEXT:    buffer_store_dword v2, off, s[48:51], 0 offset:4
-; GFX1064-DPP-NEXT:    buffer_store_dword v1, off, s[48:51], 0
+; GFX1064-DPP-NEXT:    buffer_store_dword v2, off, s[64:67], 0 offset:4
+; GFX1064-DPP-NEXT:    buffer_store_dword v1, off, s[64:67], 0
 ; GFX1064-DPP-NEXT:    s_load_dwordx2 s[16:17], s[0:1], 0x0
 ; GFX1064-DPP-NEXT:    v_mov_b32_e32 v31, v40
 ; GFX1064-DPP-NEXT:    v_mov_b32_e32 v0, 8
 ; GFX1064-DPP-NEXT:    v_mov_b32_e32 v1, 0
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v2, s44
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v2, s52
 ; GFX1064-DPP-NEXT:    v_mov_b32_e32 v7, 0
-; GFX1064-DPP-NEXT:    s_mov_b64 s[0:1], s[48:49]
-; GFX1064-DPP-NEXT:    s_mov_b64 s[4:5], s[40:41]
+; GFX1064-DPP-NEXT:    s_mov_b64 s[0:1], s[64:65]
+; GFX1064-DPP-NEXT:    s_mov_b64 s[4:5], s[48:49]
 ; GFX1064-DPP-NEXT:    s_mov_b64 s[6:7], s[38:39]
 ; GFX1064-DPP-NEXT:    s_mov_b64 s[10:11], s[36:37]
-; GFX1064-DPP-NEXT:    s_mov_b32 s12, s43
-; GFX1064-DPP-NEXT:    s_mov_b32 s13, s42
+; GFX1064-DPP-NEXT:    s_mov_b32 s12, s51
+; GFX1064-DPP-NEXT:    s_mov_b32 s13, s50
 ; GFX1064-DPP-NEXT:    s_mov_b32 s14, s33
-; GFX1064-DPP-NEXT:    s_mov_b64 s[2:3], s[50:51]
+; GFX1064-DPP-NEXT:    s_mov_b64 s[2:3], s[66:67]
 ; GFX1064-DPP-NEXT:    v_min_f64 v[3:4], v[5:6], v[3:4]
 ; GFX1064-DPP-NEXT:    v_mov_b32_e32 v5, 8
 ; GFX1064-DPP-NEXT:    v_mov_b32_e32 v6, 0
-; GFX1064-DPP-NEXT:    buffer_store_dword v4, off, s[48:51], 0 offset:12
-; GFX1064-DPP-NEXT:    buffer_store_dword v3, off, s[48:51], 0 offset:8
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v3, s45
+; GFX1064-DPP-NEXT:    buffer_store_dword v4, off, s[64:67], 0 offset:12
+; GFX1064-DPP-NEXT:    buffer_store_dword v3, off, s[64:67], 0 offset:8
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v3, s53
 ; GFX1064-DPP-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX1064-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1064-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GFX1064-DPP-NEXT:    s_clause 0x1
-; GFX1064-DPP-NEXT:    buffer_load_dword v1, off, s[48:51], 0
-; GFX1064-DPP-NEXT:    buffer_load_dword v2, off, s[48:51], 0 offset:4
+; GFX1064-DPP-NEXT:    buffer_load_dword v1, off, s[64:67], 0
+; GFX1064-DPP-NEXT:    buffer_load_dword v2, off, s[64:67], 0 offset:4
 ; GFX1064-DPP-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX1064-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX1064-DPP-NEXT:    s_or_b64 s[46:47], vcc, s[46:47]
-; GFX1064-DPP-NEXT:    s_andn2_b64 exec, exec, s[46:47]
+; GFX1064-DPP-NEXT:    s_or_b64 s[54:55], vcc, s[54:55]
+; GFX1064-DPP-NEXT:    s_andn2_b64 exec, exec, s[54:55]
 ; GFX1064-DPP-NEXT:    s_cbranch_execnz .LBB7_2
 ; GFX1064-DPP-NEXT:  .LBB7_3:
 ; GFX1064-DPP-NEXT:    s_endpgm
 ;
 ; GFX1032-DPP-LABEL: global_atomic_fmin_double_uni_address_div_value_agent_scope_unsafe:
 ; GFX1032-DPP:       ; %bb.0:
-; GFX1032-DPP-NEXT:    s_mov_b32 s48, SCRATCH_RSRC_DWORD0
-; GFX1032-DPP-NEXT:    s_mov_b32 s49, SCRATCH_RSRC_DWORD1
-; GFX1032-DPP-NEXT:    s_mov_b32 s50, -1
-; GFX1032-DPP-NEXT:    s_mov_b32 s51, 0x31c16000
-; GFX1032-DPP-NEXT:    s_add_u32 s48, s48, s11
+; GFX1032-DPP-NEXT:    s_mov_b32 s64, SCRATCH_RSRC_DWORD0
+; GFX1032-DPP-NEXT:    s_mov_b32 s65, SCRATCH_RSRC_DWORD1
+; GFX1032-DPP-NEXT:    s_mov_b32 s66, -1
+; GFX1032-DPP-NEXT:    s_mov_b32 s67, 0x31c16000
+; GFX1032-DPP-NEXT:    s_add_u32 s64, s64, s11
 ; GFX1032-DPP-NEXT:    s_mov_b64 s[34:35], s[4:5]
-; GFX1032-DPP-NEXT:    s_addc_u32 s49, s49, 0
-; GFX1032-DPP-NEXT:    s_mov_b32 s43, s8
+; GFX1032-DPP-NEXT:    s_addc_u32 s65, s65, 0
+; GFX1032-DPP-NEXT:    s_mov_b32 s51, s8
 ; GFX1032-DPP-NEXT:    s_add_u32 s8, s34, 44
-; GFX1032-DPP-NEXT:    s_mov_b32 s42, s9
+; GFX1032-DPP-NEXT:    s_mov_b32 s50, s9
 ; GFX1032-DPP-NEXT:    s_addc_u32 s9, s35, 0
-; GFX1032-DPP-NEXT:    s_mov_b64 s[40:41], s[0:1]
+; GFX1032-DPP-NEXT:    s_mov_b64 s[48:49], s[0:1]
 ; GFX1032-DPP-NEXT:    s_getpc_b64 s[0:1]
 ; GFX1032-DPP-NEXT:    s_add_u32 s0, s0, div.double.value at gotpcrel32@lo+4
 ; GFX1032-DPP-NEXT:    s_addc_u32 s1, s1, div.double.value at gotpcrel32@hi+12
@@ -5050,14 +5050,14 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent
 ; GFX1032-DPP-NEXT:    s_mov_b64 s[38:39], s[2:3]
 ; GFX1032-DPP-NEXT:    s_mov_b64 s[6:7], s[2:3]
 ; GFX1032-DPP-NEXT:    v_or3_b32 v40, v0, v1, v2
-; GFX1032-DPP-NEXT:    s_mov_b64 s[0:1], s[48:49]
-; GFX1032-DPP-NEXT:    s_mov_b64 s[4:5], s[40:41]
+; GFX1032-DPP-NEXT:    s_mov_b64 s[0:1], s[64:65]
+; GFX1032-DPP-NEXT:    s_mov_b64 s[4:5], s[48:49]
 ; GFX1032-DPP-NEXT:    s_mov_b64 s[10:11], s[36:37]
-; GFX1032-DPP-NEXT:    s_mov_b32 s12, s43
+; GFX1032-DPP-NEXT:    s_mov_b32 s12, s51
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v31, v40
-; GFX1032-DPP-NEXT:    s_mov_b32 s13, s42
+; GFX1032-DPP-NEXT:    s_mov_b32 s13, s50
 ; GFX1032-DPP-NEXT:    s_mov_b32 s14, s33
-; GFX1032-DPP-NEXT:    s_mov_b64 s[2:3], s[50:51]
+; GFX1032-DPP-NEXT:    s_mov_b64 s[2:3], s[66:67]
 ; GFX1032-DPP-NEXT:    s_movk_i32 s32, 0x400
 ; GFX1032-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1032-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
@@ -5098,15 +5098,15 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v3, v8
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v4, v9
-; GFX1032-DPP-NEXT:    s_mov_b32 s46, 0
+; GFX1032-DPP-NEXT:    s_mov_b32 s54, 0
 ; GFX1032-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
 ; GFX1032-DPP-NEXT:    s_and_saveexec_b32 s0, vcc_lo
 ; GFX1032-DPP-NEXT:    s_cbranch_execz .LBB7_3
 ; GFX1032-DPP-NEXT:  ; %bb.1:
-; GFX1032-DPP-NEXT:    s_load_dwordx2 s[44:45], s[34:35], 0x24
+; GFX1032-DPP-NEXT:    s_load_dwordx2 s[52:53], s[34:35], 0x24
 ; GFX1032-DPP-NEXT:    v_max_f64 v[41:42], v[3:4], v[3:4]
 ; GFX1032-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT:    global_load_dwordx2 v[1:2], v0, s[44:45]
+; GFX1032-DPP-NEXT:    global_load_dwordx2 v[1:2], v0, s[52:53]
 ; GFX1032-DPP-NEXT:  .LBB7_2: ; %atomicrmw.start
 ; GFX1032-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1032-DPP-NEXT:    s_waitcnt vmcnt(0)
@@ -5116,38 +5116,38 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent
 ; GFX1032-DPP-NEXT:    s_getpc_b64 s[0:1]
 ; GFX1032-DPP-NEXT:    s_add_u32 s0, s0, __atomic_compare_exchange at gotpcrel32@lo+4
 ; GFX1032-DPP-NEXT:    s_addc_u32 s1, s1, __atomic_compare_exchange at gotpcrel32@hi+12
-; GFX1032-DPP-NEXT:    buffer_store_dword v2, off, s[48:51], 0 offset:4
-; GFX1032-DPP-NEXT:    buffer_store_dword v1, off, s[48:51], 0
+; GFX1032-DPP-NEXT:    buffer_store_dword v2, off, s[64:67], 0 offset:4
+; GFX1032-DPP-NEXT:    buffer_store_dword v1, off, s[64:67], 0
 ; GFX1032-DPP-NEXT:    s_load_dwordx2 s[16:17], s[0:1], 0x0
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v31, v40
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v0, 8
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v1, 0
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v2, s44
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v2, s52
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v5, 8
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v7, 0
-; GFX1032-DPP-NEXT:    s_mov_b64 s[0:1], s[48:49]
-; GFX1032-DPP-NEXT:    s_mov_b64 s[4:5], s[40:41]
+; GFX1032-DPP-NEXT:    s_mov_b64 s[0:1], s[64:65]
+; GFX1032-DPP-NEXT:    s_mov_b64 s[4:5], s[48:49]
 ; GFX1032-DPP-NEXT:    s_mov_b64 s[6:7], s[38:39]
 ; GFX1032-DPP-NEXT:    s_mov_b64 s[10:11], s[36:37]
-; GFX1032-DPP-NEXT:    s_mov_b32 s12, s43
-; GFX1032-DPP-NEXT:    s_mov_b32 s13, s42
+; GFX1032-DPP-NEXT:    s_mov_b32 s12, s51
+; GFX1032-DPP-NEXT:    s_mov_b32 s13, s50
 ; GFX1032-DPP-NEXT:    s_mov_b32 s14, s33
-; GFX1032-DPP-NEXT:    s_mov_b64 s[2:3], s[50:51]
+; GFX1032-DPP-NEXT:    s_mov_b64 s[2:3], s[66:67]
 ; GFX1032-DPP-NEXT:    v_min_f64 v[3:4], v[3:4], v[41:42]
-; GFX1032-DPP-NEXT:    buffer_store_dword v4, off, s[48:51], 0 offset:12
-; GFX1032-DPP-NEXT:    buffer_store_dword v3, off, s[48:51], 0 offset:8
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v3, s45
+; GFX1032-DPP-NEXT:    buffer_store_dword v4, off, s[64:67], 0 offset:12
+; GFX1032-DPP-NEXT:    buffer_store_dword v3, off, s[64:67], 0 offset:8
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v3, s53
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX1032-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1032-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GFX1032-DPP-NEXT:    s_clause 0x1
-; GFX1032-DPP-NEXT:    buffer_load_dword v1, off, s[48:51], 0
-; GFX1032-DPP-NEXT:    buffer_load_dword v2, off, s[48:51], 0 offset:4
+; GFX1032-DPP-NEXT:    buffer_load_dword v1, off, s[64:67], 0
+; GFX1032-DPP-NEXT:    buffer_load_dword v2, off, s[64:67], 0 offset:4
 ; GFX1032-DPP-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX1032-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX1032-DPP-NEXT:    s_or_b32 s46, vcc_lo, s46
-; GFX1032-DPP-NEXT:    s_andn2_b32 exec_lo, exec_lo, s46
+; GFX1032-DPP-NEXT:    s_or_b32 s54, vcc_lo, s54
+; GFX1032-DPP-NEXT:    s_andn2_b32 exec_lo, exec_lo, s54
 ; GFX1032-DPP-NEXT:    s_cbranch_execnz .LBB7_2
 ; GFX1032-DPP-NEXT:  .LBB7_3:
 ; GFX1032-DPP-NEXT:    s_endpgm
@@ -5155,11 +5155,11 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent
 ; GFX1164-DPP-LABEL: global_atomic_fmin_double_uni_address_div_value_agent_scope_unsafe:
 ; GFX1164-DPP:       ; %bb.0:
 ; GFX1164-DPP-NEXT:    s_mov_b64 s[34:35], s[4:5]
-; GFX1164-DPP-NEXT:    s_mov_b32 s43, s8
+; GFX1164-DPP-NEXT:    s_mov_b32 s51, s8
 ; GFX1164-DPP-NEXT:    s_add_u32 s8, s34, 44
-; GFX1164-DPP-NEXT:    s_mov_b32 s42, s9
+; GFX1164-DPP-NEXT:    s_mov_b32 s50, s9
 ; GFX1164-DPP-NEXT:    s_addc_u32 s9, s35, 0
-; GFX1164-DPP-NEXT:    s_mov_b64 s[40:41], s[0:1]
+; GFX1164-DPP-NEXT:    s_mov_b64 s[48:49], s[0:1]
 ; GFX1164-DPP-NEXT:    s_getpc_b64 s[0:1]
 ; GFX1164-DPP-NEXT:    s_add_u32 s0, s0, div.double.value at gotpcrel32@lo+4
 ; GFX1164-DPP-NEXT:    s_addc_u32 s1, s1, div.double.value at gotpcrel32@hi+12
@@ -5167,11 +5167,11 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent
 ; GFX1164-DPP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
 ; GFX1164-DPP-NEXT:    s_mov_b32 s33, s10
 ; GFX1164-DPP-NEXT:    s_mov_b64 s[36:37], s[6:7]
-; GFX1164-DPP-NEXT:    s_mov_b64 s[4:5], s[40:41]
+; GFX1164-DPP-NEXT:    s_mov_b64 s[4:5], s[48:49]
 ; GFX1164-DPP-NEXT:    s_mov_b64 s[6:7], s[2:3]
 ; GFX1164-DPP-NEXT:    s_mov_b64 s[10:11], s[36:37]
-; GFX1164-DPP-NEXT:    s_mov_b32 s12, s43
-; GFX1164-DPP-NEXT:    s_mov_b32 s13, s42
+; GFX1164-DPP-NEXT:    s_mov_b32 s12, s51
+; GFX1164-DPP-NEXT:    s_mov_b32 s13, s50
 ; GFX1164-DPP-NEXT:    s_mov_b32 s14, s33
 ; GFX1164-DPP-NEXT:    s_mov_b32 s32, 32
 ; GFX1164-DPP-NEXT:    v_mov_b32_e32 v40, v0
@@ -5239,11 +5239,11 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent
 ; GFX1164-DPP-NEXT:    v_cmpx_eq_u32_e32 0, v1
 ; GFX1164-DPP-NEXT:    s_cbranch_execz .LBB7_3
 ; GFX1164-DPP-NEXT:  ; %bb.1:
-; GFX1164-DPP-NEXT:    s_load_b64 s[44:45], s[34:35], 0x24
+; GFX1164-DPP-NEXT:    s_load_b64 s[52:53], s[34:35], 0x24
 ; GFX1164-DPP-NEXT:    v_max_f64 v[41:42], v[3:4], v[3:4]
-; GFX1164-DPP-NEXT:    s_mov_b64 s[46:47], 0
+; GFX1164-DPP-NEXT:    s_mov_b64 s[54:55], 0
 ; GFX1164-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT:    global_load_b64 v[1:2], v0, s[44:45]
+; GFX1164-DPP-NEXT:    global_load_b64 v[1:2], v0, s[52:53]
 ; GFX1164-DPP-NEXT:    s_set_inst_prefetch_distance 0x1
 ; GFX1164-DPP-NEXT:    .p2align 6
 ; GFX1164-DPP-NEXT:  .LBB7_2: ; %atomicrmw.start
@@ -5261,18 +5261,18 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent
 ; GFX1164-DPP-NEXT:    v_mov_b32_e32 v5, 8
 ; GFX1164-DPP-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX1164-DPP-NEXT:    v_mov_b32_e32 v7, 0
-; GFX1164-DPP-NEXT:    s_mov_b64 s[4:5], s[40:41]
+; GFX1164-DPP-NEXT:    s_mov_b64 s[4:5], s[48:49]
 ; GFX1164-DPP-NEXT:    s_mov_b64 s[6:7], s[38:39]
 ; GFX1164-DPP-NEXT:    s_mov_b64 s[10:11], s[36:37]
-; GFX1164-DPP-NEXT:    s_mov_b32 s12, s43
-; GFX1164-DPP-NEXT:    s_mov_b32 s13, s42
+; GFX1164-DPP-NEXT:    s_mov_b32 s12, s51
+; GFX1164-DPP-NEXT:    s_mov_b32 s13, s50
 ; GFX1164-DPP-NEXT:    s_mov_b32 s14, s33
 ; GFX1164-DPP-NEXT:    v_min_f64 v[3:4], v[3:4], v[41:42]
 ; GFX1164-DPP-NEXT:    scratch_store_b64 off, v[1:2], off
 ; GFX1164-DPP-NEXT:    v_mov_b32_e32 v1, 0
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v2, s44
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v2, s52
 ; GFX1164-DPP-NEXT:    scratch_store_b64 off, v[3:4], off offset:8
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v3, s45
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v3, s53
 ; GFX1164-DPP-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX1164-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1164-DPP-NEXT:    s_swappc_b64 s[30:31], s[0:1]
@@ -5280,8 +5280,8 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent
 ; GFX1164-DPP-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
 ; GFX1164-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX1164-DPP-NEXT:    s_or_b64 s[46:47], vcc, s[46:47]
-; GFX1164-DPP-NEXT:    s_and_not1_b64 exec, exec, s[46:47]
+; GFX1164-DPP-NEXT:    s_or_b64 s[54:55], vcc, s[54:55]
+; GFX1164-DPP-NEXT:    s_and_not1_b64 exec, exec, s[54:55]
 ; GFX1164-DPP-NEXT:    s_cbranch_execnz .LBB7_2
 ; GFX1164-DPP-NEXT:  .LBB7_3:
 ; GFX1164-DPP-NEXT:    s_set_inst_prefetch_distance 0x2
@@ -5290,7 +5290,7 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent
 ; GFX1132-DPP-LABEL: global_atomic_fmin_double_uni_address_div_value_agent_scope_unsafe:
 ; GFX1132-DPP:       ; %bb.0:
 ; GFX1132-DPP-NEXT:    s_mov_b64 s[34:35], s[4:5]
-; GFX1132-DPP-NEXT:    s_mov_b64 s[40:41], s[0:1]
+; GFX1132-DPP-NEXT:    s_mov_b64 s[48:49], s[0:1]
 ; GFX1132-DPP-NEXT:    s_add_u32 s8, s34, 44
 ; GFX1132-DPP-NEXT:    s_addc_u32 s9, s35, 0
 ; GFX1132-DPP-NEXT:    s_getpc_b64 s[0:1]
@@ -5299,9 +5299,9 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent
 ; GFX1132-DPP-NEXT:    v_mov_b32_e32 v31, v0
 ; GFX1132-DPP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
 ; GFX1132-DPP-NEXT:    s_mov_b64 s[36:37], s[6:7]
-; GFX1132-DPP-NEXT:    s_mov_b32 s42, s14
-; GFX1132-DPP-NEXT:    s_mov_b32 s43, s13
-; GFX1132-DPP-NEXT:    s_mov_b64 s[4:5], s[40:41]
+; GFX1132-DPP-NEXT:    s_mov_b32 s50, s14
+; GFX1132-DPP-NEXT:    s_mov_b32 s51, s13
+; GFX1132-DPP-NEXT:    s_mov_b64 s[4:5], s[48:49]
 ; GFX1132-DPP-NEXT:    s_mov_b64 s[6:7], s[2:3]
 ; GFX1132-DPP-NEXT:    s_mov_b64 s[10:11], s[36:37]
 ; GFX1132-DPP-NEXT:    s_mov_b32 s12, s13
@@ -5356,16 +5356,16 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent
 ; GFX1132-DPP-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v3, v8
 ; GFX1132-DPP-NEXT:    v_mbcnt_lo_u32_b32 v1, exec_lo, 0
 ; GFX1132-DPP-NEXT:    v_mov_b32_e32 v4, v9
-; GFX1132-DPP-NEXT:    s_mov_b32 s46, 0
+; GFX1132-DPP-NEXT:    s_mov_b32 s54, 0
 ; GFX1132-DPP-NEXT:    s_mov_b32 s0, exec_lo
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX1132-DPP-NEXT:    v_cmpx_eq_u32_e32 0, v1
 ; GFX1132-DPP-NEXT:    s_cbranch_execz .LBB7_3
 ; GFX1132-DPP-NEXT:  ; %bb.1:
-; GFX1132-DPP-NEXT:    s_load_b64 s[44:45], s[34:35], 0x24
+; GFX1132-DPP-NEXT:    s_load_b64 s[52:53], s[34:35], 0x24
 ; GFX1132-DPP-NEXT:    v_max_f64 v[41:42], v[3:4], v[3:4]
 ; GFX1132-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT:    global_load_b64 v[1:2], v0, s[44:45]
+; GFX1132-DPP-NEXT:    global_load_b64 v[1:2], v0, s[52:53]
 ; GFX1132-DPP-NEXT:    s_set_inst_prefetch_distance 0x1
 ; GFX1132-DPP-NEXT:    .p2align 6
 ; GFX1132-DPP-NEXT:  .LBB7_2: ; %atomicrmw.start
@@ -5381,26 +5381,26 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent
 ; GFX1132-DPP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
 ; GFX1132-DPP-NEXT:    v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0
 ; GFX1132-DPP-NEXT:    v_mov_b32_e32 v7, 0
-; GFX1132-DPP-NEXT:    s_mov_b64 s[4:5], s[40:41]
+; GFX1132-DPP-NEXT:    s_mov_b64 s[4:5], s[48:49]
 ; GFX1132-DPP-NEXT:    s_mov_b64 s[6:7], s[38:39]
 ; GFX1132-DPP-NEXT:    s_mov_b64 s[10:11], s[36:37]
-; GFX1132-DPP-NEXT:    s_mov_b32 s12, s43
-; GFX1132-DPP-NEXT:    s_mov_b32 s13, s42
+; GFX1132-DPP-NEXT:    s_mov_b32 s12, s51
+; GFX1132-DPP-NEXT:    s_mov_b32 s13, s50
 ; GFX1132-DPP-NEXT:    s_mov_b32 s14, s33
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_4)
 ; GFX1132-DPP-NEXT:    v_min_f64 v[3:4], v[3:4], v[41:42]
 ; GFX1132-DPP-NEXT:    scratch_store_b64 off, v[1:2], off
-; GFX1132-DPP-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s44
+; GFX1132-DPP-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s52
 ; GFX1132-DPP-NEXT:    scratch_store_b64 off, v[3:4], off offset:8
-; GFX1132-DPP-NEXT:    v_dual_mov_b32 v3, s45 :: v_dual_mov_b32 v4, 0
+; GFX1132-DPP-NEXT:    v_dual_mov_b32 v3, s53 :: v_dual_mov_b32 v4, 0
 ; GFX1132-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132-DPP-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX1132-DPP-NEXT:    scratch_load_b64 v[1:2], off, off
 ; GFX1132-DPP-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
 ; GFX1132-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX1132-DPP-NEXT:    s_or_b32 s46, vcc_lo, s46
-; GFX1132-DPP-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s46
+; GFX1132-DPP-NEXT:    s_or_b32 s54, vcc_lo, s54
+; GFX1132-DPP-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s54
 ; GFX1132-DPP-NEXT:    s_cbranch_execnz .LBB7_2
 ; GFX1132-DPP-NEXT:  .LBB7_3:
 ; GFX1132-DPP-NEXT:    s_set_inst_prefetch_distance 0x2
@@ -6162,12 +6162,12 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_one_a
 ; GFX7LESS-DPP-LABEL: global_atomic_fmin_double_uni_address_div_value_one_as_scope_unsafe:
 ; GFX7LESS-DPP:       ; %bb.0:
 ; GFX7LESS-DPP-NEXT:    s_mov_b32 s32, 0
-; GFX7LESS-DPP-NEXT:    s_mov_b32 s40, SCRATCH_RSRC_DWORD0
-; GFX7LESS-DPP-NEXT:    s_mov_b32 s41, SCRATCH_RSRC_DWORD1
-; GFX7LESS-DPP-NEXT:    s_mov_b32 s42, -1
-; GFX7LESS-DPP-NEXT:    s_mov_b32 s43, 0xe8f000
-; GFX7LESS-DPP-NEXT:    s_add_u32 s40, s40, s11
-; GFX7LESS-DPP-NEXT:    s_addc_u32 s41, s41, 0
+; GFX7LESS-DPP-NEXT:    s_mov_b32 s48, SCRATCH_RSRC_DWORD0
+; GFX7LESS-DPP-NEXT:    s_mov_b32 s49, SCRATCH_RSRC_DWORD1
+; GFX7LESS-DPP-NEXT:    s_mov_b32 s50, -1
+; GFX7LESS-DPP-NEXT:    s_mov_b32 s51, 0xe8f000
+; GFX7LESS-DPP-NEXT:    s_add_u32 s48, s48, s11
+; GFX7LESS-DPP-NEXT:    s_addc_u32 s49, s49, 0
 ; GFX7LESS-DPP-NEXT:    s_mov_b32 s14, s10
 ; GFX7LESS-DPP-NEXT:    s_mov_b32 s13, s9
 ; GFX7LESS-DPP-NEXT:    s_mov_b32 s12, s8
@@ -6187,8 +6187,8 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_one_a
 ; GFX7LESS-DPP-NEXT:    v_or_b32_e32 v31, v0, v2
 ; GFX7LESS-DPP-NEXT:    s_mov_b64 s[4:5], s[0:1]
 ; GFX7LESS-DPP-NEXT:    s_mov_b64 s[6:7], s[2:3]
-; GFX7LESS-DPP-NEXT:    s_mov_b64 s[0:1], s[40:41]
-; GFX7LESS-DPP-NEXT:    s_mov_b64 s[2:3], s[42:43]
+; GFX7LESS-DPP-NEXT:    s_mov_b64 s[0:1], s[48:49]
+; GFX7LESS-DPP-NEXT:    s_mov_b64 s[2:3], s[50:51]
 ; GFX7LESS-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7LESS-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GFX7LESS-DPP-NEXT:    buffer_load_dwordx2 v[2:3], off, s[36:39], 0
@@ -6683,13 +6683,13 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_defau
 ; GFX7LESS-LABEL: global_atomic_fmin_double_uni_address_uni_value_default_scope_unsafe:
 ; GFX7LESS:       ; %bb.0:
 ; GFX7LESS-NEXT:    s_movk_i32 s32, 0x800
-; GFX7LESS-NEXT:    s_mov_b32 s48, SCRATCH_RSRC_DWORD0
-; GFX7LESS-NEXT:    s_mov_b32 s49, SCRATCH_RSRC_DWORD1
-; GFX7LESS-NEXT:    s_mov_b32 s50, -1
-; GFX7LESS-NEXT:    s_mov_b32 s51, 0xe8f000
-; GFX7LESS-NEXT:    s_add_u32 s48, s48, s11
-; GFX7LESS-NEXT:    s_addc_u32 s49, s49, 0
-; GFX7LESS-NEXT:    s_mov_b64 s[40:41], s[0:1]
+; GFX7LESS-NEXT:    s_mov_b32 s64, SCRATCH_RSRC_DWORD0
+; GFX7LESS-NEXT:    s_mov_b32 s65, SCRATCH_RSRC_DWORD1
+; GFX7LESS-NEXT:    s_mov_b32 s66, -1
+; GFX7LESS-NEXT:    s_mov_b32 s67, 0xe8f000
+; GFX7LESS-NEXT:    s_add_u32 s64, s64, s11
+; GFX7LESS-NEXT:    s_addc_u32 s65, s65, 0
+; GFX7LESS-NEXT:    s_mov_b64 s[48:49], s[0:1]
 ; GFX7LESS-NEXT:    v_mbcnt_lo_u32_b32_e64 v3, exec_lo, 0
 ; GFX7LESS-NEXT:    v_mbcnt_hi_u32_b32_e32 v3, exec_hi, v3
 ; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
@@ -6697,15 +6697,15 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_defau
 ; GFX7LESS-NEXT:    s_cbranch_execz .LBB10_3
 ; GFX7LESS-NEXT:  ; %bb.1:
 ; GFX7LESS-NEXT:    s_mov_b32 s33, s10
-; GFX7LESS-NEXT:    s_mov_b32 s42, s9
-; GFX7LESS-NEXT:    s_mov_b32 s43, s8
+; GFX7LESS-NEXT:    s_mov_b32 s50, s9
+; GFX7LESS-NEXT:    s_mov_b32 s51, s8
 ; GFX7LESS-NEXT:    s_mov_b64 s[34:35], s[6:7]
 ; GFX7LESS-NEXT:    s_mov_b64 s[36:37], s[4:5]
 ; GFX7LESS-NEXT:    s_mov_b64 s[38:39], s[2:3]
-; GFX7LESS-NEXT:    s_load_dwordx2 s[44:45], s[4:5], 0x9
+; GFX7LESS-NEXT:    s_load_dwordx2 s[52:53], s[4:5], 0x9
 ; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7LESS-NEXT:    s_load_dwordx2 s[0:1], s[44:45], 0x0
-; GFX7LESS-NEXT:    s_mov_b64 s[46:47], 0
+; GFX7LESS-NEXT:    s_load_dwordx2 s[0:1], s[52:53], 0x0
+; GFX7LESS-NEXT:    s_mov_b64 s[54:55], 0
 ; GFX7LESS-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
 ; GFX7LESS-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
 ; GFX7LESS-NEXT:    v_or_b32_e32 v3, v0, v1
@@ -6717,8 +6717,8 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_defau
 ; GFX7LESS-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX7LESS-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7LESS-NEXT:    v_max_f64 v[2:3], v[0:1], v[0:1]
-; GFX7LESS-NEXT:    buffer_store_dword v1, off, s[48:51], 0 offset:4
-; GFX7LESS-NEXT:    buffer_store_dword v0, off, s[48:51], 0
+; GFX7LESS-NEXT:    buffer_store_dword v1, off, s[64:67], 0 offset:4
+; GFX7LESS-NEXT:    buffer_store_dword v0, off, s[64:67], 0
 ; GFX7LESS-NEXT:    s_add_u32 s8, s36, 44
 ; GFX7LESS-NEXT:    s_waitcnt expcnt(0)
 ; GFX7LESS-NEXT:    v_min_f64 v[0:1], v[2:3], 4.0
@@ -6726,8 +6726,8 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_defau
 ; GFX7LESS-NEXT:    s_getpc_b64 s[0:1]
 ; GFX7LESS-NEXT:    s_add_u32 s0, s0, __atomic_compare_exchange at gotpcrel32@lo+4
 ; GFX7LESS-NEXT:    s_addc_u32 s1, s1, __atomic_compare_exchange at gotpcrel32@hi+12
-; GFX7LESS-NEXT:    buffer_store_dword v1, off, s[48:51], 0 offset:12
-; GFX7LESS-NEXT:    buffer_store_dword v0, off, s[48:51], 0 offset:8
+; GFX7LESS-NEXT:    buffer_store_dword v1, off, s[64:67], 0 offset:12
+; GFX7LESS-NEXT:    buffer_store_dword v0, off, s[64:67], 0 offset:8
 ; GFX7LESS-NEXT:    s_load_dwordx2 s[16:17], s[0:1], 0x0
 ; GFX7LESS-NEXT:    s_waitcnt expcnt(0)
 ; GFX7LESS-NEXT:    v_mov_b32_e32 v0, 8
@@ -6736,59 +6736,59 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_defau
 ; GFX7LESS-NEXT:    v_mov_b32_e32 v5, 8
 ; GFX7LESS-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX7LESS-NEXT:    v_mov_b32_e32 v7, 0
-; GFX7LESS-NEXT:    s_mov_b64 s[4:5], s[40:41]
+; GFX7LESS-NEXT:    s_mov_b64 s[4:5], s[48:49]
 ; GFX7LESS-NEXT:    s_mov_b64 s[6:7], s[38:39]
 ; GFX7LESS-NEXT:    s_mov_b64 s[10:11], s[34:35]
-; GFX7LESS-NEXT:    s_mov_b32 s12, s43
-; GFX7LESS-NEXT:    s_mov_b32 s13, s42
+; GFX7LESS-NEXT:    s_mov_b32 s12, s51
+; GFX7LESS-NEXT:    s_mov_b32 s13, s50
 ; GFX7LESS-NEXT:    s_mov_b32 s14, s33
 ; GFX7LESS-NEXT:    v_mov_b32_e32 v31, v40
-; GFX7LESS-NEXT:    s_mov_b64 s[0:1], s[48:49]
-; GFX7LESS-NEXT:    s_mov_b64 s[2:3], s[50:51]
-; GFX7LESS-NEXT:    v_mov_b32_e32 v2, s44
-; GFX7LESS-NEXT:    v_mov_b32_e32 v3, s45
+; GFX7LESS-NEXT:    s_mov_b64 s[0:1], s[64:65]
+; GFX7LESS-NEXT:    s_mov_b64 s[2:3], s[66:67]
+; GFX7LESS-NEXT:    v_mov_b32_e32 v2, s52
+; GFX7LESS-NEXT:    v_mov_b32_e32 v3, s53
 ; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7LESS-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GFX7LESS-NEXT:    v_and_b32_e32 v2, 1, v0
-; GFX7LESS-NEXT:    buffer_load_dword v0, off, s[48:51], 0
-; GFX7LESS-NEXT:    buffer_load_dword v1, off, s[48:51], 0 offset:4
+; GFX7LESS-NEXT:    buffer_load_dword v0, off, s[64:67], 0
+; GFX7LESS-NEXT:    buffer_load_dword v1, off, s[64:67], 0 offset:4
 ; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v2
-; GFX7LESS-NEXT:    s_or_b64 s[46:47], vcc, s[46:47]
-; GFX7LESS-NEXT:    s_andn2_b64 exec, exec, s[46:47]
+; GFX7LESS-NEXT:    s_or_b64 s[54:55], vcc, s[54:55]
+; GFX7LESS-NEXT:    s_andn2_b64 exec, exec, s[54:55]
 ; GFX7LESS-NEXT:    s_cbranch_execnz .LBB10_2
 ; GFX7LESS-NEXT:  .LBB10_3:
 ; GFX7LESS-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: global_atomic_fmin_double_uni_address_uni_value_default_scope_unsafe:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_mov_b32 s48, SCRATCH_RSRC_DWORD0
-; GFX9-NEXT:    s_mov_b32 s49, SCRATCH_RSRC_DWORD1
-; GFX9-NEXT:    s_mov_b32 s50, -1
-; GFX9-NEXT:    s_mov_b32 s51, 0xe00000
+; GFX9-NEXT:    s_mov_b32 s64, SCRATCH_RSRC_DWORD0
+; GFX9-NEXT:    s_mov_b32 s65, SCRATCH_RSRC_DWORD1
+; GFX9-NEXT:    s_mov_b32 s66, -1
+; GFX9-NEXT:    s_mov_b32 s67, 0xe00000
 ; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v3, exec_lo, 0
-; GFX9-NEXT:    s_add_u32 s48, s48, s11
+; GFX9-NEXT:    s_add_u32 s64, s64, s11
 ; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v3, exec_hi, v3
-; GFX9-NEXT:    s_addc_u32 s49, s49, 0
-; GFX9-NEXT:    s_mov_b64 s[40:41], s[0:1]
+; GFX9-NEXT:    s_addc_u32 s65, s65, 0
+; GFX9-NEXT:    s_mov_b64 s[48:49], s[0:1]
 ; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
 ; GFX9-NEXT:    s_movk_i32 s32, 0x800
 ; GFX9-NEXT:    s_and_saveexec_b64 s[0:1], vcc
 ; GFX9-NEXT:    s_cbranch_execz .LBB10_3
 ; GFX9-NEXT:  ; %bb.1:
-; GFX9-NEXT:    s_load_dwordx2 s[44:45], s[4:5], 0x24
+; GFX9-NEXT:    s_load_dwordx2 s[52:53], s[4:5], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v3, 20, v2
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 10, v1
 ; GFX9-NEXT:    s_mov_b32 s33, s10
-; GFX9-NEXT:    s_mov_b32 s42, s9
+; GFX9-NEXT:    s_mov_b32 s50, s9
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[44:45], 0x0
-; GFX9-NEXT:    s_mov_b32 s43, s8
+; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[52:53], 0x0
+; GFX9-NEXT:    s_mov_b32 s51, s8
 ; GFX9-NEXT:    s_mov_b64 s[34:35], s[6:7]
 ; GFX9-NEXT:    s_mov_b64 s[36:37], s[4:5]
 ; GFX9-NEXT:    s_mov_b64 s[38:39], s[2:3]
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v2, s1
-; GFX9-NEXT:    s_mov_b64 s[46:47], 0
+; GFX9-NEXT:    s_mov_b64 s[54:55], 0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX9-NEXT:    v_or3_b32 v40, v0, v4, v3
 ; GFX9-NEXT:  .LBB10_2: ; %atomicrmw.start
@@ -6801,36 +6801,36 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_defau
 ; GFX9-NEXT:    s_add_u32 s0, s0, __atomic_compare_exchange at gotpcrel32@lo+4
 ; GFX9-NEXT:    s_addc_u32 s1, s1, __atomic_compare_exchange at gotpcrel32@hi+12
 ; GFX9-NEXT:    s_load_dwordx2 s[16:17], s[0:1], 0x0
-; GFX9-NEXT:    s_mov_b64 s[0:1], s[48:49]
-; GFX9-NEXT:    buffer_store_dword v2, off, s[48:51], 0 offset:4
-; GFX9-NEXT:    buffer_store_dword v1, off, s[48:51], 0
-; GFX9-NEXT:    s_mov_b64 s[4:5], s[40:41]
+; GFX9-NEXT:    s_mov_b64 s[0:1], s[64:65]
+; GFX9-NEXT:    buffer_store_dword v2, off, s[64:67], 0 offset:4
+; GFX9-NEXT:    buffer_store_dword v1, off, s[64:67], 0
+; GFX9-NEXT:    s_mov_b64 s[4:5], s[48:49]
 ; GFX9-NEXT:    v_min_f64 v[3:4], v[3:4], 4.0
 ; GFX9-NEXT:    s_mov_b64 s[6:7], s[38:39]
 ; GFX9-NEXT:    s_mov_b64 s[10:11], s[34:35]
-; GFX9-NEXT:    s_mov_b32 s12, s43
-; GFX9-NEXT:    s_mov_b32 s13, s42
+; GFX9-NEXT:    s_mov_b32 s12, s51
+; GFX9-NEXT:    s_mov_b32 s13, s50
 ; GFX9-NEXT:    s_mov_b32 s14, s33
 ; GFX9-NEXT:    v_mov_b32_e32 v31, v40
-; GFX9-NEXT:    s_mov_b64 s[2:3], s[50:51]
-; GFX9-NEXT:    buffer_store_dword v4, off, s[48:51], 0 offset:12
-; GFX9-NEXT:    buffer_store_dword v3, off, s[48:51], 0 offset:8
+; GFX9-NEXT:    s_mov_b64 s[2:3], s[66:67]
+; GFX9-NEXT:    buffer_store_dword v4, off, s[64:67], 0 offset:12
+; GFX9-NEXT:    buffer_store_dword v3, off, s[64:67], 0 offset:8
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 8
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0
-; GFX9-NEXT:    v_mov_b32_e32 v2, s44
-; GFX9-NEXT:    v_mov_b32_e32 v3, s45
+; GFX9-NEXT:    v_mov_b32_e32 v2, s52
+; GFX9-NEXT:    v_mov_b32_e32 v3, s53
 ; GFX9-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v5, 8
 ; GFX9-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v7, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[16:17]
-; GFX9-NEXT:    buffer_load_dword v1, off, s[48:51], 0
-; GFX9-NEXT:    buffer_load_dword v2, off, s[48:51], 0 offset:4
+; GFX9-NEXT:    buffer_load_dword v1, off, s[64:67], 0
+; GFX9-NEXT:    buffer_load_dword v2, off, s[64:67], 0 offset:4
 ; GFX9-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX9-NEXT:    s_or_b64 s[46:47], vcc, s[46:47]
-; GFX9-NEXT:    s_andn2_b64 exec, exec, s[46:47]
+; GFX9-NEXT:    s_or_b64 s[54:55], vcc, s[54:55]
+; GFX9-NEXT:    s_andn2_b64 exec, exec, s[54:55]
 ; GFX9-NEXT:    s_cbranch_execnz .LBB10_2
 ; GFX9-NEXT:  .LBB10_3:
 ; GFX9-NEXT:    s_endpgm
@@ -6838,32 +6838,32 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_defau
 ; GFX1064-LABEL: global_atomic_fmin_double_uni_address_uni_value_default_scope_unsafe:
 ; GFX1064:       ; %bb.0:
 ; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v3, exec_lo, 0
-; GFX1064-NEXT:    s_mov_b32 s48, SCRATCH_RSRC_DWORD0
-; GFX1064-NEXT:    s_mov_b32 s49, SCRATCH_RSRC_DWORD1
-; GFX1064-NEXT:    s_mov_b32 s50, -1
-; GFX1064-NEXT:    s_mov_b32 s51, 0x31e16000
+; GFX1064-NEXT:    s_mov_b32 s64, SCRATCH_RSRC_DWORD0
+; GFX1064-NEXT:    s_mov_b32 s65, SCRATCH_RSRC_DWORD1
+; GFX1064-NEXT:    s_mov_b32 s66, -1
+; GFX1064-NEXT:    s_mov_b32 s67, 0x31e16000
 ; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v3, exec_hi, v3
-; GFX1064-NEXT:    s_add_u32 s48, s48, s11
-; GFX1064-NEXT:    s_addc_u32 s49, s49, 0
-; GFX1064-NEXT:    s_mov_b64 s[40:41], s[0:1]
+; GFX1064-NEXT:    s_add_u32 s64, s64, s11
+; GFX1064-NEXT:    s_addc_u32 s65, s65, 0
+; GFX1064-NEXT:    s_mov_b64 s[48:49], s[0:1]
 ; GFX1064-NEXT:    s_movk_i32 s32, 0x800
 ; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
 ; GFX1064-NEXT:    s_and_saveexec_b64 s[0:1], vcc
 ; GFX1064-NEXT:    s_cbranch_execz .LBB10_3
 ; GFX1064-NEXT:  ; %bb.1:
-; GFX1064-NEXT:    s_load_dwordx2 s[44:45], s[4:5], 0x24
+; GFX1064-NEXT:    s_load_dwordx2 s[52:53], s[4:5], 0x24
 ; GFX1064-NEXT:    v_lshlrev_b32_e32 v3, 20, v2
 ; GFX1064-NEXT:    v_lshlrev_b32_e32 v4, 10, v1
 ; GFX1064-NEXT:    s_mov_b32 s33, s10
-; GFX1064-NEXT:    s_mov_b32 s42, s9
-; GFX1064-NEXT:    s_mov_b32 s43, s8
+; GFX1064-NEXT:    s_mov_b32 s50, s9
+; GFX1064-NEXT:    s_mov_b32 s51, s8
 ; GFX1064-NEXT:    s_mov_b64 s[34:35], s[6:7]
 ; GFX1064-NEXT:    v_or3_b32 v40, v0, v4, v3
 ; GFX1064-NEXT:    s_mov_b64 s[36:37], s[4:5]
 ; GFX1064-NEXT:    s_mov_b64 s[38:39], s[2:3]
-; GFX1064-NEXT:    s_mov_b64 s[46:47], 0
+; GFX1064-NEXT:    s_mov_b64 s[54:55], 0
 ; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[44:45], 0x0
+; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[52:53], 0x0
 ; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1064-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX1064-NEXT:    v_mov_b32_e32 v1, s0
@@ -6876,38 +6876,38 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_defau
 ; GFX1064-NEXT:    s_getpc_b64 s[0:1]
 ; GFX1064-NEXT:    s_add_u32 s0, s0, __atomic_compare_exchange at gotpcrel32@lo+4
 ; GFX1064-NEXT:    s_addc_u32 s1, s1, __atomic_compare_exchange at gotpcrel32@hi+12
-; GFX1064-NEXT:    buffer_store_dword v2, off, s[48:51], 0 offset:4
-; GFX1064-NEXT:    buffer_store_dword v1, off, s[48:51], 0
+; GFX1064-NEXT:    buffer_store_dword v2, off, s[64:67], 0 offset:4
+; GFX1064-NEXT:    buffer_store_dword v1, off, s[64:67], 0
 ; GFX1064-NEXT:    s_load_dwordx2 s[16:17], s[0:1], 0x0
 ; GFX1064-NEXT:    v_mov_b32_e32 v31, v40
 ; GFX1064-NEXT:    v_mov_b32_e32 v0, 8
 ; GFX1064-NEXT:    v_mov_b32_e32 v1, 0
-; GFX1064-NEXT:    v_mov_b32_e32 v2, s44
+; GFX1064-NEXT:    v_mov_b32_e32 v2, s52
 ; GFX1064-NEXT:    v_mov_b32_e32 v5, 8
 ; GFX1064-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX1064-NEXT:    v_mov_b32_e32 v7, 0
-; GFX1064-NEXT:    s_mov_b64 s[0:1], s[48:49]
-; GFX1064-NEXT:    s_mov_b64 s[4:5], s[40:41]
+; GFX1064-NEXT:    s_mov_b64 s[0:1], s[64:65]
+; GFX1064-NEXT:    s_mov_b64 s[4:5], s[48:49]
 ; GFX1064-NEXT:    s_mov_b64 s[6:7], s[38:39]
 ; GFX1064-NEXT:    s_mov_b64 s[10:11], s[34:35]
-; GFX1064-NEXT:    s_mov_b32 s12, s43
-; GFX1064-NEXT:    s_mov_b32 s13, s42
+; GFX1064-NEXT:    s_mov_b32 s12, s51
+; GFX1064-NEXT:    s_mov_b32 s13, s50
 ; GFX1064-NEXT:    s_mov_b32 s14, s33
-; GFX1064-NEXT:    s_mov_b64 s[2:3], s[50:51]
+; GFX1064-NEXT:    s_mov_b64 s[2:3], s[66:67]
 ; GFX1064-NEXT:    v_min_f64 v[3:4], v[3:4], 4.0
-; GFX1064-NEXT:    buffer_store_dword v4, off, s[48:51], 0 offset:12
-; GFX1064-NEXT:    buffer_store_dword v3, off, s[48:51], 0 offset:8
-; GFX1064-NEXT:    v_mov_b32_e32 v3, s45
+; GFX1064-NEXT:    buffer_store_dword v4, off, s[64:67], 0 offset:12
+; GFX1064-NEXT:    buffer_store_dword v3, off, s[64:67], 0 offset:8
+; GFX1064-NEXT:    v_mov_b32_e32 v3, s53
 ; GFX1064-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1064-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GFX1064-NEXT:    s_clause 0x1
-; GFX1064-NEXT:    buffer_load_dword v1, off, s[48:51], 0
-; GFX1064-NEXT:    buffer_load_dword v2, off, s[48:51], 0 offset:4
+; GFX1064-NEXT:    buffer_load_dword v1, off, s[64:67], 0
+; GFX1064-NEXT:    buffer_load_dword v2, off, s[64:67], 0 offset:4
 ; GFX1064-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX1064-NEXT:    s_or_b64 s[46:47], vcc, s[46:47]
-; GFX1064-NEXT:    s_andn2_b64 exec, exec, s[46:47]
+; GFX1064-NEXT:    s_or_b64 s[54:55], vcc, s[54:55]
+; GFX1064-NEXT:    s_andn2_b64 exec, exec, s[54:55]
 ; GFX1064-NEXT:    s_cbranch_execnz .LBB10_2
 ; GFX1064-NEXT:  .LBB10_3:
 ; GFX1064-NEXT:    s_endpgm
@@ -6915,31 +6915,31 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_defau
 ; GFX1032-LABEL: global_atomic_fmin_double_uni_address_uni_value_default_scope_unsafe:
 ; GFX1032:       ; %bb.0:
 ; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v3, exec_lo, 0
-; GFX1032-NEXT:    s_mov_b32 s48, SCRATCH_RSRC_DWORD0
-; GFX1032-NEXT:    s_mov_b32 s49, SCRATCH_RSRC_DWORD1
-; GFX1032-NEXT:    s_mov_b32 s50, -1
-; GFX1032-NEXT:    s_mov_b32 s51, 0x31c16000
+; GFX1032-NEXT:    s_mov_b32 s64, SCRATCH_RSRC_DWORD0
+; GFX1032-NEXT:    s_mov_b32 s65, SCRATCH_RSRC_DWORD1
+; GFX1032-NEXT:    s_mov_b32 s66, -1
+; GFX1032-NEXT:    s_mov_b32 s67, 0x31c16000
 ; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v3
-; GFX1032-NEXT:    s_add_u32 s48, s48, s11
-; GFX1032-NEXT:    s_addc_u32 s49, s49, 0
-; GFX1032-NEXT:    s_mov_b64 s[40:41], s[0:1]
-; GFX1032-NEXT:    s_mov_b32 s46, 0
+; GFX1032-NEXT:    s_add_u32 s64, s64, s11
+; GFX1032-NEXT:    s_addc_u32 s65, s65, 0
+; GFX1032-NEXT:    s_mov_b64 s[48:49], s[0:1]
+; GFX1032-NEXT:    s_mov_b32 s54, 0
 ; GFX1032-NEXT:    s_movk_i32 s32, 0x400
 ; GFX1032-NEXT:    s_and_saveexec_b32 s0, vcc_lo
 ; GFX1032-NEXT:    s_cbranch_execz .LBB10_3
 ; GFX1032-NEXT:  ; %bb.1:
-; GFX1032-NEXT:    s_load_dwordx2 s[44:45], s[4:5], 0x24
+; GFX1032-NEXT:    s_load_dwordx2 s[52:53], s[4:5], 0x24
 ; GFX1032-NEXT:    v_lshlrev_b32_e32 v3, 20, v2
 ; GFX1032-NEXT:    v_lshlrev_b32_e32 v4, 10, v1
 ; GFX1032-NEXT:    s_mov_b32 s33, s10
-; GFX1032-NEXT:    s_mov_b32 s42, s9
-; GFX1032-NEXT:    s_mov_b32 s43, s8
+; GFX1032-NEXT:    s_mov_b32 s50, s9
+; GFX1032-NEXT:    s_mov_b32 s51, s8
 ; GFX1032-NEXT:    s_mov_b64 s[34:35], s[6:7]
 ; GFX1032-NEXT:    v_or3_b32 v40, v0, v4, v3
 ; GFX1032-NEXT:    s_mov_b64 s[36:37], s[4:5]
 ; GFX1032-NEXT:    s_mov_b64 s[38:39], s[2:3]
 ; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[44:45], 0x0
+; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[52:53], 0x0
 ; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1032-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX1032-NEXT:    v_mov_b32_e32 v1, s0
@@ -6952,38 +6952,38 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_defau
 ; GFX1032-NEXT:    s_getpc_b64 s[0:1]
 ; GFX1032-NEXT:    s_add_u32 s0, s0, __atomic_compare_exchange at gotpcrel32@lo+4
 ; GFX1032-NEXT:    s_addc_u32 s1, s1, __atomic_compare_exchange at gotpcrel32@hi+12
-; GFX1032-NEXT:    buffer_store_dword v2, off, s[48:51], 0 offset:4
-; GFX1032-NEXT:    buffer_store_dword v1, off, s[48:51], 0
+; GFX1032-NEXT:    buffer_store_dword v2, off, s[64:67], 0 offset:4
+; GFX1032-NEXT:    buffer_store_dword v1, off, s[64:67], 0
 ; GFX1032-NEXT:    s_load_dwordx2 s[16:17], s[0:1], 0x0
 ; GFX1032-NEXT:    v_mov_b32_e32 v31, v40
 ; GFX1032-NEXT:    v_mov_b32_e32 v0, 8
 ; GFX1032-NEXT:    v_mov_b32_e32 v1, 0
-; GFX1032-NEXT:    v_mov_b32_e32 v2, s44
+; GFX1032-NEXT:    v_mov_b32_e32 v2, s52
 ; GFX1032-NEXT:    v_mov_b32_e32 v5, 8
 ; GFX1032-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX1032-NEXT:    v_mov_b32_e32 v7, 0
-; GFX1032-NEXT:    s_mov_b64 s[0:1], s[48:49]
-; GFX1032-NEXT:    s_mov_b64 s[4:5], s[40:41]
+; GFX1032-NEXT:    s_mov_b64 s[0:1], s[64:65]
+; GFX1032-NEXT:    s_mov_b64 s[4:5], s[48:49]
 ; GFX1032-NEXT:    s_mov_b64 s[6:7], s[38:39]
 ; GFX1032-NEXT:    s_mov_b64 s[10:11], s[34:35]
-; GFX1032-NEXT:    s_mov_b32 s12, s43
-; GFX1032-NEXT:    s_mov_b32 s13, s42
+; GFX1032-NEXT:    s_mov_b32 s12, s51
+; GFX1032-NEXT:    s_mov_b32 s13, s50
 ; GFX1032-NEXT:    s_mov_b32 s14, s33
-; GFX1032-NEXT:    s_mov_b64 s[2:3], s[50:51]
+; GFX1032-NEXT:    s_mov_b64 s[2:3], s[66:67]
 ; GFX1032-NEXT:    v_min_f64 v[3:4], v[3:4], 4.0
-; GFX1032-NEXT:    buffer_store_dword v4, off, s[48:51], 0 offset:12
-; GFX1032-NEXT:    buffer_store_dword v3, off, s[48:51], 0 offset:8
-; GFX1032-NEXT:    v_mov_b32_e32 v3, s45
+; GFX1032-NEXT:    buffer_store_dword v4, off, s[64:67], 0 offset:12
+; GFX1032-NEXT:    buffer_store_dword v3, off, s[64:67], 0 offset:8
+; GFX1032-NEXT:    v_mov_b32_e32 v3, s53
 ; GFX1032-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1032-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GFX1032-NEXT:    s_clause 0x1
-; GFX1032-NEXT:    buffer_load_dword v1, off, s[48:51], 0
-; GFX1032-NEXT:    buffer_load_dword v2, off, s[48:51], 0 offset:4
+; GFX1032-NEXT:    buffer_load_dword v1, off, s[64:67], 0
+; GFX1032-NEXT:    buffer_load_dword v2, off, s[64:67], 0 offset:4
 ; GFX1032-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX1032-NEXT:    s_or_b32 s46, vcc_lo, s46
-; GFX1032-NEXT:    s_andn2_b32 exec_lo, exec_lo, s46
+; GFX1032-NEXT:    s_or_b32 s54, vcc_lo, s54
+; GFX1032-NEXT:    s_andn2_b32 exec_lo, exec_lo, s54
 ; GFX1032-NEXT:    s_cbranch_execnz .LBB10_2
 ; GFX1032-NEXT:  .LBB10_3:
 ; GFX1032-NEXT:    s_endpgm
@@ -6992,7 +6992,7 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_defau
 ; GFX1164:       ; %bb.0:
 ; GFX1164-NEXT:    v_mov_b32_e32 v40, v0
 ; GFX1164-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1164-NEXT:    s_mov_b64 s[40:41], s[0:1]
+; GFX1164-NEXT:    s_mov_b64 s[48:49], s[0:1]
 ; GFX1164-NEXT:    s_mov_b32 s32, 32
 ; GFX1164-NEXT:    s_mov_b64 s[0:1], exec
 ; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
@@ -7000,16 +7000,16 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_defau
 ; GFX1164-NEXT:    v_cmpx_eq_u32_e32 0, v0
 ; GFX1164-NEXT:    s_cbranch_execz .LBB10_3
 ; GFX1164-NEXT:  ; %bb.1:
-; GFX1164-NEXT:    s_load_b64 s[44:45], s[4:5], 0x24
+; GFX1164-NEXT:    s_load_b64 s[52:53], s[4:5], 0x24
 ; GFX1164-NEXT:    s_mov_b32 s33, s10
-; GFX1164-NEXT:    s_mov_b32 s42, s9
-; GFX1164-NEXT:    s_mov_b32 s43, s8
+; GFX1164-NEXT:    s_mov_b32 s50, s9
+; GFX1164-NEXT:    s_mov_b32 s51, s8
 ; GFX1164-NEXT:    s_mov_b64 s[34:35], s[6:7]
 ; GFX1164-NEXT:    s_mov_b64 s[36:37], s[4:5]
 ; GFX1164-NEXT:    s_mov_b64 s[38:39], s[2:3]
-; GFX1164-NEXT:    s_mov_b64 s[46:47], 0
+; GFX1164-NEXT:    s_mov_b64 s[54:55], 0
 ; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT:    s_load_b64 s[0:1], s[44:45], 0x0
+; GFX1164-NEXT:    s_load_b64 s[0:1], s[52:53], 0x0
 ; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1164-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX1164-NEXT:    v_mov_b32_e32 v1, s0
@@ -7031,18 +7031,18 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_defau
 ; GFX1164-NEXT:    v_mov_b32_e32 v5, 8
 ; GFX1164-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX1164-NEXT:    v_mov_b32_e32 v7, 0
-; GFX1164-NEXT:    s_mov_b64 s[4:5], s[40:41]
+; GFX1164-NEXT:    s_mov_b64 s[4:5], s[48:49]
 ; GFX1164-NEXT:    s_mov_b64 s[6:7], s[38:39]
 ; GFX1164-NEXT:    s_mov_b64 s[10:11], s[34:35]
-; GFX1164-NEXT:    s_mov_b32 s12, s43
-; GFX1164-NEXT:    s_mov_b32 s13, s42
+; GFX1164-NEXT:    s_mov_b32 s12, s51
+; GFX1164-NEXT:    s_mov_b32 s13, s50
 ; GFX1164-NEXT:    s_mov_b32 s14, s33
 ; GFX1164-NEXT:    v_min_f64 v[3:4], v[3:4], 4.0
 ; GFX1164-NEXT:    scratch_store_b64 off, v[1:2], off
 ; GFX1164-NEXT:    v_mov_b32_e32 v1, 0
-; GFX1164-NEXT:    v_mov_b32_e32 v2, s44
+; GFX1164-NEXT:    v_mov_b32_e32 v2, s52
 ; GFX1164-NEXT:    scratch_store_b64 off, v[3:4], off offset:8
-; GFX1164-NEXT:    v_mov_b32_e32 v3, s45
+; GFX1164-NEXT:    v_mov_b32_e32 v3, s53
 ; GFX1164-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1164-NEXT:    s_swappc_b64 s[30:31], s[0:1]
@@ -7050,8 +7050,8 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_defau
 ; GFX1164-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
 ; GFX1164-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX1164-NEXT:    s_or_b64 s[46:47], vcc, s[46:47]
-; GFX1164-NEXT:    s_and_not1_b64 exec, exec, s[46:47]
+; GFX1164-NEXT:    s_or_b64 s[54:55], vcc, s[54:55]
+; GFX1164-NEXT:    s_and_not1_b64 exec, exec, s[54:55]
 ; GFX1164-NEXT:    s_cbranch_execnz .LBB10_2
 ; GFX1164-NEXT:  .LBB10_3:
 ; GFX1164-NEXT:    s_set_inst_prefetch_distance 0x2
@@ -7061,22 +7061,22 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_defau
 ; GFX1132:       ; %bb.0:
 ; GFX1132-NEXT:    v_mov_b32_e32 v40, v0
 ; GFX1132-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1132-NEXT:    s_mov_b64 s[40:41], s[0:1]
-; GFX1132-NEXT:    s_mov_b32 s46, 0
+; GFX1132-NEXT:    s_mov_b64 s[48:49], s[0:1]
+; GFX1132-NEXT:    s_mov_b32 s54, 0
 ; GFX1132-NEXT:    s_mov_b32 s32, 32
 ; GFX1132-NEXT:    s_mov_b32 s0, exec_lo
 ; GFX1132-NEXT:    v_cmpx_eq_u32_e32 0, v0
 ; GFX1132-NEXT:    s_cbranch_execz .LBB10_3
 ; GFX1132-NEXT:  ; %bb.1:
-; GFX1132-NEXT:    s_load_b64 s[44:45], s[4:5], 0x24
+; GFX1132-NEXT:    s_load_b64 s[52:53], s[4:5], 0x24
 ; GFX1132-NEXT:    s_mov_b32 s33, s15
-; GFX1132-NEXT:    s_mov_b32 s42, s14
-; GFX1132-NEXT:    s_mov_b32 s43, s13
+; GFX1132-NEXT:    s_mov_b32 s50, s14
+; GFX1132-NEXT:    s_mov_b32 s51, s13
 ; GFX1132-NEXT:    s_mov_b64 s[34:35], s[6:7]
 ; GFX1132-NEXT:    s_mov_b64 s[36:37], s[4:5]
 ; GFX1132-NEXT:    s_mov_b64 s[38:39], s[2:3]
 ; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT:    s_load_b64 s[0:1], s[44:45], 0x0
+; GFX1132-NEXT:    s_load_b64 s[0:1], s[52:53], 0x0
 ; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132-NEXT:    v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v1, s0
 ; GFX1132-NEXT:    s_set_inst_prefetch_distance 0x1
@@ -7095,26 +7095,26 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_defau
 ; GFX1132-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
 ; GFX1132-NEXT:    v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0
 ; GFX1132-NEXT:    v_mov_b32_e32 v7, 0
-; GFX1132-NEXT:    s_mov_b64 s[4:5], s[40:41]
+; GFX1132-NEXT:    s_mov_b64 s[4:5], s[48:49]
 ; GFX1132-NEXT:    s_mov_b64 s[6:7], s[38:39]
 ; GFX1132-NEXT:    s_mov_b64 s[10:11], s[34:35]
-; GFX1132-NEXT:    s_mov_b32 s12, s43
-; GFX1132-NEXT:    s_mov_b32 s13, s42
+; GFX1132-NEXT:    s_mov_b32 s12, s51
+; GFX1132-NEXT:    s_mov_b32 s13, s50
 ; GFX1132-NEXT:    s_mov_b32 s14, s33
 ; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_4)
 ; GFX1132-NEXT:    v_min_f64 v[3:4], v[3:4], 4.0
 ; GFX1132-NEXT:    scratch_store_b64 off, v[1:2], off
-; GFX1132-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s44
+; GFX1132-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s52
 ; GFX1132-NEXT:    scratch_store_b64 off, v[3:4], off offset:8
-; GFX1132-NEXT:    v_dual_mov_b32 v3, s45 :: v_dual_mov_b32 v4, 0
+; GFX1132-NEXT:    v_dual_mov_b32 v3, s53 :: v_dual_mov_b32 v4, 0
 ; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX1132-NEXT:    scratch_load_b64 v[1:2], off, off
 ; GFX1132-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
 ; GFX1132-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX1132-NEXT:    s_or_b32 s46, vcc_lo, s46
-; GFX1132-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s46
+; GFX1132-NEXT:    s_or_b32 s54, vcc_lo, s54
+; GFX1132-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s54
 ; GFX1132-NEXT:    s_cbranch_execnz .LBB10_2
 ; GFX1132-NEXT:  .LBB10_3:
 ; GFX1132-NEXT:    s_set_inst_prefetch_distance 0x2
@@ -7123,13 +7123,13 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_defau
 ; GFX7LESS-DPP-LABEL: global_atomic_fmin_double_uni_address_uni_value_default_scope_unsafe:
 ; GFX7LESS-DPP:       ; %bb.0:
 ; GFX7LESS-DPP-NEXT:    s_movk_i32 s32, 0x800
-; GFX7LESS-DPP-NEXT:    s_mov_b32 s48, SCRATCH_RSRC_DWORD0
-; GFX7LESS-DPP-NEXT:    s_mov_b32 s49, SCRATCH_RSRC_DWORD1
-; GFX7LESS-DPP-NEXT:    s_mov_b32 s50, -1
-; GFX7LESS-DPP-NEXT:    s_mov_b32 s51, 0xe8f000
-; GFX7LESS-DPP-NEXT:    s_add_u32 s48, s48, s11
-; GFX7LESS-DPP-NEXT:    s_addc_u32 s49, s49, 0
-; GFX7LESS-DPP-NEXT:    s_mov_b64 s[40:41], s[0:1]
+; GFX7LESS-DPP-NEXT:    s_mov_b32 s64, SCRATCH_RSRC_DWORD0
+; GFX7LESS-DPP-NEXT:    s_mov_b32 s65, SCRATCH_RSRC_DWORD1
+; GFX7LESS-DPP-NEXT:    s_mov_b32 s66, -1
+; GFX7LESS-DPP-NEXT:    s_mov_b32 s67, 0xe8f000
+; GFX7LESS-DPP-NEXT:    s_add_u32 s64, s64, s11
+; GFX7LESS-DPP-NEXT:    s_addc_u32 s65, s65, 0
+; GFX7LESS-DPP-NEXT:    s_mov_b64 s[48:49], s[0:1]
 ; GFX7LESS-DPP-NEXT:    v_mbcnt_lo_u32_b32_e64 v3, exec_lo, 0
 ; GFX7LESS-DPP-NEXT:    v_mbcnt_hi_u32_b32_e32 v3, exec_hi, v3
 ; GFX7LESS-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
@@ -7137,15 +7137,15 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_defau
 ; GFX7LESS-DPP-NEXT:    s_cbranch_execz .LBB10_3
 ; GFX7LESS-DPP-NEXT:  ; %bb.1:
 ; GFX7LESS-DPP-NEXT:    s_mov_b32 s33, s10
-; GFX7LESS-DPP-NEXT:    s_mov_b32 s42, s9
-; GFX7LESS-DPP-NEXT:    s_mov_b32 s43, s8
+; GFX7LESS-DPP-NEXT:    s_mov_b32 s50, s9
+; GFX7LESS-DPP-NEXT:    s_mov_b32 s51, s8
 ; GFX7LESS-DPP-NEXT:    s_mov_b64 s[34:35], s[6:7]
 ; GFX7LESS-DPP-NEXT:    s_mov_b64 s[36:37], s[4:5]
 ; GFX7LESS-DPP-NEXT:    s_mov_b64 s[38:39], s[2:3]
-; GFX7LESS-DPP-NEXT:    s_load_dwordx2 s[44:45], s[4:5], 0x9
+; GFX7LESS-DPP-NEXT:    s_load_dwordx2 s[52:53], s[4:5], 0x9
 ; GFX7LESS-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7LESS-DPP-NEXT:    s_load_dwordx2 s[0:1], s[44:45], 0x0
-; GFX7LESS-DPP-NEXT:    s_mov_b64 s[46:47], 0
+; GFX7LESS-DPP-NEXT:    s_load_dwordx2 s[0:1], s[52:53], 0x0
+; GFX7LESS-DPP-NEXT:    s_mov_b64 s[54:55], 0
 ; GFX7LESS-DPP-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
 ; GFX7LESS-DPP-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
 ; GFX7LESS-DPP-NEXT:    v_or_b32_e32 v3, v0, v1
@@ -7157,8 +7157,8 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_defau
 ; GFX7LESS-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX7LESS-DPP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7LESS-DPP-NEXT:    v_max_f64 v[2:3], v[0:1], v[0:1]
-; GFX7LESS-DPP-NEXT:    buffer_store_dword v1, off, s[48:51], 0 offset:4
-; GFX7LESS-DPP-NEXT:    buffer_store_dword v0, off, s[48:51], 0
+; GFX7LESS-DPP-NEXT:    buffer_store_dword v1, off, s[64:67], 0 offset:4
+; GFX7LESS-DPP-NEXT:    buffer_store_dword v0, off, s[64:67], 0
 ; GFX7LESS-DPP-NEXT:    s_add_u32 s8, s36, 44
 ; GFX7LESS-DPP-NEXT:    s_waitcnt expcnt(0)
 ; GFX7LESS-DPP-NEXT:    v_min_f64 v[0:1], v[2:3], 4.0
@@ -7166,8 +7166,8 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_defau
 ; GFX7LESS-DPP-NEXT:    s_getpc_b64 s[0:1]
 ; GFX7LESS-DPP-NEXT:    s_add_u32 s0, s0, __atomic_compare_exchange at gotpcrel32@lo+4
 ; GFX7LESS-DPP-NEXT:    s_addc_u32 s1, s1, __atomic_compare_exchange at gotpcrel32@hi+12
-; GFX7LESS-DPP-NEXT:    buffer_store_dword v1, off, s[48:51], 0 offset:12
-; GFX7LESS-DPP-NEXT:    buffer_store_dword v0, off, s[48:51], 0 offset:8
+; GFX7LESS-DPP-NEXT:    buffer_store_dword v1, off, s[64:67], 0 offset:12
+; GFX7LESS-DPP-NEXT:    buffer_store_dword v0, off, s[64:67], 0 offset:8
 ; GFX7LESS-DPP-NEXT:    s_load_dwordx2 s[16:17], s[0:1], 0x0
 ; GFX7LESS-DPP-NEXT:    s_waitcnt expcnt(0)
 ; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v0, 8
@@ -7176,59 +7176,59 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_defau
 ; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v5, 8
 ; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v7, 0
-; GFX7LESS-DPP-NEXT:    s_mov_b64 s[4:5], s[40:41]
+; GFX7LESS-DPP-NEXT:    s_mov_b64 s[4:5], s[48:49]
 ; GFX7LESS-DPP-NEXT:    s_mov_b64 s[6:7], s[38:39]
 ; GFX7LESS-DPP-NEXT:    s_mov_b64 s[10:11], s[34:35]
-; GFX7LESS-DPP-NEXT:    s_mov_b32 s12, s43
-; GFX7LESS-DPP-NEXT:    s_mov_b32 s13, s42
+; GFX7LESS-DPP-NEXT:    s_mov_b32 s12, s51
+; GFX7LESS-DPP-NEXT:    s_mov_b32 s13, s50
 ; GFX7LESS-DPP-NEXT:    s_mov_b32 s14, s33
 ; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v31, v40
-; GFX7LESS-DPP-NEXT:    s_mov_b64 s[0:1], s[48:49]
-; GFX7LESS-DPP-NEXT:    s_mov_b64 s[2:3], s[50:51]
-; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v2, s44
-; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v3, s45
+; GFX7LESS-DPP-NEXT:    s_mov_b64 s[0:1], s[64:65]
+; GFX7LESS-DPP-NEXT:    s_mov_b64 s[2:3], s[66:67]
+; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v2, s52
+; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v3, s53
 ; GFX7LESS-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7LESS-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GFX7LESS-DPP-NEXT:    v_and_b32_e32 v2, 1, v0
-; GFX7LESS-DPP-NEXT:    buffer_load_dword v0, off, s[48:51], 0
-; GFX7LESS-DPP-NEXT:    buffer_load_dword v1, off, s[48:51], 0 offset:4
+; GFX7LESS-DPP-NEXT:    buffer_load_dword v0, off, s[64:67], 0
+; GFX7LESS-DPP-NEXT:    buffer_load_dword v1, off, s[64:67], 0 offset:4
 ; GFX7LESS-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v2
-; GFX7LESS-DPP-NEXT:    s_or_b64 s[46:47], vcc, s[46:47]
-; GFX7LESS-DPP-NEXT:    s_andn2_b64 exec, exec, s[46:47]
+; GFX7LESS-DPP-NEXT:    s_or_b64 s[54:55], vcc, s[54:55]
+; GFX7LESS-DPP-NEXT:    s_andn2_b64 exec, exec, s[54:55]
 ; GFX7LESS-DPP-NEXT:    s_cbranch_execnz .LBB10_2
 ; GFX7LESS-DPP-NEXT:  .LBB10_3:
 ; GFX7LESS-DPP-NEXT:    s_endpgm
 ;
 ; GFX9-DPP-LABEL: global_atomic_fmin_double_uni_address_uni_value_default_scope_unsafe:
 ; GFX9-DPP:       ; %bb.0:
-; GFX9-DPP-NEXT:    s_mov_b32 s48, SCRATCH_RSRC_DWORD0
-; GFX9-DPP-NEXT:    s_mov_b32 s49, SCRATCH_RSRC_DWORD1
-; GFX9-DPP-NEXT:    s_mov_b32 s50, -1
-; GFX9-DPP-NEXT:    s_mov_b32 s51, 0xe00000
+; GFX9-DPP-NEXT:    s_mov_b32 s64, SCRATCH_RSRC_DWORD0
+; GFX9-DPP-NEXT:    s_mov_b32 s65, SCRATCH_RSRC_DWORD1
+; GFX9-DPP-NEXT:    s_mov_b32 s66, -1
+; GFX9-DPP-NEXT:    s_mov_b32 s67, 0xe00000
 ; GFX9-DPP-NEXT:    v_mbcnt_lo_u32_b32 v3, exec_lo, 0
-; GFX9-DPP-NEXT:    s_add_u32 s48, s48, s11
+; GFX9-DPP-NEXT:    s_add_u32 s64, s64, s11
 ; GFX9-DPP-NEXT:    v_mbcnt_hi_u32_b32 v3, exec_hi, v3
-; GFX9-DPP-NEXT:    s_addc_u32 s49, s49, 0
-; GFX9-DPP-NEXT:    s_mov_b64 s[40:41], s[0:1]
+; GFX9-DPP-NEXT:    s_addc_u32 s65, s65, 0
+; GFX9-DPP-NEXT:    s_mov_b64 s[48:49], s[0:1]
 ; GFX9-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
 ; GFX9-DPP-NEXT:    s_movk_i32 s32, 0x800
 ; GFX9-DPP-NEXT:    s_and_saveexec_b64 s[0:1], vcc
 ; GFX9-DPP-NEXT:    s_cbranch_execz .LBB10_3
 ; GFX9-DPP-NEXT:  ; %bb.1:
-; GFX9-DPP-NEXT:    s_load_dwordx2 s[44:45], s[4:5], 0x24
+; GFX9-DPP-NEXT:    s_load_dwordx2 s[52:53], s[4:5], 0x24
 ; GFX9-DPP-NEXT:    v_lshlrev_b32_e32 v3, 20, v2
 ; GFX9-DPP-NEXT:    v_lshlrev_b32_e32 v4, 10, v1
 ; GFX9-DPP-NEXT:    s_mov_b32 s33, s10
-; GFX9-DPP-NEXT:    s_mov_b32 s42, s9
+; GFX9-DPP-NEXT:    s_mov_b32 s50, s9
 ; GFX9-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-DPP-NEXT:    s_load_dwordx2 s[0:1], s[44:45], 0x0
-; GFX9-DPP-NEXT:    s_mov_b32 s43, s8
+; GFX9-DPP-NEXT:    s_load_dwordx2 s[0:1], s[52:53], 0x0
+; GFX9-DPP-NEXT:    s_mov_b32 s51, s8
 ; GFX9-DPP-NEXT:    s_mov_b64 s[34:35], s[6:7]
 ; GFX9-DPP-NEXT:    s_mov_b64 s[36:37], s[4:5]
 ; GFX9-DPP-NEXT:    s_mov_b64 s[38:39], s[2:3]
 ; GFX9-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-DPP-NEXT:    v_mov_b32_e32 v2, s1
-; GFX9-DPP-NEXT:    s_mov_b64 s[46:47], 0
+; GFX9-DPP-NEXT:    s_mov_b64 s[54:55], 0
 ; GFX9-DPP-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX9-DPP-NEXT:    v_or3_b32 v40, v0, v4, v3
 ; GFX9-DPP-NEXT:  .LBB10_2: ; %atomicrmw.start
@@ -7241,36 +7241,36 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_defau
 ; GFX9-DPP-NEXT:    s_add_u32 s0, s0, __atomic_compare_exchange at gotpcrel32@lo+4
 ; GFX9-DPP-NEXT:    s_addc_u32 s1, s1, __atomic_compare_exchange at gotpcrel32@hi+12
 ; GFX9-DPP-NEXT:    s_load_dwordx2 s[16:17], s[0:1], 0x0
-; GFX9-DPP-NEXT:    s_mov_b64 s[0:1], s[48:49]
-; GFX9-DPP-NEXT:    buffer_store_dword v2, off, s[48:51], 0 offset:4
-; GFX9-DPP-NEXT:    buffer_store_dword v1, off, s[48:51], 0
-; GFX9-DPP-NEXT:    s_mov_b64 s[4:5], s[40:41]
+; GFX9-DPP-NEXT:    s_mov_b64 s[0:1], s[64:65]
+; GFX9-DPP-NEXT:    buffer_store_dword v2, off, s[64:67], 0 offset:4
+; GFX9-DPP-NEXT:    buffer_store_dword v1, off, s[64:67], 0
+; GFX9-DPP-NEXT:    s_mov_b64 s[4:5], s[48:49]
 ; GFX9-DPP-NEXT:    v_min_f64 v[3:4], v[3:4], 4.0
 ; GFX9-DPP-NEXT:    s_mov_b64 s[6:7], s[38:39]
 ; GFX9-DPP-NEXT:    s_mov_b64 s[10:11], s[34:35]
-; GFX9-DPP-NEXT:    s_mov_b32 s12, s43
-; GFX9-DPP-NEXT:    s_mov_b32 s13, s42
+; GFX9-DPP-NEXT:    s_mov_b32 s12, s51
+; GFX9-DPP-NEXT:    s_mov_b32 s13, s50
 ; GFX9-DPP-NEXT:    s_mov_b32 s14, s33
 ; GFX9-DPP-NEXT:    v_mov_b32_e32 v31, v40
-; GFX9-DPP-NEXT:    s_mov_b64 s[2:3], s[50:51]
-; GFX9-DPP-NEXT:    buffer_store_dword v4, off, s[48:51], 0 offset:12
-; GFX9-DPP-NEXT:    buffer_store_dword v3, off, s[48:51], 0 offset:8
+; GFX9-DPP-NEXT:    s_mov_b64 s[2:3], s[66:67]
+; GFX9-DPP-NEXT:    buffer_store_dword v4, off, s[64:67], 0 offset:12
+; GFX9-DPP-NEXT:    buffer_store_dword v3, off, s[64:67], 0 offset:8
 ; GFX9-DPP-NEXT:    v_mov_b32_e32 v0, 8
 ; GFX9-DPP-NEXT:    v_mov_b32_e32 v1, 0
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v2, s44
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v3, s45
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v2, s52
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v3, s53
 ; GFX9-DPP-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX9-DPP-NEXT:    v_mov_b32_e32 v5, 8
 ; GFX9-DPP-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX9-DPP-NEXT:    v_mov_b32_e32 v7, 0
 ; GFX9-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
-; GFX9-DPP-NEXT:    buffer_load_dword v1, off, s[48:51], 0
-; GFX9-DPP-NEXT:    buffer_load_dword v2, off, s[48:51], 0 offset:4
+; GFX9-DPP-NEXT:    buffer_load_dword v1, off, s[64:67], 0
+; GFX9-DPP-NEXT:    buffer_load_dword v2, off, s[64:67], 0 offset:4
 ; GFX9-DPP-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX9-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX9-DPP-NEXT:    s_or_b64 s[46:47], vcc, s[46:47]
-; GFX9-DPP-NEXT:    s_andn2_b64 exec, exec, s[46:47]
+; GFX9-DPP-NEXT:    s_or_b64 s[54:55], vcc, s[54:55]
+; GFX9-DPP-NEXT:    s_andn2_b64 exec, exec, s[54:55]
 ; GFX9-DPP-NEXT:    s_cbranch_execnz .LBB10_2
 ; GFX9-DPP-NEXT:  .LBB10_3:
 ; GFX9-DPP-NEXT:    s_endpgm
@@ -7278,32 +7278,32 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_defau
 ; GFX1064-DPP-LABEL: global_atomic_fmin_double_uni_address_uni_value_default_scope_unsafe:
 ; GFX1064-DPP:       ; %bb.0:
 ; GFX1064-DPP-NEXT:    v_mbcnt_lo_u32_b32 v3, exec_lo, 0
-; GFX1064-DPP-NEXT:    s_mov_b32 s48, SCRATCH_RSRC_DWORD0
-; GFX1064-DPP-NEXT:    s_mov_b32 s49, SCRATCH_RSRC_DWORD1
-; GFX1064-DPP-NEXT:    s_mov_b32 s50, -1
-; GFX1064-DPP-NEXT:    s_mov_b32 s51, 0x31e16000
+; GFX1064-DPP-NEXT:    s_mov_b32 s64, SCRATCH_RSRC_DWORD0
+; GFX1064-DPP-NEXT:    s_mov_b32 s65, SCRATCH_RSRC_DWORD1
+; GFX1064-DPP-NEXT:    s_mov_b32 s66, -1
+; GFX1064-DPP-NEXT:    s_mov_b32 s67, 0x31e16000
 ; GFX1064-DPP-NEXT:    v_mbcnt_hi_u32_b32 v3, exec_hi, v3
-; GFX1064-DPP-NEXT:    s_add_u32 s48, s48, s11
-; GFX1064-DPP-NEXT:    s_addc_u32 s49, s49, 0
-; GFX1064-DPP-NEXT:    s_mov_b64 s[40:41], s[0:1]
+; GFX1064-DPP-NEXT:    s_add_u32 s64, s64, s11
+; GFX1064-DPP-NEXT:    s_addc_u32 s65, s65, 0
+; GFX1064-DPP-NEXT:    s_mov_b64 s[48:49], s[0:1]
 ; GFX1064-DPP-NEXT:    s_movk_i32 s32, 0x800
 ; GFX1064-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
 ; GFX1064-DPP-NEXT:    s_and_saveexec_b64 s[0:1], vcc
 ; GFX1064-DPP-NEXT:    s_cbranch_execz .LBB10_3
 ; GFX1064-DPP-NEXT:  ; %bb.1:
-; GFX1064-DPP-NEXT:    s_load_dwordx2 s[44:45], s[4:5], 0x24
+; GFX1064-DPP-NEXT:    s_load_dwordx2 s[52:53], s[4:5], 0x24
 ; GFX1064-DPP-NEXT:    v_lshlrev_b32_e32 v3, 20, v2
 ; GFX1064-DPP-NEXT:    v_lshlrev_b32_e32 v4, 10, v1
 ; GFX1064-DPP-NEXT:    s_mov_b32 s33, s10
-; GFX1064-DPP-NEXT:    s_mov_b32 s42, s9
-; GFX1064-DPP-NEXT:    s_mov_b32 s43, s8
+; GFX1064-DPP-NEXT:    s_mov_b32 s50, s9
+; GFX1064-DPP-NEXT:    s_mov_b32 s51, s8
 ; GFX1064-DPP-NEXT:    s_mov_b64 s[34:35], s[6:7]
 ; GFX1064-DPP-NEXT:    v_or3_b32 v40, v0, v4, v3
 ; GFX1064-DPP-NEXT:    s_mov_b64 s[36:37], s[4:5]
 ; GFX1064-DPP-NEXT:    s_mov_b64 s[38:39], s[2:3]
-; GFX1064-DPP-NEXT:    s_mov_b64 s[46:47], 0
+; GFX1064-DPP-NEXT:    s_mov_b64 s[54:55], 0
 ; GFX1064-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1064-DPP-NEXT:    s_load_dwordx2 s[0:1], s[44:45], 0x0
+; GFX1064-DPP-NEXT:    s_load_dwordx2 s[0:1], s[52:53], 0x0
 ; GFX1064-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1064-DPP-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX1064-DPP-NEXT:    v_mov_b32_e32 v1, s0
@@ -7316,38 +7316,38 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_defau
 ; GFX1064-DPP-NEXT:    s_getpc_b64 s[0:1]
 ; GFX1064-DPP-NEXT:    s_add_u32 s0, s0, __atomic_compare_exchange at gotpcrel32@lo+4
 ; GFX1064-DPP-NEXT:    s_addc_u32 s1, s1, __atomic_compare_exchange at gotpcrel32@hi+12
-; GFX1064-DPP-NEXT:    buffer_store_dword v2, off, s[48:51], 0 offset:4
-; GFX1064-DPP-NEXT:    buffer_store_dword v1, off, s[48:51], 0
+; GFX1064-DPP-NEXT:    buffer_store_dword v2, off, s[64:67], 0 offset:4
+; GFX1064-DPP-NEXT:    buffer_store_dword v1, off, s[64:67], 0
 ; GFX1064-DPP-NEXT:    s_load_dwordx2 s[16:17], s[0:1], 0x0
 ; GFX1064-DPP-NEXT:    v_mov_b32_e32 v31, v40
 ; GFX1064-DPP-NEXT:    v_mov_b32_e32 v0, 8
 ; GFX1064-DPP-NEXT:    v_mov_b32_e32 v1, 0
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v2, s44
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v2, s52
 ; GFX1064-DPP-NEXT:    v_mov_b32_e32 v5, 8
 ; GFX1064-DPP-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX1064-DPP-NEXT:    v_mov_b32_e32 v7, 0
-; GFX1064-DPP-NEXT:    s_mov_b64 s[0:1], s[48:49]
-; GFX1064-DPP-NEXT:    s_mov_b64 s[4:5], s[40:41]
+; GFX1064-DPP-NEXT:    s_mov_b64 s[0:1], s[64:65]
+; GFX1064-DPP-NEXT:    s_mov_b64 s[4:5], s[48:49]
 ; GFX1064-DPP-NEXT:    s_mov_b64 s[6:7], s[38:39]
 ; GFX1064-DPP-NEXT:    s_mov_b64 s[10:11], s[34:35]
-; GFX1064-DPP-NEXT:    s_mov_b32 s12, s43
-; GFX1064-DPP-NEXT:    s_mov_b32 s13, s42
+; GFX1064-DPP-NEXT:    s_mov_b32 s12, s51
+; GFX1064-DPP-NEXT:    s_mov_b32 s13, s50
 ; GFX1064-DPP-NEXT:    s_mov_b32 s14, s33
-; GFX1064-DPP-NEXT:    s_mov_b64 s[2:3], s[50:51]
+; GFX1064-DPP-NEXT:    s_mov_b64 s[2:3], s[66:67]
 ; GFX1064-DPP-NEXT:    v_min_f64 v[3:4], v[3:4], 4.0
-; GFX1064-DPP-NEXT:    buffer_store_dword v4, off, s[48:51], 0 offset:12
-; GFX1064-DPP-NEXT:    buffer_store_dword v3, off, s[48:51], 0 offset:8
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v3, s45
+; GFX1064-DPP-NEXT:    buffer_store_dword v4, off, s[64:67], 0 offset:12
+; GFX1064-DPP-NEXT:    buffer_store_dword v3, off, s[64:67], 0 offset:8
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v3, s53
 ; GFX1064-DPP-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX1064-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1064-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GFX1064-DPP-NEXT:    s_clause 0x1
-; GFX1064-DPP-NEXT:    buffer_load_dword v1, off, s[48:51], 0
-; GFX1064-DPP-NEXT:    buffer_load_dword v2, off, s[48:51], 0 offset:4
+; GFX1064-DPP-NEXT:    buffer_load_dword v1, off, s[64:67], 0
+; GFX1064-DPP-NEXT:    buffer_load_dword v2, off, s[64:67], 0 offset:4
 ; GFX1064-DPP-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX1064-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX1064-DPP-NEXT:    s_or_b64 s[46:47], vcc, s[46:47]
-; GFX1064-DPP-NEXT:    s_andn2_b64 exec, exec, s[46:47]
+; GFX1064-DPP-NEXT:    s_or_b64 s[54:55], vcc, s[54:55]
+; GFX1064-DPP-NEXT:    s_andn2_b64 exec, exec, s[54:55]
 ; GFX1064-DPP-NEXT:    s_cbranch_execnz .LBB10_2
 ; GFX1064-DPP-NEXT:  .LBB10_3:
 ; GFX1064-DPP-NEXT:    s_endpgm
@@ -7355,31 +7355,31 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_defau
 ; GFX1032-DPP-LABEL: global_atomic_fmin_double_uni_address_uni_value_default_scope_unsafe:
 ; GFX1032-DPP:       ; %bb.0:
 ; GFX1032-DPP-NEXT:    v_mbcnt_lo_u32_b32 v3, exec_lo, 0
-; GFX1032-DPP-NEXT:    s_mov_b32 s48, SCRATCH_RSRC_DWORD0
-; GFX1032-DPP-NEXT:    s_mov_b32 s49, SCRATCH_RSRC_DWORD1
-; GFX1032-DPP-NEXT:    s_mov_b32 s50, -1
-; GFX1032-DPP-NEXT:    s_mov_b32 s51, 0x31c16000
+; GFX1032-DPP-NEXT:    s_mov_b32 s64, SCRATCH_RSRC_DWORD0
+; GFX1032-DPP-NEXT:    s_mov_b32 s65, SCRATCH_RSRC_DWORD1
+; GFX1032-DPP-NEXT:    s_mov_b32 s66, -1
+; GFX1032-DPP-NEXT:    s_mov_b32 s67, 0x31c16000
 ; GFX1032-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v3
-; GFX1032-DPP-NEXT:    s_add_u32 s48, s48, s11
-; GFX1032-DPP-NEXT:    s_addc_u32 s49, s49, 0
-; GFX1032-DPP-NEXT:    s_mov_b64 s[40:41], s[0:1]
-; GFX1032-DPP-NEXT:    s_mov_b32 s46, 0
+; GFX1032-DPP-NEXT:    s_add_u32 s64, s64, s11
+; GFX1032-DPP-NEXT:    s_addc_u32 s65, s65, 0
+; GFX1032-DPP-NEXT:    s_mov_b64 s[48:49], s[0:1]
+; GFX1032-DPP-NEXT:    s_mov_b32 s54, 0
 ; GFX1032-DPP-NEXT:    s_movk_i32 s32, 0x400
 ; GFX1032-DPP-NEXT:    s_and_saveexec_b32 s0, vcc_lo
 ; GFX1032-DPP-NEXT:    s_cbranch_execz .LBB10_3
 ; GFX1032-DPP-NEXT:  ; %bb.1:
-; GFX1032-DPP-NEXT:    s_load_dwordx2 s[44:45], s[4:5], 0x24
+; GFX1032-DPP-NEXT:    s_load_dwordx2 s[52:53], s[4:5], 0x24
 ; GFX1032-DPP-NEXT:    v_lshlrev_b32_e32 v3, 20, v2
 ; GFX1032-DPP-NEXT:    v_lshlrev_b32_e32 v4, 10, v1
 ; GFX1032-DPP-NEXT:    s_mov_b32 s33, s10
-; GFX1032-DPP-NEXT:    s_mov_b32 s42, s9
-; GFX1032-DPP-NEXT:    s_mov_b32 s43, s8
+; GFX1032-DPP-NEXT:    s_mov_b32 s50, s9
+; GFX1032-DPP-NEXT:    s_mov_b32 s51, s8
 ; GFX1032-DPP-NEXT:    s_mov_b64 s[34:35], s[6:7]
 ; GFX1032-DPP-NEXT:    v_or3_b32 v40, v0, v4, v3
 ; GFX1032-DPP-NEXT:    s_mov_b64 s[36:37], s[4:5]
 ; GFX1032-DPP-NEXT:    s_mov_b64 s[38:39], s[2:3]
 ; GFX1032-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT:    s_load_dwordx2 s[0:1], s[44:45], 0x0
+; GFX1032-DPP-NEXT:    s_load_dwordx2 s[0:1], s[52:53], 0x0
 ; GFX1032-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v1, s0
@@ -7392,38 +7392,38 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_defau
 ; GFX1032-DPP-NEXT:    s_getpc_b64 s[0:1]
 ; GFX1032-DPP-NEXT:    s_add_u32 s0, s0, __atomic_compare_exchange at gotpcrel32@lo+4
 ; GFX1032-DPP-NEXT:    s_addc_u32 s1, s1, __atomic_compare_exchange at gotpcrel32@hi+12
-; GFX1032-DPP-NEXT:    buffer_store_dword v2, off, s[48:51], 0 offset:4
-; GFX1032-DPP-NEXT:    buffer_store_dword v1, off, s[48:51], 0
+; GFX1032-DPP-NEXT:    buffer_store_dword v2, off, s[64:67], 0 offset:4
+; GFX1032-DPP-NEXT:    buffer_store_dword v1, off, s[64:67], 0
 ; GFX1032-DPP-NEXT:    s_load_dwordx2 s[16:17], s[0:1], 0x0
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v31, v40
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v0, 8
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v1, 0
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v2, s44
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v2, s52
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v5, 8
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v7, 0
-; GFX1032-DPP-NEXT:    s_mov_b64 s[0:1], s[48:49]
-; GFX1032-DPP-NEXT:    s_mov_b64 s[4:5], s[40:41]
+; GFX1032-DPP-NEXT:    s_mov_b64 s[0:1], s[64:65]
+; GFX1032-DPP-NEXT:    s_mov_b64 s[4:5], s[48:49]
 ; GFX1032-DPP-NEXT:    s_mov_b64 s[6:7], s[38:39]
 ; GFX1032-DPP-NEXT:    s_mov_b64 s[10:11], s[34:35]
-; GFX1032-DPP-NEXT:    s_mov_b32 s12, s43
-; GFX1032-DPP-NEXT:    s_mov_b32 s13, s42
+; GFX1032-DPP-NEXT:    s_mov_b32 s12, s51
+; GFX1032-DPP-NEXT:    s_mov_b32 s13, s50
 ; GFX1032-DPP-NEXT:    s_mov_b32 s14, s33
-; GFX1032-DPP-NEXT:    s_mov_b64 s[2:3], s[50:51]
+; GFX1032-DPP-NEXT:    s_mov_b64 s[2:3], s[66:67]
 ; GFX1032-DPP-NEXT:    v_min_f64 v[3:4], v[3:4], 4.0
-; GFX1032-DPP-NEXT:    buffer_store_dword v4, off, s[48:51], 0 offset:12
-; GFX1032-DPP-NEXT:    buffer_store_dword v3, off, s[48:51], 0 offset:8
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v3, s45
+; GFX1032-DPP-NEXT:    buffer_store_dword v4, off, s[64:67], 0 offset:12
+; GFX1032-DPP-NEXT:    buffer_store_dword v3, off, s[64:67], 0 offset:8
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v3, s53
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX1032-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1032-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GFX1032-DPP-NEXT:    s_clause 0x1
-; GFX1032-DPP-NEXT:    buffer_load_dword v1, off, s[48:51], 0
-; GFX1032-DPP-NEXT:    buffer_load_dword v2, off, s[48:51], 0 offset:4
+; GFX1032-DPP-NEXT:    buffer_load_dword v1, off, s[64:67], 0
+; GFX1032-DPP-NEXT:    buffer_load_dword v2, off, s[64:67], 0 offset:4
 ; GFX1032-DPP-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX1032-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX1032-DPP-NEXT:    s_or_b32 s46, vcc_lo, s46
-; GFX1032-DPP-NEXT:    s_andn2_b32 exec_lo, exec_lo, s46
+; GFX1032-DPP-NEXT:    s_or_b32 s54, vcc_lo, s54
+; GFX1032-DPP-NEXT:    s_andn2_b32 exec_lo, exec_lo, s54
 ; GFX1032-DPP-NEXT:    s_cbranch_execnz .LBB10_2
 ; GFX1032-DPP-NEXT:  .LBB10_3:
 ; GFX1032-DPP-NEXT:    s_endpgm
@@ -7432,7 +7432,7 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_defau
 ; GFX1164-DPP:       ; %bb.0:
 ; GFX1164-DPP-NEXT:    v_mov_b32_e32 v40, v0
 ; GFX1164-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1164-DPP-NEXT:    s_mov_b64 s[40:41], s[0:1]
+; GFX1164-DPP-NEXT:    s_mov_b64 s[48:49], s[0:1]
 ; GFX1164-DPP-NEXT:    s_mov_b32 s32, 32
 ; GFX1164-DPP-NEXT:    s_mov_b64 s[0:1], exec
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
@@ -7440,16 +7440,16 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_defau
 ; GFX1164-DPP-NEXT:    v_cmpx_eq_u32_e32 0, v0
 ; GFX1164-DPP-NEXT:    s_cbranch_execz .LBB10_3
 ; GFX1164-DPP-NEXT:  ; %bb.1:
-; GFX1164-DPP-NEXT:    s_load_b64 s[44:45], s[4:5], 0x24
+; GFX1164-DPP-NEXT:    s_load_b64 s[52:53], s[4:5], 0x24
 ; GFX1164-DPP-NEXT:    s_mov_b32 s33, s10
-; GFX1164-DPP-NEXT:    s_mov_b32 s42, s9
-; GFX1164-DPP-NEXT:    s_mov_b32 s43, s8
+; GFX1164-DPP-NEXT:    s_mov_b32 s50, s9
+; GFX1164-DPP-NEXT:    s_mov_b32 s51, s8
 ; GFX1164-DPP-NEXT:    s_mov_b64 s[34:35], s[6:7]
 ; GFX1164-DPP-NEXT:    s_mov_b64 s[36:37], s[4:5]
 ; GFX1164-DPP-NEXT:    s_mov_b64 s[38:39], s[2:3]
-; GFX1164-DPP-NEXT:    s_mov_b64 s[46:47], 0
+; GFX1164-DPP-NEXT:    s_mov_b64 s[54:55], 0
 ; GFX1164-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT:    s_load_b64 s[0:1], s[44:45], 0x0
+; GFX1164-DPP-NEXT:    s_load_b64 s[0:1], s[52:53], 0x0
 ; GFX1164-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1164-DPP-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX1164-DPP-NEXT:    v_mov_b32_e32 v1, s0
@@ -7471,18 +7471,18 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_defau
 ; GFX1164-DPP-NEXT:    v_mov_b32_e32 v5, 8
 ; GFX1164-DPP-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX1164-DPP-NEXT:    v_mov_b32_e32 v7, 0
-; GFX1164-DPP-NEXT:    s_mov_b64 s[4:5], s[40:41]
+; GFX1164-DPP-NEXT:    s_mov_b64 s[4:5], s[48:49]
 ; GFX1164-DPP-NEXT:    s_mov_b64 s[6:7], s[38:39]
 ; GFX1164-DPP-NEXT:    s_mov_b64 s[10:11], s[34:35]
-; GFX1164-DPP-NEXT:    s_mov_b32 s12, s43
-; GFX1164-DPP-NEXT:    s_mov_b32 s13, s42
+; GFX1164-DPP-NEXT:    s_mov_b32 s12, s51
+; GFX1164-DPP-NEXT:    s_mov_b32 s13, s50
 ; GFX1164-DPP-NEXT:    s_mov_b32 s14, s33
 ; GFX1164-DPP-NEXT:    v_min_f64 v[3:4], v[3:4], 4.0
 ; GFX1164-DPP-NEXT:    scratch_store_b64 off, v[1:2], off
 ; GFX1164-DPP-NEXT:    v_mov_b32_e32 v1, 0
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v2, s44
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v2, s52
 ; GFX1164-DPP-NEXT:    scratch_store_b64 off, v[3:4], off offset:8
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v3, s45
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v3, s53
 ; GFX1164-DPP-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX1164-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1164-DPP-NEXT:    s_swappc_b64 s[30:31], s[0:1]
@@ -7490,8 +7490,8 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_defau
 ; GFX1164-DPP-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
 ; GFX1164-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX1164-DPP-NEXT:    s_or_b64 s[46:47], vcc, s[46:47]
-; GFX1164-DPP-NEXT:    s_and_not1_b64 exec, exec, s[46:47]
+; GFX1164-DPP-NEXT:    s_or_b64 s[54:55], vcc, s[54:55]
+; GFX1164-DPP-NEXT:    s_and_not1_b64 exec, exec, s[54:55]
 ; GFX1164-DPP-NEXT:    s_cbranch_execnz .LBB10_2
 ; GFX1164-DPP-NEXT:  .LBB10_3:
 ; GFX1164-DPP-NEXT:    s_set_inst_prefetch_distance 0x2
@@ -7501,22 +7501,22 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_defau
 ; GFX1132-DPP:       ; %bb.0:
 ; GFX1132-DPP-NEXT:    v_mov_b32_e32 v40, v0
 ; GFX1132-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1132-DPP-NEXT:    s_mov_b64 s[40:41], s[0:1]
-; GFX1132-DPP-NEXT:    s_mov_b32 s46, 0
+; GFX1132-DPP-NEXT:    s_mov_b64 s[48:49], s[0:1]
+; GFX1132-DPP-NEXT:    s_mov_b32 s54, 0
 ; GFX1132-DPP-NEXT:    s_mov_b32 s32, 32
 ; GFX1132-DPP-NEXT:    s_mov_b32 s0, exec_lo
 ; GFX1132-DPP-NEXT:    v_cmpx_eq_u32_e32 0, v0
 ; GFX1132-DPP-NEXT:    s_cbranch_execz .LBB10_3
 ; GFX1132-DPP-NEXT:  ; %bb.1:
-; GFX1132-DPP-NEXT:    s_load_b64 s[44:45], s[4:5], 0x24
+; GFX1132-DPP-NEXT:    s_load_b64 s[52:53], s[4:5], 0x24
 ; GFX1132-DPP-NEXT:    s_mov_b32 s33, s15
-; GFX1132-DPP-NEXT:    s_mov_b32 s42, s14
-; GFX1132-DPP-NEXT:    s_mov_b32 s43, s13
+; GFX1132-DPP-NEXT:    s_mov_b32 s50, s14
+; GFX1132-DPP-NEXT:    s_mov_b32 s51, s13
 ; GFX1132-DPP-NEXT:    s_mov_b64 s[34:35], s[6:7]
 ; GFX1132-DPP-NEXT:    s_mov_b64 s[36:37], s[4:5]
 ; GFX1132-DPP-NEXT:    s_mov_b64 s[38:39], s[2:3]
 ; GFX1132-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT:    s_load_b64 s[0:1], s[44:45], 0x0
+; GFX1132-DPP-NEXT:    s_load_b64 s[0:1], s[52:53], 0x0
 ; GFX1132-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132-DPP-NEXT:    v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v1, s0
 ; GFX1132-DPP-NEXT:    s_set_inst_prefetch_distance 0x1
@@ -7535,26 +7535,26 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_defau
 ; GFX1132-DPP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
 ; GFX1132-DPP-NEXT:    v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0
 ; GFX1132-DPP-NEXT:    v_mov_b32_e32 v7, 0
-; GFX1132-DPP-NEXT:    s_mov_b64 s[4:5], s[40:41]
+; GFX1132-DPP-NEXT:    s_mov_b64 s[4:5], s[48:49]
 ; GFX1132-DPP-NEXT:    s_mov_b64 s[6:7], s[38:39]
 ; GFX1132-DPP-NEXT:    s_mov_b64 s[10:11], s[34:35]
-; GFX1132-DPP-NEXT:    s_mov_b32 s12, s43
-; GFX1132-DPP-NEXT:    s_mov_b32 s13, s42
+; GFX1132-DPP-NEXT:    s_mov_b32 s12, s51
+; GFX1132-DPP-NEXT:    s_mov_b32 s13, s50
 ; GFX1132-DPP-NEXT:    s_mov_b32 s14, s33
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_4)
 ; GFX1132-DPP-NEXT:    v_min_f64 v[3:4], v[3:4], 4.0
 ; GFX1132-DPP-NEXT:    scratch_store_b64 off, v[1:2], off
-; GFX1132-DPP-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s44
+; GFX1132-DPP-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s52
 ; GFX1132-DPP-NEXT:    scratch_store_b64 off, v[3:4], off offset:8
-; GFX1132-DPP-NEXT:    v_dual_mov_b32 v3, s45 :: v_dual_mov_b32 v4, 0
+; GFX1132-DPP-NEXT:    v_dual_mov_b32 v3, s53 :: v_dual_mov_b32 v4, 0
 ; GFX1132-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132-DPP-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX1132-DPP-NEXT:    scratch_load_b64 v[1:2], off, off
 ; GFX1132-DPP-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
 ; GFX1132-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX1132-DPP-NEXT:    s_or_b32 s46, vcc_lo, s46
-; GFX1132-DPP-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s46
+; GFX1132-DPP-NEXT:    s_or_b32 s54, vcc_lo, s54
+; GFX1132-DPP-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s54
 ; GFX1132-DPP-NEXT:    s_cbranch_execnz .LBB10_2
 ; GFX1132-DPP-NEXT:  .LBB10_3:
 ; GFX1132-DPP-NEXT:    s_set_inst_prefetch_distance 0x2
@@ -7567,19 +7567,19 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau
 ; GFX7LESS-LABEL: global_atomic_fmin_double_uni_address_div_value_default_scope_unsafe:
 ; GFX7LESS:       ; %bb.0:
 ; GFX7LESS-NEXT:    s_movk_i32 s32, 0x800
-; GFX7LESS-NEXT:    s_mov_b32 s48, SCRATCH_RSRC_DWORD0
-; GFX7LESS-NEXT:    s_mov_b32 s49, SCRATCH_RSRC_DWORD1
-; GFX7LESS-NEXT:    s_mov_b32 s50, -1
-; GFX7LESS-NEXT:    s_mov_b32 s51, 0xe8f000
-; GFX7LESS-NEXT:    s_add_u32 s48, s48, s11
-; GFX7LESS-NEXT:    s_addc_u32 s49, s49, 0
+; GFX7LESS-NEXT:    s_mov_b32 s64, SCRATCH_RSRC_DWORD0
+; GFX7LESS-NEXT:    s_mov_b32 s65, SCRATCH_RSRC_DWORD1
+; GFX7LESS-NEXT:    s_mov_b32 s66, -1
+; GFX7LESS-NEXT:    s_mov_b32 s67, 0xe8f000
+; GFX7LESS-NEXT:    s_add_u32 s64, s64, s11
+; GFX7LESS-NEXT:    s_addc_u32 s65, s65, 0
 ; GFX7LESS-NEXT:    s_mov_b32 s33, s10
-; GFX7LESS-NEXT:    s_mov_b32 s42, s9
-; GFX7LESS-NEXT:    s_mov_b32 s43, s8
+; GFX7LESS-NEXT:    s_mov_b32 s50, s9
+; GFX7LESS-NEXT:    s_mov_b32 s51, s8
 ; GFX7LESS-NEXT:    s_mov_b64 s[34:35], s[6:7]
 ; GFX7LESS-NEXT:    s_mov_b64 s[36:37], s[4:5]
 ; GFX7LESS-NEXT:    s_mov_b64 s[38:39], s[2:3]
-; GFX7LESS-NEXT:    s_mov_b64 s[40:41], s[0:1]
+; GFX7LESS-NEXT:    s_mov_b64 s[48:49], s[0:1]
 ; GFX7LESS-NEXT:    s_add_u32 s8, s36, 44
 ; GFX7LESS-NEXT:    s_addc_u32 s9, s37, 0
 ; GFX7LESS-NEXT:    s_getpc_b64 s[0:1]
@@ -7590,15 +7590,15 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau
 ; GFX7LESS-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
 ; GFX7LESS-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GFX7LESS-NEXT:    v_or_b32_e32 v40, v0, v2
-; GFX7LESS-NEXT:    s_mov_b64 s[4:5], s[40:41]
+; GFX7LESS-NEXT:    s_mov_b64 s[4:5], s[48:49]
 ; GFX7LESS-NEXT:    s_mov_b64 s[6:7], s[2:3]
 ; GFX7LESS-NEXT:    s_mov_b64 s[10:11], s[34:35]
-; GFX7LESS-NEXT:    s_mov_b32 s12, s43
-; GFX7LESS-NEXT:    s_mov_b32 s13, s42
+; GFX7LESS-NEXT:    s_mov_b32 s12, s51
+; GFX7LESS-NEXT:    s_mov_b32 s13, s50
 ; GFX7LESS-NEXT:    s_mov_b32 s14, s33
 ; GFX7LESS-NEXT:    v_mov_b32_e32 v31, v40
-; GFX7LESS-NEXT:    s_mov_b64 s[0:1], s[48:49]
-; GFX7LESS-NEXT:    s_mov_b64 s[2:3], s[50:51]
+; GFX7LESS-NEXT:    s_mov_b64 s[0:1], s[64:65]
+; GFX7LESS-NEXT:    s_mov_b64 s[2:3], s[66:67]
 ; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7LESS-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GFX7LESS-NEXT:    s_mov_b64 s[0:1], exec
@@ -7625,19 +7625,19 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau
 ; GFX7LESS-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
 ; GFX7LESS-NEXT:    s_cbranch_execz .LBB11_5
 ; GFX7LESS-NEXT:  ; %bb.3:
-; GFX7LESS-NEXT:    s_load_dwordx2 s[44:45], s[36:37], 0x9
-; GFX7LESS-NEXT:    s_mov_b32 s47, 0xf000
-; GFX7LESS-NEXT:    s_mov_b32 s46, -1
+; GFX7LESS-NEXT:    s_load_dwordx2 s[52:53], s[36:37], 0x9
+; GFX7LESS-NEXT:    s_mov_b32 s55, 0xf000
+; GFX7LESS-NEXT:    s_mov_b32 s54, -1
 ; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7LESS-NEXT:    buffer_load_dwordx2 v[0:1], off, s[44:47], 0
-; GFX7LESS-NEXT:    s_mov_b64 s[46:47], 0
+; GFX7LESS-NEXT:    buffer_load_dwordx2 v[0:1], off, s[52:55], 0
+; GFX7LESS-NEXT:    s_mov_b64 s[54:55], 0
 ; GFX7LESS-NEXT:    v_max_f64 v[41:42], v[2:3], v[2:3]
 ; GFX7LESS-NEXT:  .LBB11_4: ; %atomicrmw.start
 ; GFX7LESS-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX7LESS-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7LESS-NEXT:    v_max_f64 v[2:3], v[0:1], v[0:1]
-; GFX7LESS-NEXT:    buffer_store_dword v1, off, s[48:51], 0 offset:4
-; GFX7LESS-NEXT:    buffer_store_dword v0, off, s[48:51], 0
+; GFX7LESS-NEXT:    buffer_store_dword v1, off, s[64:67], 0 offset:4
+; GFX7LESS-NEXT:    buffer_store_dword v0, off, s[64:67], 0
 ; GFX7LESS-NEXT:    s_add_u32 s8, s36, 44
 ; GFX7LESS-NEXT:    s_waitcnt expcnt(0)
 ; GFX7LESS-NEXT:    v_min_f64 v[0:1], v[2:3], v[41:42]
@@ -7645,8 +7645,8 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau
 ; GFX7LESS-NEXT:    s_getpc_b64 s[0:1]
 ; GFX7LESS-NEXT:    s_add_u32 s0, s0, __atomic_compare_exchange at gotpcrel32@lo+4
 ; GFX7LESS-NEXT:    s_addc_u32 s1, s1, __atomic_compare_exchange at gotpcrel32@hi+12
-; GFX7LESS-NEXT:    buffer_store_dword v1, off, s[48:51], 0 offset:12
-; GFX7LESS-NEXT:    buffer_store_dword v0, off, s[48:51], 0 offset:8
+; GFX7LESS-NEXT:    buffer_store_dword v1, off, s[64:67], 0 offset:12
+; GFX7LESS-NEXT:    buffer_store_dword v0, off, s[64:67], 0 offset:8
 ; GFX7LESS-NEXT:    s_load_dwordx2 s[16:17], s[0:1], 0x0
 ; GFX7LESS-NEXT:    s_waitcnt expcnt(0)
 ; GFX7LESS-NEXT:    v_mov_b32_e32 v0, 8
@@ -7655,43 +7655,43 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau
 ; GFX7LESS-NEXT:    v_mov_b32_e32 v5, 8
 ; GFX7LESS-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX7LESS-NEXT:    v_mov_b32_e32 v7, 0
-; GFX7LESS-NEXT:    s_mov_b64 s[4:5], s[40:41]
+; GFX7LESS-NEXT:    s_mov_b64 s[4:5], s[48:49]
 ; GFX7LESS-NEXT:    s_mov_b64 s[6:7], s[38:39]
 ; GFX7LESS-NEXT:    s_mov_b64 s[10:11], s[34:35]
-; GFX7LESS-NEXT:    s_mov_b32 s12, s43
-; GFX7LESS-NEXT:    s_mov_b32 s13, s42
+; GFX7LESS-NEXT:    s_mov_b32 s12, s51
+; GFX7LESS-NEXT:    s_mov_b32 s13, s50
 ; GFX7LESS-NEXT:    s_mov_b32 s14, s33
 ; GFX7LESS-NEXT:    v_mov_b32_e32 v31, v40
-; GFX7LESS-NEXT:    s_mov_b64 s[0:1], s[48:49]
-; GFX7LESS-NEXT:    s_mov_b64 s[2:3], s[50:51]
-; GFX7LESS-NEXT:    v_mov_b32_e32 v2, s44
-; GFX7LESS-NEXT:    v_mov_b32_e32 v3, s45
+; GFX7LESS-NEXT:    s_mov_b64 s[0:1], s[64:65]
+; GFX7LESS-NEXT:    s_mov_b64 s[2:3], s[66:67]
+; GFX7LESS-NEXT:    v_mov_b32_e32 v2, s52
+; GFX7LESS-NEXT:    v_mov_b32_e32 v3, s53
 ; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7LESS-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GFX7LESS-NEXT:    v_and_b32_e32 v2, 1, v0
-; GFX7LESS-NEXT:    buffer_load_dword v0, off, s[48:51], 0
-; GFX7LESS-NEXT:    buffer_load_dword v1, off, s[48:51], 0 offset:4
+; GFX7LESS-NEXT:    buffer_load_dword v0, off, s[64:67], 0
+; GFX7LESS-NEXT:    buffer_load_dword v1, off, s[64:67], 0 offset:4
 ; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v2
-; GFX7LESS-NEXT:    s_or_b64 s[46:47], vcc, s[46:47]
-; GFX7LESS-NEXT:    s_andn2_b64 exec, exec, s[46:47]
+; GFX7LESS-NEXT:    s_or_b64 s[54:55], vcc, s[54:55]
+; GFX7LESS-NEXT:    s_andn2_b64 exec, exec, s[54:55]
 ; GFX7LESS-NEXT:    s_cbranch_execnz .LBB11_4
 ; GFX7LESS-NEXT:  .LBB11_5:
 ; GFX7LESS-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: global_atomic_fmin_double_uni_address_div_value_default_scope_unsafe:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_mov_b32 s48, SCRATCH_RSRC_DWORD0
-; GFX9-NEXT:    s_mov_b32 s49, SCRATCH_RSRC_DWORD1
-; GFX9-NEXT:    s_mov_b32 s50, -1
-; GFX9-NEXT:    s_mov_b32 s51, 0xe00000
-; GFX9-NEXT:    s_add_u32 s48, s48, s11
-; GFX9-NEXT:    s_addc_u32 s49, s49, 0
+; GFX9-NEXT:    s_mov_b32 s64, SCRATCH_RSRC_DWORD0
+; GFX9-NEXT:    s_mov_b32 s65, SCRATCH_RSRC_DWORD1
+; GFX9-NEXT:    s_mov_b32 s66, -1
+; GFX9-NEXT:    s_mov_b32 s67, 0xe00000
+; GFX9-NEXT:    s_add_u32 s64, s64, s11
+; GFX9-NEXT:    s_addc_u32 s65, s65, 0
 ; GFX9-NEXT:    s_mov_b64 s[36:37], s[4:5]
-; GFX9-NEXT:    s_mov_b32 s43, s8
+; GFX9-NEXT:    s_mov_b32 s51, s8
 ; GFX9-NEXT:    s_add_u32 s8, s36, 44
-; GFX9-NEXT:    s_mov_b32 s42, s9
+; GFX9-NEXT:    s_mov_b32 s50, s9
 ; GFX9-NEXT:    s_addc_u32 s9, s37, 0
-; GFX9-NEXT:    s_mov_b64 s[40:41], s[0:1]
+; GFX9-NEXT:    s_mov_b64 s[48:49], s[0:1]
 ; GFX9-NEXT:    s_getpc_b64 s[0:1]
 ; GFX9-NEXT:    s_add_u32 s0, s0, div.double.value at gotpcrel32@lo+4
 ; GFX9-NEXT:    s_addc_u32 s1, s1, div.double.value at gotpcrel32@hi+12
@@ -7703,14 +7703,14 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau
 ; GFX9-NEXT:    s_mov_b64 s[38:39], s[2:3]
 ; GFX9-NEXT:    v_or3_b32 v40, v0, v1, v2
 ; GFX9-NEXT:    s_mov_b64 s[6:7], s[2:3]
-; GFX9-NEXT:    s_mov_b64 s[0:1], s[48:49]
-; GFX9-NEXT:    s_mov_b64 s[4:5], s[40:41]
+; GFX9-NEXT:    s_mov_b64 s[0:1], s[64:65]
+; GFX9-NEXT:    s_mov_b64 s[4:5], s[48:49]
 ; GFX9-NEXT:    s_mov_b64 s[10:11], s[34:35]
-; GFX9-NEXT:    s_mov_b32 s12, s43
-; GFX9-NEXT:    s_mov_b32 s13, s42
+; GFX9-NEXT:    s_mov_b32 s12, s51
+; GFX9-NEXT:    s_mov_b32 s13, s50
 ; GFX9-NEXT:    s_mov_b32 s14, s33
 ; GFX9-NEXT:    v_mov_b32_e32 v31, v40
-; GFX9-NEXT:    s_mov_b64 s[2:3], s[50:51]
+; GFX9-NEXT:    s_mov_b64 s[2:3], s[66:67]
 ; GFX9-NEXT:    s_movk_i32 s32, 0x800
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[16:17]
@@ -7737,12 +7737,12 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau
 ; GFX9-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
 ; GFX9-NEXT:    s_cbranch_execz .LBB11_5
 ; GFX9-NEXT:  ; %bb.3:
-; GFX9-NEXT:    s_load_dwordx2 s[44:45], s[36:37], 0x24
+; GFX9-NEXT:    s_load_dwordx2 s[52:53], s[36:37], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    v_max_f64 v[41:42], v[2:3], v[2:3]
-; GFX9-NEXT:    s_mov_b64 s[46:47], 0
+; GFX9-NEXT:    s_mov_b64 s[54:55], 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dwordx2 v[4:5], v0, s[44:45]
+; GFX9-NEXT:    global_load_dwordx2 v[4:5], v0, s[52:53]
 ; GFX9-NEXT:  .LBB11_4: ; %atomicrmw.start
 ; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
@@ -7753,54 +7753,54 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau
 ; GFX9-NEXT:    s_add_u32 s0, s0, __atomic_compare_exchange at gotpcrel32@lo+4
 ; GFX9-NEXT:    s_addc_u32 s1, s1, __atomic_compare_exchange at gotpcrel32@hi+12
 ; GFX9-NEXT:    s_load_dwordx2 s[16:17], s[0:1], 0x0
-; GFX9-NEXT:    s_mov_b64 s[0:1], s[48:49]
-; GFX9-NEXT:    buffer_store_dword v5, off, s[48:51], 0 offset:4
-; GFX9-NEXT:    buffer_store_dword v4, off, s[48:51], 0
-; GFX9-NEXT:    s_mov_b64 s[4:5], s[40:41]
+; GFX9-NEXT:    s_mov_b64 s[0:1], s[64:65]
+; GFX9-NEXT:    buffer_store_dword v5, off, s[64:67], 0 offset:4
+; GFX9-NEXT:    buffer_store_dword v4, off, s[64:67], 0
+; GFX9-NEXT:    s_mov_b64 s[4:5], s[48:49]
 ; GFX9-NEXT:    v_min_f64 v[0:1], v[0:1], v[41:42]
 ; GFX9-NEXT:    s_mov_b64 s[6:7], s[38:39]
 ; GFX9-NEXT:    s_mov_b64 s[10:11], s[34:35]
-; GFX9-NEXT:    s_mov_b32 s12, s43
-; GFX9-NEXT:    s_mov_b32 s13, s42
+; GFX9-NEXT:    s_mov_b32 s12, s51
+; GFX9-NEXT:    s_mov_b32 s13, s50
 ; GFX9-NEXT:    s_mov_b32 s14, s33
 ; GFX9-NEXT:    v_mov_b32_e32 v31, v40
-; GFX9-NEXT:    s_mov_b64 s[2:3], s[50:51]
-; GFX9-NEXT:    buffer_store_dword v1, off, s[48:51], 0 offset:12
-; GFX9-NEXT:    buffer_store_dword v0, off, s[48:51], 0 offset:8
+; GFX9-NEXT:    s_mov_b64 s[2:3], s[66:67]
+; GFX9-NEXT:    buffer_store_dword v1, off, s[64:67], 0 offset:12
+; GFX9-NEXT:    buffer_store_dword v0, off, s[64:67], 0 offset:8
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 8
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0
-; GFX9-NEXT:    v_mov_b32_e32 v2, s44
-; GFX9-NEXT:    v_mov_b32_e32 v3, s45
+; GFX9-NEXT:    v_mov_b32_e32 v2, s52
+; GFX9-NEXT:    v_mov_b32_e32 v3, s53
 ; GFX9-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v5, 8
 ; GFX9-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v7, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[16:17]
-; GFX9-NEXT:    buffer_load_dword v4, off, s[48:51], 0
-; GFX9-NEXT:    buffer_load_dword v5, off, s[48:51], 0 offset:4
+; GFX9-NEXT:    buffer_load_dword v4, off, s[64:67], 0
+; GFX9-NEXT:    buffer_load_dword v5, off, s[64:67], 0 offset:4
 ; GFX9-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX9-NEXT:    s_or_b64 s[46:47], vcc, s[46:47]
-; GFX9-NEXT:    s_andn2_b64 exec, exec, s[46:47]
+; GFX9-NEXT:    s_or_b64 s[54:55], vcc, s[54:55]
+; GFX9-NEXT:    s_andn2_b64 exec, exec, s[54:55]
 ; GFX9-NEXT:    s_cbranch_execnz .LBB11_4
 ; GFX9-NEXT:  .LBB11_5:
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX1064-LABEL: global_atomic_fmin_double_uni_address_div_value_default_scope_unsafe:
 ; GFX1064:       ; %bb.0:
-; GFX1064-NEXT:    s_mov_b32 s48, SCRATCH_RSRC_DWORD0
-; GFX1064-NEXT:    s_mov_b32 s49, SCRATCH_RSRC_DWORD1
-; GFX1064-NEXT:    s_mov_b32 s50, -1
-; GFX1064-NEXT:    s_mov_b32 s51, 0x31e16000
-; GFX1064-NEXT:    s_add_u32 s48, s48, s11
+; GFX1064-NEXT:    s_mov_b32 s64, SCRATCH_RSRC_DWORD0
+; GFX1064-NEXT:    s_mov_b32 s65, SCRATCH_RSRC_DWORD1
+; GFX1064-NEXT:    s_mov_b32 s66, -1
+; GFX1064-NEXT:    s_mov_b32 s67, 0x31e16000
+; GFX1064-NEXT:    s_add_u32 s64, s64, s11
 ; GFX1064-NEXT:    s_mov_b64 s[34:35], s[4:5]
-; GFX1064-NEXT:    s_addc_u32 s49, s49, 0
-; GFX1064-NEXT:    s_mov_b32 s43, s8
+; GFX1064-NEXT:    s_addc_u32 s65, s65, 0
+; GFX1064-NEXT:    s_mov_b32 s51, s8
 ; GFX1064-NEXT:    s_add_u32 s8, s34, 44
-; GFX1064-NEXT:    s_mov_b32 s42, s9
+; GFX1064-NEXT:    s_mov_b32 s50, s9
 ; GFX1064-NEXT:    s_addc_u32 s9, s35, 0
-; GFX1064-NEXT:    s_mov_b64 s[40:41], s[0:1]
+; GFX1064-NEXT:    s_mov_b64 s[48:49], s[0:1]
 ; GFX1064-NEXT:    s_getpc_b64 s[0:1]
 ; GFX1064-NEXT:    s_add_u32 s0, s0, div.double.value at gotpcrel32@lo+4
 ; GFX1064-NEXT:    s_addc_u32 s1, s1, div.double.value at gotpcrel32@hi+12
@@ -7812,14 +7812,14 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau
 ; GFX1064-NEXT:    s_mov_b64 s[38:39], s[2:3]
 ; GFX1064-NEXT:    s_mov_b64 s[6:7], s[2:3]
 ; GFX1064-NEXT:    v_or3_b32 v40, v0, v1, v2
-; GFX1064-NEXT:    s_mov_b64 s[0:1], s[48:49]
-; GFX1064-NEXT:    s_mov_b64 s[4:5], s[40:41]
+; GFX1064-NEXT:    s_mov_b64 s[0:1], s[64:65]
+; GFX1064-NEXT:    s_mov_b64 s[4:5], s[48:49]
 ; GFX1064-NEXT:    s_mov_b64 s[10:11], s[36:37]
-; GFX1064-NEXT:    s_mov_b32 s12, s43
+; GFX1064-NEXT:    s_mov_b32 s12, s51
 ; GFX1064-NEXT:    v_mov_b32_e32 v31, v40
-; GFX1064-NEXT:    s_mov_b32 s13, s42
+; GFX1064-NEXT:    s_mov_b32 s13, s50
 ; GFX1064-NEXT:    s_mov_b32 s14, s33
-; GFX1064-NEXT:    s_mov_b64 s[2:3], s[50:51]
+; GFX1064-NEXT:    s_mov_b64 s[2:3], s[66:67]
 ; GFX1064-NEXT:    s_movk_i32 s32, 0x800
 ; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1064-NEXT:    s_swappc_b64 s[30:31], s[16:17]
@@ -7846,12 +7846,12 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau
 ; GFX1064-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
 ; GFX1064-NEXT:    s_cbranch_execz .LBB11_5
 ; GFX1064-NEXT:  ; %bb.3:
-; GFX1064-NEXT:    s_load_dwordx2 s[44:45], s[34:35], 0x24
+; GFX1064-NEXT:    s_load_dwordx2 s[52:53], s[34:35], 0x24
 ; GFX1064-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1064-NEXT:    v_max_f64 v[41:42], v[2:3], v[2:3]
-; GFX1064-NEXT:    s_mov_b64 s[46:47], 0
+; GFX1064-NEXT:    s_mov_b64 s[54:55], 0
 ; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT:    global_load_dwordx2 v[4:5], v0, s[44:45]
+; GFX1064-NEXT:    global_load_dwordx2 v[4:5], v0, s[52:53]
 ; GFX1064-NEXT:  .LBB11_4: ; %atomicrmw.start
 ; GFX1064-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1064-NEXT:    s_waitcnt vmcnt(0)
@@ -7861,56 +7861,56 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau
 ; GFX1064-NEXT:    s_getpc_b64 s[0:1]
 ; GFX1064-NEXT:    s_add_u32 s0, s0, __atomic_compare_exchange at gotpcrel32@lo+4
 ; GFX1064-NEXT:    s_addc_u32 s1, s1, __atomic_compare_exchange at gotpcrel32@hi+12
-; GFX1064-NEXT:    buffer_store_dword v5, off, s[48:51], 0 offset:4
-; GFX1064-NEXT:    buffer_store_dword v4, off, s[48:51], 0
+; GFX1064-NEXT:    buffer_store_dword v5, off, s[64:67], 0 offset:4
+; GFX1064-NEXT:    buffer_store_dword v4, off, s[64:67], 0
 ; GFX1064-NEXT:    s_load_dwordx2 s[16:17], s[0:1], 0x0
 ; GFX1064-NEXT:    v_mov_b32_e32 v31, v40
-; GFX1064-NEXT:    v_mov_b32_e32 v2, s44
-; GFX1064-NEXT:    v_mov_b32_e32 v3, s45
+; GFX1064-NEXT:    v_mov_b32_e32 v2, s52
+; GFX1064-NEXT:    v_mov_b32_e32 v3, s53
 ; GFX1064-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX1064-NEXT:    v_mov_b32_e32 v5, 8
 ; GFX1064-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX1064-NEXT:    v_mov_b32_e32 v7, 0
-; GFX1064-NEXT:    s_mov_b64 s[0:1], s[48:49]
-; GFX1064-NEXT:    s_mov_b64 s[4:5], s[40:41]
+; GFX1064-NEXT:    s_mov_b64 s[0:1], s[64:65]
+; GFX1064-NEXT:    s_mov_b64 s[4:5], s[48:49]
 ; GFX1064-NEXT:    s_mov_b64 s[6:7], s[38:39]
 ; GFX1064-NEXT:    s_mov_b64 s[10:11], s[36:37]
-; GFX1064-NEXT:    s_mov_b32 s12, s43
-; GFX1064-NEXT:    s_mov_b32 s13, s42
+; GFX1064-NEXT:    s_mov_b32 s12, s51
+; GFX1064-NEXT:    s_mov_b32 s13, s50
 ; GFX1064-NEXT:    s_mov_b32 s14, s33
-; GFX1064-NEXT:    s_mov_b64 s[2:3], s[50:51]
+; GFX1064-NEXT:    s_mov_b64 s[2:3], s[66:67]
 ; GFX1064-NEXT:    v_min_f64 v[0:1], v[0:1], v[41:42]
-; GFX1064-NEXT:    buffer_store_dword v1, off, s[48:51], 0 offset:12
-; GFX1064-NEXT:    buffer_store_dword v0, off, s[48:51], 0 offset:8
+; GFX1064-NEXT:    buffer_store_dword v1, off, s[64:67], 0 offset:12
+; GFX1064-NEXT:    buffer_store_dword v0, off, s[64:67], 0 offset:8
 ; GFX1064-NEXT:    v_mov_b32_e32 v0, 8
 ; GFX1064-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1064-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GFX1064-NEXT:    s_clause 0x1
-; GFX1064-NEXT:    buffer_load_dword v4, off, s[48:51], 0
-; GFX1064-NEXT:    buffer_load_dword v5, off, s[48:51], 0 offset:4
+; GFX1064-NEXT:    buffer_load_dword v4, off, s[64:67], 0
+; GFX1064-NEXT:    buffer_load_dword v5, off, s[64:67], 0 offset:4
 ; GFX1064-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX1064-NEXT:    s_or_b64 s[46:47], vcc, s[46:47]
-; GFX1064-NEXT:    s_andn2_b64 exec, exec, s[46:47]
+; GFX1064-NEXT:    s_or_b64 s[54:55], vcc, s[54:55]
+; GFX1064-NEXT:    s_andn2_b64 exec, exec, s[54:55]
 ; GFX1064-NEXT:    s_cbranch_execnz .LBB11_4
 ; GFX1064-NEXT:  .LBB11_5:
 ; GFX1064-NEXT:    s_endpgm
 ;
 ; GFX1032-LABEL: global_atomic_fmin_double_uni_address_div_value_default_scope_unsafe:
 ; GFX1032:       ; %bb.0:
-; GFX1032-NEXT:    s_mov_b32 s48, SCRATCH_RSRC_DWORD0
-; GFX1032-NEXT:    s_mov_b32 s49, SCRATCH_RSRC_DWORD1
-; GFX1032-NEXT:    s_mov_b32 s50, -1
-; GFX1032-NEXT:    s_mov_b32 s51, 0x31c16000
-; GFX1032-NEXT:    s_add_u32 s48, s48, s11
+; GFX1032-NEXT:    s_mov_b32 s64, SCRATCH_RSRC_DWORD0
+; GFX1032-NEXT:    s_mov_b32 s65, SCRATCH_RSRC_DWORD1
+; GFX1032-NEXT:    s_mov_b32 s66, -1
+; GFX1032-NEXT:    s_mov_b32 s67, 0x31c16000
+; GFX1032-NEXT:    s_add_u32 s64, s64, s11
 ; GFX1032-NEXT:    s_mov_b64 s[34:35], s[4:5]
-; GFX1032-NEXT:    s_addc_u32 s49, s49, 0
-; GFX1032-NEXT:    s_mov_b32 s43, s8
+; GFX1032-NEXT:    s_addc_u32 s65, s65, 0
+; GFX1032-NEXT:    s_mov_b32 s51, s8
 ; GFX1032-NEXT:    s_add_u32 s8, s34, 44
-; GFX1032-NEXT:    s_mov_b32 s42, s9
+; GFX1032-NEXT:    s_mov_b32 s50, s9
 ; GFX1032-NEXT:    s_addc_u32 s9, s35, 0
-; GFX1032-NEXT:    s_mov_b64 s[40:41], s[0:1]
+; GFX1032-NEXT:    s_mov_b64 s[48:49], s[0:1]
 ; GFX1032-NEXT:    s_getpc_b64 s[0:1]
 ; GFX1032-NEXT:    s_add_u32 s0, s0, div.double.value at gotpcrel32@lo+4
 ; GFX1032-NEXT:    s_addc_u32 s1, s1, div.double.value at gotpcrel32@hi+12
@@ -7922,14 +7922,14 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau
 ; GFX1032-NEXT:    s_mov_b64 s[38:39], s[2:3]
 ; GFX1032-NEXT:    s_mov_b64 s[6:7], s[2:3]
 ; GFX1032-NEXT:    v_or3_b32 v40, v0, v1, v2
-; GFX1032-NEXT:    s_mov_b64 s[0:1], s[48:49]
-; GFX1032-NEXT:    s_mov_b64 s[4:5], s[40:41]
+; GFX1032-NEXT:    s_mov_b64 s[0:1], s[64:65]
+; GFX1032-NEXT:    s_mov_b64 s[4:5], s[48:49]
 ; GFX1032-NEXT:    s_mov_b64 s[10:11], s[36:37]
-; GFX1032-NEXT:    s_mov_b32 s12, s43
+; GFX1032-NEXT:    s_mov_b32 s12, s51
 ; GFX1032-NEXT:    v_mov_b32_e32 v31, v40
-; GFX1032-NEXT:    s_mov_b32 s13, s42
+; GFX1032-NEXT:    s_mov_b32 s13, s50
 ; GFX1032-NEXT:    s_mov_b32 s14, s33
-; GFX1032-NEXT:    s_mov_b64 s[2:3], s[50:51]
+; GFX1032-NEXT:    s_mov_b64 s[2:3], s[66:67]
 ; GFX1032-NEXT:    s_movk_i32 s32, 0x400
 ; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1032-NEXT:    s_swappc_b64 s[30:31], s[16:17]
@@ -7950,17 +7950,17 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau
 ; GFX1032-NEXT:    s_cbranch_scc1 .LBB11_1
 ; GFX1032-NEXT:  ; %bb.2: ; %ComputeEnd
 ; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1032-NEXT:    s_mov_b32 s46, 0
+; GFX1032-NEXT:    s_mov_b32 s54, 0
 ; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX1032-NEXT:    s_and_saveexec_b32 s0, vcc_lo
 ; GFX1032-NEXT:    s_xor_b32 s0, exec_lo, s0
 ; GFX1032-NEXT:    s_cbranch_execz .LBB11_5
 ; GFX1032-NEXT:  ; %bb.3:
-; GFX1032-NEXT:    s_load_dwordx2 s[44:45], s[34:35], 0x24
+; GFX1032-NEXT:    s_load_dwordx2 s[52:53], s[34:35], 0x24
 ; GFX1032-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1032-NEXT:    v_max_f64 v[41:42], v[2:3], v[2:3]
 ; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT:    global_load_dwordx2 v[4:5], v0, s[44:45]
+; GFX1032-NEXT:    global_load_dwordx2 v[4:5], v0, s[52:53]
 ; GFX1032-NEXT:  .LBB11_4: ; %atomicrmw.start
 ; GFX1032-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1032-NEXT:    s_waitcnt vmcnt(0)
@@ -7970,38 +7970,38 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau
 ; GFX1032-NEXT:    s_getpc_b64 s[0:1]
 ; GFX1032-NEXT:    s_add_u32 s0, s0, __atomic_compare_exchange at gotpcrel32@lo+4
 ; GFX1032-NEXT:    s_addc_u32 s1, s1, __atomic_compare_exchange at gotpcrel32@hi+12
-; GFX1032-NEXT:    buffer_store_dword v5, off, s[48:51], 0 offset:4
-; GFX1032-NEXT:    buffer_store_dword v4, off, s[48:51], 0
+; GFX1032-NEXT:    buffer_store_dword v5, off, s[64:67], 0 offset:4
+; GFX1032-NEXT:    buffer_store_dword v4, off, s[64:67], 0
 ; GFX1032-NEXT:    s_load_dwordx2 s[16:17], s[0:1], 0x0
 ; GFX1032-NEXT:    v_mov_b32_e32 v31, v40
-; GFX1032-NEXT:    v_mov_b32_e32 v2, s44
-; GFX1032-NEXT:    v_mov_b32_e32 v3, s45
+; GFX1032-NEXT:    v_mov_b32_e32 v2, s52
+; GFX1032-NEXT:    v_mov_b32_e32 v3, s53
 ; GFX1032-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX1032-NEXT:    v_mov_b32_e32 v5, 8
 ; GFX1032-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX1032-NEXT:    v_mov_b32_e32 v7, 0
-; GFX1032-NEXT:    s_mov_b64 s[0:1], s[48:49]
-; GFX1032-NEXT:    s_mov_b64 s[4:5], s[40:41]
+; GFX1032-NEXT:    s_mov_b64 s[0:1], s[64:65]
+; GFX1032-NEXT:    s_mov_b64 s[4:5], s[48:49]
 ; GFX1032-NEXT:    s_mov_b64 s[6:7], s[38:39]
 ; GFX1032-NEXT:    s_mov_b64 s[10:11], s[36:37]
-; GFX1032-NEXT:    s_mov_b32 s12, s43
-; GFX1032-NEXT:    s_mov_b32 s13, s42
+; GFX1032-NEXT:    s_mov_b32 s12, s51
+; GFX1032-NEXT:    s_mov_b32 s13, s50
 ; GFX1032-NEXT:    s_mov_b32 s14, s33
-; GFX1032-NEXT:    s_mov_b64 s[2:3], s[50:51]
+; GFX1032-NEXT:    s_mov_b64 s[2:3], s[66:67]
 ; GFX1032-NEXT:    v_min_f64 v[0:1], v[0:1], v[41:42]
-; GFX1032-NEXT:    buffer_store_dword v1, off, s[48:51], 0 offset:12
-; GFX1032-NEXT:    buffer_store_dword v0, off, s[48:51], 0 offset:8
+; GFX1032-NEXT:    buffer_store_dword v1, off, s[64:67], 0 offset:12
+; GFX1032-NEXT:    buffer_store_dword v0, off, s[64:67], 0 offset:8
 ; GFX1032-NEXT:    v_mov_b32_e32 v0, 8
 ; GFX1032-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1032-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GFX1032-NEXT:    s_clause 0x1
-; GFX1032-NEXT:    buffer_load_dword v4, off, s[48:51], 0
-; GFX1032-NEXT:    buffer_load_dword v5, off, s[48:51], 0 offset:4
+; GFX1032-NEXT:    buffer_load_dword v4, off, s[64:67], 0
+; GFX1032-NEXT:    buffer_load_dword v5, off, s[64:67], 0 offset:4
 ; GFX1032-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX1032-NEXT:    s_or_b32 s46, vcc_lo, s46
-; GFX1032-NEXT:    s_andn2_b32 exec_lo, exec_lo, s46
+; GFX1032-NEXT:    s_or_b32 s54, vcc_lo, s54
+; GFX1032-NEXT:    s_andn2_b32 exec_lo, exec_lo, s54
 ; GFX1032-NEXT:    s_cbranch_execnz .LBB11_4
 ; GFX1032-NEXT:  .LBB11_5:
 ; GFX1032-NEXT:    s_endpgm
@@ -8009,11 +8009,11 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau
 ; GFX1164-LABEL: global_atomic_fmin_double_uni_address_div_value_default_scope_unsafe:
 ; GFX1164:       ; %bb.0:
 ; GFX1164-NEXT:    s_mov_b64 s[34:35], s[4:5]
-; GFX1164-NEXT:    s_mov_b32 s43, s8
+; GFX1164-NEXT:    s_mov_b32 s51, s8
 ; GFX1164-NEXT:    s_add_u32 s8, s34, 44
-; GFX1164-NEXT:    s_mov_b32 s42, s9
+; GFX1164-NEXT:    s_mov_b32 s50, s9
 ; GFX1164-NEXT:    s_addc_u32 s9, s35, 0
-; GFX1164-NEXT:    s_mov_b64 s[40:41], s[0:1]
+; GFX1164-NEXT:    s_mov_b64 s[48:49], s[0:1]
 ; GFX1164-NEXT:    s_getpc_b64 s[0:1]
 ; GFX1164-NEXT:    s_add_u32 s0, s0, div.double.value at gotpcrel32@lo+4
 ; GFX1164-NEXT:    s_addc_u32 s1, s1, div.double.value at gotpcrel32@hi+12
@@ -8021,11 +8021,11 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau
 ; GFX1164-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
 ; GFX1164-NEXT:    s_mov_b32 s33, s10
 ; GFX1164-NEXT:    s_mov_b64 s[36:37], s[6:7]
-; GFX1164-NEXT:    s_mov_b64 s[4:5], s[40:41]
+; GFX1164-NEXT:    s_mov_b64 s[4:5], s[48:49]
 ; GFX1164-NEXT:    s_mov_b64 s[6:7], s[2:3]
 ; GFX1164-NEXT:    s_mov_b64 s[10:11], s[36:37]
-; GFX1164-NEXT:    s_mov_b32 s12, s43
-; GFX1164-NEXT:    s_mov_b32 s13, s42
+; GFX1164-NEXT:    s_mov_b32 s12, s51
+; GFX1164-NEXT:    s_mov_b32 s13, s50
 ; GFX1164-NEXT:    s_mov_b32 s14, s33
 ; GFX1164-NEXT:    s_mov_b32 s32, 32
 ; GFX1164-NEXT:    v_mov_b32_e32 v40, v0
@@ -8059,12 +8059,12 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau
 ; GFX1164-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
 ; GFX1164-NEXT:    s_cbranch_execz .LBB11_5
 ; GFX1164-NEXT:  ; %bb.3:
-; GFX1164-NEXT:    s_load_b64 s[44:45], s[34:35], 0x24
+; GFX1164-NEXT:    s_load_b64 s[52:53], s[34:35], 0x24
 ; GFX1164-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1164-NEXT:    v_max_f64 v[41:42], v[2:3], v[2:3]
-; GFX1164-NEXT:    s_mov_b64 s[46:47], 0
+; GFX1164-NEXT:    s_mov_b64 s[54:55], 0
 ; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT:    global_load_b64 v[4:5], v0, s[44:45]
+; GFX1164-NEXT:    global_load_b64 v[4:5], v0, s[52:53]
 ; GFX1164-NEXT:    s_set_inst_prefetch_distance 0x1
 ; GFX1164-NEXT:    .p2align 6
 ; GFX1164-NEXT:  .LBB11_4: ; %atomicrmw.start
@@ -8078,15 +8078,15 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau
 ; GFX1164-NEXT:    s_addc_u32 s1, s1, __atomic_compare_exchange at gotpcrel32@hi+12
 ; GFX1164-NEXT:    v_mov_b32_e32 v31, v40
 ; GFX1164-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
-; GFX1164-NEXT:    v_mov_b32_e32 v2, s44
-; GFX1164-NEXT:    v_mov_b32_e32 v3, s45
+; GFX1164-NEXT:    v_mov_b32_e32 v2, s52
+; GFX1164-NEXT:    v_mov_b32_e32 v3, s53
 ; GFX1164-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX1164-NEXT:    v_mov_b32_e32 v7, 0
-; GFX1164-NEXT:    s_mov_b64 s[4:5], s[40:41]
+; GFX1164-NEXT:    s_mov_b64 s[4:5], s[48:49]
 ; GFX1164-NEXT:    s_mov_b64 s[6:7], s[38:39]
 ; GFX1164-NEXT:    s_mov_b64 s[10:11], s[36:37]
-; GFX1164-NEXT:    s_mov_b32 s12, s43
-; GFX1164-NEXT:    s_mov_b32 s13, s42
+; GFX1164-NEXT:    s_mov_b32 s12, s51
+; GFX1164-NEXT:    s_mov_b32 s13, s50
 ; GFX1164-NEXT:    s_mov_b32 s14, s33
 ; GFX1164-NEXT:    v_min_f64 v[0:1], v[0:1], v[41:42]
 ; GFX1164-NEXT:    scratch_store_b64 off, v[4:5], off
@@ -8101,8 +8101,8 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau
 ; GFX1164-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
 ; GFX1164-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX1164-NEXT:    s_or_b64 s[46:47], vcc, s[46:47]
-; GFX1164-NEXT:    s_and_not1_b64 exec, exec, s[46:47]
+; GFX1164-NEXT:    s_or_b64 s[54:55], vcc, s[54:55]
+; GFX1164-NEXT:    s_and_not1_b64 exec, exec, s[54:55]
 ; GFX1164-NEXT:    s_cbranch_execnz .LBB11_4
 ; GFX1164-NEXT:  .LBB11_5:
 ; GFX1164-NEXT:    s_set_inst_prefetch_distance 0x2
@@ -8111,7 +8111,7 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau
 ; GFX1132-LABEL: global_atomic_fmin_double_uni_address_div_value_default_scope_unsafe:
 ; GFX1132:       ; %bb.0:
 ; GFX1132-NEXT:    s_mov_b64 s[34:35], s[4:5]
-; GFX1132-NEXT:    s_mov_b64 s[40:41], s[0:1]
+; GFX1132-NEXT:    s_mov_b64 s[48:49], s[0:1]
 ; GFX1132-NEXT:    s_add_u32 s8, s34, 44
 ; GFX1132-NEXT:    s_addc_u32 s9, s35, 0
 ; GFX1132-NEXT:    s_getpc_b64 s[0:1]
@@ -8120,9 +8120,9 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau
 ; GFX1132-NEXT:    v_mov_b32_e32 v31, v0
 ; GFX1132-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
 ; GFX1132-NEXT:    s_mov_b64 s[36:37], s[6:7]
-; GFX1132-NEXT:    s_mov_b32 s42, s14
-; GFX1132-NEXT:    s_mov_b32 s43, s13
-; GFX1132-NEXT:    s_mov_b64 s[4:5], s[40:41]
+; GFX1132-NEXT:    s_mov_b32 s50, s14
+; GFX1132-NEXT:    s_mov_b32 s51, s13
+; GFX1132-NEXT:    s_mov_b64 s[4:5], s[48:49]
 ; GFX1132-NEXT:    s_mov_b64 s[6:7], s[2:3]
 ; GFX1132-NEXT:    s_mov_b64 s[10:11], s[36:37]
 ; GFX1132-NEXT:    s_mov_b32 s12, s13
@@ -8154,19 +8154,19 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau
 ; GFX1132-NEXT:    s_cbranch_scc1 .LBB11_1
 ; GFX1132-NEXT:  ; %bb.2: ; %ComputeEnd
 ; GFX1132-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1132-NEXT:    s_mov_b32 s46, 0
+; GFX1132-NEXT:    s_mov_b32 s54, 0
 ; GFX1132-NEXT:    s_mov_b32 s0, exec_lo
 ; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1132-NEXT:    v_cmpx_eq_u32_e32 0, v0
 ; GFX1132-NEXT:    s_xor_b32 s0, exec_lo, s0
 ; GFX1132-NEXT:    s_cbranch_execz .LBB11_5
 ; GFX1132-NEXT:  ; %bb.3:
-; GFX1132-NEXT:    s_load_b64 s[44:45], s[34:35], 0x24
+; GFX1132-NEXT:    s_load_b64 s[52:53], s[34:35], 0x24
 ; GFX1132-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_4)
 ; GFX1132-NEXT:    v_max_f64 v[41:42], v[2:3], v[2:3]
 ; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT:    global_load_b64 v[4:5], v0, s[44:45]
+; GFX1132-NEXT:    global_load_b64 v[4:5], v0, s[52:53]
 ; GFX1132-NEXT:    s_set_inst_prefetch_distance 0x1
 ; GFX1132-NEXT:    .p2align 6
 ; GFX1132-NEXT:  .LBB11_4: ; %atomicrmw.start
@@ -8180,16 +8180,16 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau
 ; GFX1132-NEXT:    s_addc_u32 s1, s1, __atomic_compare_exchange at gotpcrel32@hi+12
 ; GFX1132-NEXT:    v_mov_b32_e32 v31, v40
 ; GFX1132-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
-; GFX1132-NEXT:    v_mov_b32_e32 v3, s45
+; GFX1132-NEXT:    v_mov_b32_e32 v3, s53
 ; GFX1132-NEXT:    v_mov_b32_e32 v7, 0
-; GFX1132-NEXT:    s_mov_b64 s[4:5], s[40:41]
+; GFX1132-NEXT:    s_mov_b64 s[4:5], s[48:49]
 ; GFX1132-NEXT:    s_mov_b64 s[6:7], s[38:39]
 ; GFX1132-NEXT:    s_mov_b64 s[10:11], s[36:37]
-; GFX1132-NEXT:    s_mov_b32 s12, s43
-; GFX1132-NEXT:    s_mov_b32 s13, s42
+; GFX1132-NEXT:    s_mov_b32 s12, s51
+; GFX1132-NEXT:    s_mov_b32 s13, s50
 ; GFX1132-NEXT:    s_mov_b32 s14, s33
 ; GFX1132-NEXT:    v_mov_b32_e32 v6, 0
-; GFX1132-NEXT:    v_mov_b32_e32 v2, s44
+; GFX1132-NEXT:    v_mov_b32_e32 v2, s52
 ; GFX1132-NEXT:    v_min_f64 v[0:1], v[0:1], v[41:42]
 ; GFX1132-NEXT:    scratch_store_b64 off, v[4:5], off
 ; GFX1132-NEXT:    v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v5, 8
@@ -8201,8 +8201,8 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau
 ; GFX1132-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
 ; GFX1132-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX1132-NEXT:    s_or_b32 s46, vcc_lo, s46
-; GFX1132-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s46
+; GFX1132-NEXT:    s_or_b32 s54, vcc_lo, s54
+; GFX1132-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s54
 ; GFX1132-NEXT:    s_cbranch_execnz .LBB11_4
 ; GFX1132-NEXT:  .LBB11_5:
 ; GFX1132-NEXT:    s_set_inst_prefetch_distance 0x2
@@ -8211,22 +8211,22 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau
 ; GFX7LESS-DPP-LABEL: global_atomic_fmin_double_uni_address_div_value_default_scope_unsafe:
 ; GFX7LESS-DPP:       ; %bb.0:
 ; GFX7LESS-DPP-NEXT:    s_movk_i32 s32, 0x800
-; GFX7LESS-DPP-NEXT:    s_mov_b32 s48, SCRATCH_RSRC_DWORD0
-; GFX7LESS-DPP-NEXT:    s_mov_b32 s49, SCRATCH_RSRC_DWORD1
-; GFX7LESS-DPP-NEXT:    s_mov_b32 s50, -1
-; GFX7LESS-DPP-NEXT:    s_mov_b32 s51, 0xe8f000
-; GFX7LESS-DPP-NEXT:    s_add_u32 s48, s48, s11
-; GFX7LESS-DPP-NEXT:    s_addc_u32 s49, s49, 0
+; GFX7LESS-DPP-NEXT:    s_mov_b32 s64, SCRATCH_RSRC_DWORD0
+; GFX7LESS-DPP-NEXT:    s_mov_b32 s65, SCRATCH_RSRC_DWORD1
+; GFX7LESS-DPP-NEXT:    s_mov_b32 s66, -1
+; GFX7LESS-DPP-NEXT:    s_mov_b32 s67, 0xe8f000
+; GFX7LESS-DPP-NEXT:    s_add_u32 s64, s64, s11
+; GFX7LESS-DPP-NEXT:    s_addc_u32 s65, s65, 0
 ; GFX7LESS-DPP-NEXT:    s_mov_b32 s33, s10
-; GFX7LESS-DPP-NEXT:    s_mov_b32 s42, s9
-; GFX7LESS-DPP-NEXT:    s_mov_b32 s43, s8
+; GFX7LESS-DPP-NEXT:    s_mov_b32 s50, s9
+; GFX7LESS-DPP-NEXT:    s_mov_b32 s51, s8
 ; GFX7LESS-DPP-NEXT:    s_mov_b64 s[34:35], s[6:7]
 ; GFX7LESS-DPP-NEXT:    s_mov_b64 s[36:37], s[4:5]
 ; GFX7LESS-DPP-NEXT:    s_mov_b64 s[38:39], s[2:3]
-; GFX7LESS-DPP-NEXT:    s_mov_b64 s[40:41], s[0:1]
-; GFX7LESS-DPP-NEXT:    s_load_dwordx2 s[44:45], s[4:5], 0x9
-; GFX7LESS-DPP-NEXT:    s_mov_b32 s47, 0xf000
-; GFX7LESS-DPP-NEXT:    s_mov_b32 s46, -1
+; GFX7LESS-DPP-NEXT:    s_mov_b64 s[48:49], s[0:1]
+; GFX7LESS-DPP-NEXT:    s_load_dwordx2 s[52:53], s[4:5], 0x9
+; GFX7LESS-DPP-NEXT:    s_mov_b32 s55, 0xf000
+; GFX7LESS-DPP-NEXT:    s_mov_b32 s54, -1
 ; GFX7LESS-DPP-NEXT:    s_add_u32 s8, s36, 44
 ; GFX7LESS-DPP-NEXT:    s_addc_u32 s9, s37, 0
 ; GFX7LESS-DPP-NEXT:    s_getpc_b64 s[0:1]
@@ -8237,34 +8237,34 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau
 ; GFX7LESS-DPP-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
 ; GFX7LESS-DPP-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GFX7LESS-DPP-NEXT:    v_or_b32_e32 v40, v0, v2
-; GFX7LESS-DPP-NEXT:    s_mov_b64 s[4:5], s[40:41]
+; GFX7LESS-DPP-NEXT:    s_mov_b64 s[4:5], s[48:49]
 ; GFX7LESS-DPP-NEXT:    s_mov_b64 s[6:7], s[2:3]
 ; GFX7LESS-DPP-NEXT:    s_mov_b64 s[10:11], s[34:35]
-; GFX7LESS-DPP-NEXT:    s_mov_b32 s12, s43
-; GFX7LESS-DPP-NEXT:    s_mov_b32 s13, s42
+; GFX7LESS-DPP-NEXT:    s_mov_b32 s12, s51
+; GFX7LESS-DPP-NEXT:    s_mov_b32 s13, s50
 ; GFX7LESS-DPP-NEXT:    s_mov_b32 s14, s33
 ; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v31, v40
-; GFX7LESS-DPP-NEXT:    s_mov_b64 s[0:1], s[48:49]
-; GFX7LESS-DPP-NEXT:    s_mov_b64 s[2:3], s[50:51]
+; GFX7LESS-DPP-NEXT:    s_mov_b64 s[0:1], s[64:65]
+; GFX7LESS-DPP-NEXT:    s_mov_b64 s[2:3], s[66:67]
 ; GFX7LESS-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7LESS-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
-; GFX7LESS-DPP-NEXT:    buffer_load_dwordx2 v[2:3], off, s[44:47], 0
-; GFX7LESS-DPP-NEXT:    s_mov_b64 s[46:47], 0
+; GFX7LESS-DPP-NEXT:    buffer_load_dwordx2 v[2:3], off, s[52:55], 0
+; GFX7LESS-DPP-NEXT:    s_mov_b64 s[54:55], 0
 ; GFX7LESS-DPP-NEXT:    v_max_f64 v[41:42], v[0:1], v[0:1]
 ; GFX7LESS-DPP-NEXT:  .LBB11_1: ; %atomicrmw.start
 ; GFX7LESS-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX7LESS-DPP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7LESS-DPP-NEXT:    v_max_f64 v[0:1], v[2:3], v[2:3]
-; GFX7LESS-DPP-NEXT:    buffer_store_dword v3, off, s[48:51], 0 offset:4
-; GFX7LESS-DPP-NEXT:    buffer_store_dword v2, off, s[48:51], 0
+; GFX7LESS-DPP-NEXT:    buffer_store_dword v3, off, s[64:67], 0 offset:4
+; GFX7LESS-DPP-NEXT:    buffer_store_dword v2, off, s[64:67], 0
 ; GFX7LESS-DPP-NEXT:    s_add_u32 s8, s36, 44
 ; GFX7LESS-DPP-NEXT:    v_min_f64 v[0:1], v[0:1], v[41:42]
 ; GFX7LESS-DPP-NEXT:    s_addc_u32 s9, s37, 0
 ; GFX7LESS-DPP-NEXT:    s_getpc_b64 s[0:1]
 ; GFX7LESS-DPP-NEXT:    s_add_u32 s0, s0, __atomic_compare_exchange at gotpcrel32@lo+4
 ; GFX7LESS-DPP-NEXT:    s_addc_u32 s1, s1, __atomic_compare_exchange at gotpcrel32@hi+12
-; GFX7LESS-DPP-NEXT:    buffer_store_dword v1, off, s[48:51], 0 offset:12
-; GFX7LESS-DPP-NEXT:    buffer_store_dword v0, off, s[48:51], 0 offset:8
+; GFX7LESS-DPP-NEXT:    buffer_store_dword v1, off, s[64:67], 0 offset:12
+; GFX7LESS-DPP-NEXT:    buffer_store_dword v0, off, s[64:67], 0 offset:8
 ; GFX7LESS-DPP-NEXT:    s_load_dwordx2 s[16:17], s[0:1], 0x0
 ; GFX7LESS-DPP-NEXT:    s_waitcnt expcnt(0)
 ; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v0, 8
@@ -8273,43 +8273,43 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau
 ; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v5, 8
 ; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v7, 0
-; GFX7LESS-DPP-NEXT:    s_mov_b64 s[4:5], s[40:41]
+; GFX7LESS-DPP-NEXT:    s_mov_b64 s[4:5], s[48:49]
 ; GFX7LESS-DPP-NEXT:    s_mov_b64 s[6:7], s[38:39]
 ; GFX7LESS-DPP-NEXT:    s_mov_b64 s[10:11], s[34:35]
-; GFX7LESS-DPP-NEXT:    s_mov_b32 s12, s43
-; GFX7LESS-DPP-NEXT:    s_mov_b32 s13, s42
+; GFX7LESS-DPP-NEXT:    s_mov_b32 s12, s51
+; GFX7LESS-DPP-NEXT:    s_mov_b32 s13, s50
 ; GFX7LESS-DPP-NEXT:    s_mov_b32 s14, s33
 ; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v31, v40
-; GFX7LESS-DPP-NEXT:    s_mov_b64 s[0:1], s[48:49]
-; GFX7LESS-DPP-NEXT:    s_mov_b64 s[2:3], s[50:51]
-; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v2, s44
-; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v3, s45
+; GFX7LESS-DPP-NEXT:    s_mov_b64 s[0:1], s[64:65]
+; GFX7LESS-DPP-NEXT:    s_mov_b64 s[2:3], s[66:67]
+; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v2, s52
+; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v3, s53
 ; GFX7LESS-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7LESS-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GFX7LESS-DPP-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX7LESS-DPP-NEXT:    buffer_load_dword v2, off, s[48:51], 0
-; GFX7LESS-DPP-NEXT:    buffer_load_dword v3, off, s[48:51], 0 offset:4
+; GFX7LESS-DPP-NEXT:    buffer_load_dword v2, off, s[64:67], 0
+; GFX7LESS-DPP-NEXT:    buffer_load_dword v3, off, s[64:67], 0 offset:4
 ; GFX7LESS-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX7LESS-DPP-NEXT:    s_or_b64 s[46:47], vcc, s[46:47]
-; GFX7LESS-DPP-NEXT:    s_andn2_b64 exec, exec, s[46:47]
+; GFX7LESS-DPP-NEXT:    s_or_b64 s[54:55], vcc, s[54:55]
+; GFX7LESS-DPP-NEXT:    s_andn2_b64 exec, exec, s[54:55]
 ; GFX7LESS-DPP-NEXT:    s_cbranch_execnz .LBB11_1
 ; GFX7LESS-DPP-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX7LESS-DPP-NEXT:    s_endpgm
 ;
 ; GFX9-DPP-LABEL: global_atomic_fmin_double_uni_address_div_value_default_scope_unsafe:
 ; GFX9-DPP:       ; %bb.0:
-; GFX9-DPP-NEXT:    s_mov_b32 s52, SCRATCH_RSRC_DWORD0
-; GFX9-DPP-NEXT:    s_mov_b32 s53, SCRATCH_RSRC_DWORD1
-; GFX9-DPP-NEXT:    s_mov_b32 s54, -1
-; GFX9-DPP-NEXT:    s_mov_b32 s55, 0xe00000
-; GFX9-DPP-NEXT:    s_add_u32 s52, s52, s11
-; GFX9-DPP-NEXT:    s_addc_u32 s53, s53, 0
+; GFX9-DPP-NEXT:    s_mov_b32 s68, SCRATCH_RSRC_DWORD0
+; GFX9-DPP-NEXT:    s_mov_b32 s69, SCRATCH_RSRC_DWORD1
+; GFX9-DPP-NEXT:    s_mov_b32 s70, -1
+; GFX9-DPP-NEXT:    s_mov_b32 s71, 0xe00000
+; GFX9-DPP-NEXT:    s_add_u32 s68, s68, s11
+; GFX9-DPP-NEXT:    s_addc_u32 s69, s69, 0
 ; GFX9-DPP-NEXT:    s_mov_b64 s[36:37], s[4:5]
-; GFX9-DPP-NEXT:    s_mov_b32 s43, s8
+; GFX9-DPP-NEXT:    s_mov_b32 s51, s8
 ; GFX9-DPP-NEXT:    s_add_u32 s8, s36, 44
-; GFX9-DPP-NEXT:    s_mov_b32 s42, s9
+; GFX9-DPP-NEXT:    s_mov_b32 s50, s9
 ; GFX9-DPP-NEXT:    s_addc_u32 s9, s37, 0
-; GFX9-DPP-NEXT:    s_mov_b64 s[40:41], s[0:1]
+; GFX9-DPP-NEXT:    s_mov_b64 s[48:49], s[0:1]
 ; GFX9-DPP-NEXT:    s_getpc_b64 s[0:1]
 ; GFX9-DPP-NEXT:    s_add_u32 s0, s0, div.double.value at gotpcrel32@lo+4
 ; GFX9-DPP-NEXT:    s_addc_u32 s1, s1, div.double.value at gotpcrel32@hi+12
@@ -8321,14 +8321,14 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau
 ; GFX9-DPP-NEXT:    s_mov_b64 s[38:39], s[2:3]
 ; GFX9-DPP-NEXT:    v_or3_b32 v40, v0, v1, v2
 ; GFX9-DPP-NEXT:    s_mov_b64 s[6:7], s[2:3]
-; GFX9-DPP-NEXT:    s_mov_b64 s[0:1], s[52:53]
-; GFX9-DPP-NEXT:    s_mov_b64 s[4:5], s[40:41]
+; GFX9-DPP-NEXT:    s_mov_b64 s[0:1], s[68:69]
+; GFX9-DPP-NEXT:    s_mov_b64 s[4:5], s[48:49]
 ; GFX9-DPP-NEXT:    s_mov_b64 s[10:11], s[34:35]
-; GFX9-DPP-NEXT:    s_mov_b32 s12, s43
-; GFX9-DPP-NEXT:    s_mov_b32 s13, s42
+; GFX9-DPP-NEXT:    s_mov_b32 s12, s51
+; GFX9-DPP-NEXT:    s_mov_b32 s13, s50
 ; GFX9-DPP-NEXT:    s_mov_b32 s14, s33
 ; GFX9-DPP-NEXT:    v_mov_b32_e32 v31, v40
-; GFX9-DPP-NEXT:    s_mov_b64 s[2:3], s[54:55]
+; GFX9-DPP-NEXT:    s_mov_b64 s[2:3], s[70:71]
 ; GFX9-DPP-NEXT:    s_movk_i32 s32, 0x800
 ; GFX9-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
@@ -8382,20 +8382,20 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau
 ; GFX9-DPP-NEXT:    v_mbcnt_lo_u32_b32 v1, exec_lo, 0
 ; GFX9-DPP-NEXT:    v_mbcnt_hi_u32_b32 v1, exec_hi, v1
 ; GFX9-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
-; GFX9-DPP-NEXT:    v_readlane_b32 s45, v9, 63
-; GFX9-DPP-NEXT:    v_readlane_b32 s44, v8, 63
+; GFX9-DPP-NEXT:    v_readlane_b32 s53, v9, 63
+; GFX9-DPP-NEXT:    v_readlane_b32 s52, v8, 63
 ; GFX9-DPP-NEXT:    s_mov_b64 exec, s[0:1]
 ; GFX9-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
 ; GFX9-DPP-NEXT:    s_and_saveexec_b64 s[0:1], vcc
 ; GFX9-DPP-NEXT:    s_cbranch_execz .LBB11_3
 ; GFX9-DPP-NEXT:  ; %bb.1:
-; GFX9-DPP-NEXT:    s_load_dwordx2 s[46:47], s[36:37], 0x24
-; GFX9-DPP-NEXT:    s_mov_b64 s[48:49], 0
+; GFX9-DPP-NEXT:    s_load_dwordx2 s[54:55], s[36:37], 0x24
+; GFX9-DPP-NEXT:    s_mov_b64 s[64:65], 0
 ; GFX9-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-DPP-NEXT:    global_load_dwordx2 v[1:2], v0, s[46:47]
+; GFX9-DPP-NEXT:    global_load_dwordx2 v[1:2], v0, s[54:55]
 ; GFX9-DPP-NEXT:  .LBB11_2: ; %atomicrmw.start
 ; GFX9-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX9-DPP-NEXT:    v_max_f64 v[3:4], s[44:45], s[44:45]
+; GFX9-DPP-NEXT:    v_max_f64 v[3:4], s[52:53], s[52:53]
 ; GFX9-DPP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-DPP-NEXT:    v_max_f64 v[5:6], v[1:2], v[1:2]
 ; GFX9-DPP-NEXT:    s_add_u32 s8, s36, 44
@@ -8404,54 +8404,54 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau
 ; GFX9-DPP-NEXT:    s_add_u32 s0, s0, __atomic_compare_exchange at gotpcrel32@lo+4
 ; GFX9-DPP-NEXT:    s_addc_u32 s1, s1, __atomic_compare_exchange at gotpcrel32@hi+12
 ; GFX9-DPP-NEXT:    s_load_dwordx2 s[16:17], s[0:1], 0x0
-; GFX9-DPP-NEXT:    s_mov_b64 s[0:1], s[52:53]
-; GFX9-DPP-NEXT:    buffer_store_dword v2, off, s[52:55], 0 offset:4
-; GFX9-DPP-NEXT:    buffer_store_dword v1, off, s[52:55], 0
-; GFX9-DPP-NEXT:    s_mov_b64 s[4:5], s[40:41]
+; GFX9-DPP-NEXT:    s_mov_b64 s[0:1], s[68:69]
+; GFX9-DPP-NEXT:    buffer_store_dword v2, off, s[68:71], 0 offset:4
+; GFX9-DPP-NEXT:    buffer_store_dword v1, off, s[68:71], 0
+; GFX9-DPP-NEXT:    s_mov_b64 s[4:5], s[48:49]
 ; GFX9-DPP-NEXT:    v_min_f64 v[3:4], v[5:6], v[3:4]
 ; GFX9-DPP-NEXT:    s_mov_b64 s[6:7], s[38:39]
 ; GFX9-DPP-NEXT:    s_mov_b64 s[10:11], s[34:35]
-; GFX9-DPP-NEXT:    s_mov_b32 s12, s43
-; GFX9-DPP-NEXT:    s_mov_b32 s13, s42
+; GFX9-DPP-NEXT:    s_mov_b32 s12, s51
+; GFX9-DPP-NEXT:    s_mov_b32 s13, s50
 ; GFX9-DPP-NEXT:    s_mov_b32 s14, s33
 ; GFX9-DPP-NEXT:    v_mov_b32_e32 v31, v40
-; GFX9-DPP-NEXT:    buffer_store_dword v4, off, s[52:55], 0 offset:12
-; GFX9-DPP-NEXT:    buffer_store_dword v3, off, s[52:55], 0 offset:8
-; GFX9-DPP-NEXT:    s_mov_b64 s[2:3], s[54:55]
+; GFX9-DPP-NEXT:    buffer_store_dword v4, off, s[68:71], 0 offset:12
+; GFX9-DPP-NEXT:    buffer_store_dword v3, off, s[68:71], 0 offset:8
+; GFX9-DPP-NEXT:    s_mov_b64 s[2:3], s[70:71]
 ; GFX9-DPP-NEXT:    v_mov_b32_e32 v0, 8
 ; GFX9-DPP-NEXT:    v_mov_b32_e32 v1, 0
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v2, s46
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v3, s47
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v2, s54
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v3, s55
 ; GFX9-DPP-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX9-DPP-NEXT:    v_mov_b32_e32 v5, 8
 ; GFX9-DPP-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX9-DPP-NEXT:    v_mov_b32_e32 v7, 0
 ; GFX9-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
-; GFX9-DPP-NEXT:    buffer_load_dword v1, off, s[52:55], 0
-; GFX9-DPP-NEXT:    buffer_load_dword v2, off, s[52:55], 0 offset:4
+; GFX9-DPP-NEXT:    buffer_load_dword v1, off, s[68:71], 0
+; GFX9-DPP-NEXT:    buffer_load_dword v2, off, s[68:71], 0 offset:4
 ; GFX9-DPP-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX9-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX9-DPP-NEXT:    s_or_b64 s[48:49], vcc, s[48:49]
-; GFX9-DPP-NEXT:    s_andn2_b64 exec, exec, s[48:49]
+; GFX9-DPP-NEXT:    s_or_b64 s[64:65], vcc, s[64:65]
+; GFX9-DPP-NEXT:    s_andn2_b64 exec, exec, s[64:65]
 ; GFX9-DPP-NEXT:    s_cbranch_execnz .LBB11_2
 ; GFX9-DPP-NEXT:  .LBB11_3:
 ; GFX9-DPP-NEXT:    s_endpgm
 ;
 ; GFX1064-DPP-LABEL: global_atomic_fmin_double_uni_address_div_value_default_scope_unsafe:
 ; GFX1064-DPP:       ; %bb.0:
-; GFX1064-DPP-NEXT:    s_mov_b32 s48, SCRATCH_RSRC_DWORD0
-; GFX1064-DPP-NEXT:    s_mov_b32 s49, SCRATCH_RSRC_DWORD1
-; GFX1064-DPP-NEXT:    s_mov_b32 s50, -1
-; GFX1064-DPP-NEXT:    s_mov_b32 s51, 0x31e16000
-; GFX1064-DPP-NEXT:    s_add_u32 s48, s48, s11
+; GFX1064-DPP-NEXT:    s_mov_b32 s64, SCRATCH_RSRC_DWORD0
+; GFX1064-DPP-NEXT:    s_mov_b32 s65, SCRATCH_RSRC_DWORD1
+; GFX1064-DPP-NEXT:    s_mov_b32 s66, -1
+; GFX1064-DPP-NEXT:    s_mov_b32 s67, 0x31e16000
+; GFX1064-DPP-NEXT:    s_add_u32 s64, s64, s11
 ; GFX1064-DPP-NEXT:    s_mov_b64 s[34:35], s[4:5]
-; GFX1064-DPP-NEXT:    s_addc_u32 s49, s49, 0
-; GFX1064-DPP-NEXT:    s_mov_b32 s43, s8
+; GFX1064-DPP-NEXT:    s_addc_u32 s65, s65, 0
+; GFX1064-DPP-NEXT:    s_mov_b32 s51, s8
 ; GFX1064-DPP-NEXT:    s_add_u32 s8, s34, 44
-; GFX1064-DPP-NEXT:    s_mov_b32 s42, s9
+; GFX1064-DPP-NEXT:    s_mov_b32 s50, s9
 ; GFX1064-DPP-NEXT:    s_addc_u32 s9, s35, 0
-; GFX1064-DPP-NEXT:    s_mov_b64 s[40:41], s[0:1]
+; GFX1064-DPP-NEXT:    s_mov_b64 s[48:49], s[0:1]
 ; GFX1064-DPP-NEXT:    s_getpc_b64 s[0:1]
 ; GFX1064-DPP-NEXT:    s_add_u32 s0, s0, div.double.value at gotpcrel32@lo+4
 ; GFX1064-DPP-NEXT:    s_addc_u32 s1, s1, div.double.value at gotpcrel32@hi+12
@@ -8463,14 +8463,14 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau
 ; GFX1064-DPP-NEXT:    s_mov_b64 s[38:39], s[2:3]
 ; GFX1064-DPP-NEXT:    s_mov_b64 s[6:7], s[2:3]
 ; GFX1064-DPP-NEXT:    v_or3_b32 v40, v0, v1, v2
-; GFX1064-DPP-NEXT:    s_mov_b64 s[0:1], s[48:49]
-; GFX1064-DPP-NEXT:    s_mov_b64 s[4:5], s[40:41]
+; GFX1064-DPP-NEXT:    s_mov_b64 s[0:1], s[64:65]
+; GFX1064-DPP-NEXT:    s_mov_b64 s[4:5], s[48:49]
 ; GFX1064-DPP-NEXT:    s_mov_b64 s[10:11], s[36:37]
-; GFX1064-DPP-NEXT:    s_mov_b32 s12, s43
+; GFX1064-DPP-NEXT:    s_mov_b32 s12, s51
 ; GFX1064-DPP-NEXT:    v_mov_b32_e32 v31, v40
-; GFX1064-DPP-NEXT:    s_mov_b32 s13, s42
+; GFX1064-DPP-NEXT:    s_mov_b32 s13, s50
 ; GFX1064-DPP-NEXT:    s_mov_b32 s14, s33
-; GFX1064-DPP-NEXT:    s_mov_b64 s[2:3], s[50:51]
+; GFX1064-DPP-NEXT:    s_mov_b64 s[2:3], s[66:67]
 ; GFX1064-DPP-NEXT:    s_movk_i32 s32, 0x800
 ; GFX1064-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1064-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
@@ -8523,10 +8523,10 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau
 ; GFX1064-DPP-NEXT:    s_and_saveexec_b64 s[0:1], vcc
 ; GFX1064-DPP-NEXT:    s_cbranch_execz .LBB11_3
 ; GFX1064-DPP-NEXT:  ; %bb.1:
-; GFX1064-DPP-NEXT:    s_load_dwordx2 s[44:45], s[34:35], 0x24
-; GFX1064-DPP-NEXT:    s_mov_b64 s[46:47], 0
+; GFX1064-DPP-NEXT:    s_load_dwordx2 s[52:53], s[34:35], 0x24
+; GFX1064-DPP-NEXT:    s_mov_b64 s[54:55], 0
 ; GFX1064-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1064-DPP-NEXT:    global_load_dwordx2 v[1:2], v0, s[44:45]
+; GFX1064-DPP-NEXT:    global_load_dwordx2 v[1:2], v0, s[52:53]
 ; GFX1064-DPP-NEXT:  .LBB11_2: ; %atomicrmw.start
 ; GFX1064-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1064-DPP-NEXT:    v_max_f64 v[3:4], v[41:42], v[41:42]
@@ -8537,56 +8537,56 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau
 ; GFX1064-DPP-NEXT:    s_getpc_b64 s[0:1]
 ; GFX1064-DPP-NEXT:    s_add_u32 s0, s0, __atomic_compare_exchange at gotpcrel32@lo+4
 ; GFX1064-DPP-NEXT:    s_addc_u32 s1, s1, __atomic_compare_exchange at gotpcrel32@hi+12
-; GFX1064-DPP-NEXT:    buffer_store_dword v2, off, s[48:51], 0 offset:4
-; GFX1064-DPP-NEXT:    buffer_store_dword v1, off, s[48:51], 0
+; GFX1064-DPP-NEXT:    buffer_store_dword v2, off, s[64:67], 0 offset:4
+; GFX1064-DPP-NEXT:    buffer_store_dword v1, off, s[64:67], 0
 ; GFX1064-DPP-NEXT:    s_load_dwordx2 s[16:17], s[0:1], 0x0
 ; GFX1064-DPP-NEXT:    v_mov_b32_e32 v31, v40
 ; GFX1064-DPP-NEXT:    v_mov_b32_e32 v0, 8
 ; GFX1064-DPP-NEXT:    v_mov_b32_e32 v1, 0
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v2, s44
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v2, s52
 ; GFX1064-DPP-NEXT:    v_mov_b32_e32 v7, 0
-; GFX1064-DPP-NEXT:    s_mov_b64 s[0:1], s[48:49]
-; GFX1064-DPP-NEXT:    s_mov_b64 s[4:5], s[40:41]
+; GFX1064-DPP-NEXT:    s_mov_b64 s[0:1], s[64:65]
+; GFX1064-DPP-NEXT:    s_mov_b64 s[4:5], s[48:49]
 ; GFX1064-DPP-NEXT:    s_mov_b64 s[6:7], s[38:39]
 ; GFX1064-DPP-NEXT:    s_mov_b64 s[10:11], s[36:37]
-; GFX1064-DPP-NEXT:    s_mov_b32 s12, s43
-; GFX1064-DPP-NEXT:    s_mov_b32 s13, s42
+; GFX1064-DPP-NEXT:    s_mov_b32 s12, s51
+; GFX1064-DPP-NEXT:    s_mov_b32 s13, s50
 ; GFX1064-DPP-NEXT:    s_mov_b32 s14, s33
-; GFX1064-DPP-NEXT:    s_mov_b64 s[2:3], s[50:51]
+; GFX1064-DPP-NEXT:    s_mov_b64 s[2:3], s[66:67]
 ; GFX1064-DPP-NEXT:    v_min_f64 v[3:4], v[5:6], v[3:4]
 ; GFX1064-DPP-NEXT:    v_mov_b32_e32 v5, 8
 ; GFX1064-DPP-NEXT:    v_mov_b32_e32 v6, 0
-; GFX1064-DPP-NEXT:    buffer_store_dword v4, off, s[48:51], 0 offset:12
-; GFX1064-DPP-NEXT:    buffer_store_dword v3, off, s[48:51], 0 offset:8
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v3, s45
+; GFX1064-DPP-NEXT:    buffer_store_dword v4, off, s[64:67], 0 offset:12
+; GFX1064-DPP-NEXT:    buffer_store_dword v3, off, s[64:67], 0 offset:8
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v3, s53
 ; GFX1064-DPP-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX1064-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1064-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GFX1064-DPP-NEXT:    s_clause 0x1
-; GFX1064-DPP-NEXT:    buffer_load_dword v1, off, s[48:51], 0
-; GFX1064-DPP-NEXT:    buffer_load_dword v2, off, s[48:51], 0 offset:4
+; GFX1064-DPP-NEXT:    buffer_load_dword v1, off, s[64:67], 0
+; GFX1064-DPP-NEXT:    buffer_load_dword v2, off, s[64:67], 0 offset:4
 ; GFX1064-DPP-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX1064-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX1064-DPP-NEXT:    s_or_b64 s[46:47], vcc, s[46:47]
-; GFX1064-DPP-NEXT:    s_andn2_b64 exec, exec, s[46:47]
+; GFX1064-DPP-NEXT:    s_or_b64 s[54:55], vcc, s[54:55]
+; GFX1064-DPP-NEXT:    s_andn2_b64 exec, exec, s[54:55]
 ; GFX1064-DPP-NEXT:    s_cbranch_execnz .LBB11_2
 ; GFX1064-DPP-NEXT:  .LBB11_3:
 ; GFX1064-DPP-NEXT:    s_endpgm
 ;
 ; GFX1032-DPP-LABEL: global_atomic_fmin_double_uni_address_div_value_default_scope_unsafe:
 ; GFX1032-DPP:       ; %bb.0:
-; GFX1032-DPP-NEXT:    s_mov_b32 s48, SCRATCH_RSRC_DWORD0
-; GFX1032-DPP-NEXT:    s_mov_b32 s49, SCRATCH_RSRC_DWORD1
-; GFX1032-DPP-NEXT:    s_mov_b32 s50, -1
-; GFX1032-DPP-NEXT:    s_mov_b32 s51, 0x31c16000
-; GFX1032-DPP-NEXT:    s_add_u32 s48, s48, s11
+; GFX1032-DPP-NEXT:    s_mov_b32 s64, SCRATCH_RSRC_DWORD0
+; GFX1032-DPP-NEXT:    s_mov_b32 s65, SCRATCH_RSRC_DWORD1
+; GFX1032-DPP-NEXT:    s_mov_b32 s66, -1
+; GFX1032-DPP-NEXT:    s_mov_b32 s67, 0x31c16000
+; GFX1032-DPP-NEXT:    s_add_u32 s64, s64, s11
 ; GFX1032-DPP-NEXT:    s_mov_b64 s[34:35], s[4:5]
-; GFX1032-DPP-NEXT:    s_addc_u32 s49, s49, 0
-; GFX1032-DPP-NEXT:    s_mov_b32 s43, s8
+; GFX1032-DPP-NEXT:    s_addc_u32 s65, s65, 0
+; GFX1032-DPP-NEXT:    s_mov_b32 s51, s8
 ; GFX1032-DPP-NEXT:    s_add_u32 s8, s34, 44
-; GFX1032-DPP-NEXT:    s_mov_b32 s42, s9
+; GFX1032-DPP-NEXT:    s_mov_b32 s50, s9
 ; GFX1032-DPP-NEXT:    s_addc_u32 s9, s35, 0
-; GFX1032-DPP-NEXT:    s_mov_b64 s[40:41], s[0:1]
+; GFX1032-DPP-NEXT:    s_mov_b64 s[48:49], s[0:1]
 ; GFX1032-DPP-NEXT:    s_getpc_b64 s[0:1]
 ; GFX1032-DPP-NEXT:    s_add_u32 s0, s0, div.double.value at gotpcrel32@lo+4
 ; GFX1032-DPP-NEXT:    s_addc_u32 s1, s1, div.double.value at gotpcrel32@hi+12
@@ -8598,14 +8598,14 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau
 ; GFX1032-DPP-NEXT:    s_mov_b64 s[38:39], s[2:3]
 ; GFX1032-DPP-NEXT:    s_mov_b64 s[6:7], s[2:3]
 ; GFX1032-DPP-NEXT:    v_or3_b32 v40, v0, v1, v2
-; GFX1032-DPP-NEXT:    s_mov_b64 s[0:1], s[48:49]
-; GFX1032-DPP-NEXT:    s_mov_b64 s[4:5], s[40:41]
+; GFX1032-DPP-NEXT:    s_mov_b64 s[0:1], s[64:65]
+; GFX1032-DPP-NEXT:    s_mov_b64 s[4:5], s[48:49]
 ; GFX1032-DPP-NEXT:    s_mov_b64 s[10:11], s[36:37]
-; GFX1032-DPP-NEXT:    s_mov_b32 s12, s43
+; GFX1032-DPP-NEXT:    s_mov_b32 s12, s51
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v31, v40
-; GFX1032-DPP-NEXT:    s_mov_b32 s13, s42
+; GFX1032-DPP-NEXT:    s_mov_b32 s13, s50
 ; GFX1032-DPP-NEXT:    s_mov_b32 s14, s33
-; GFX1032-DPP-NEXT:    s_mov_b64 s[2:3], s[50:51]
+; GFX1032-DPP-NEXT:    s_mov_b64 s[2:3], s[66:67]
 ; GFX1032-DPP-NEXT:    s_movk_i32 s32, 0x400
 ; GFX1032-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1032-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
@@ -8646,15 +8646,15 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v3, v8
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v4, v9
-; GFX1032-DPP-NEXT:    s_mov_b32 s46, 0
+; GFX1032-DPP-NEXT:    s_mov_b32 s54, 0
 ; GFX1032-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
 ; GFX1032-DPP-NEXT:    s_and_saveexec_b32 s0, vcc_lo
 ; GFX1032-DPP-NEXT:    s_cbranch_execz .LBB11_3
 ; GFX1032-DPP-NEXT:  ; %bb.1:
-; GFX1032-DPP-NEXT:    s_load_dwordx2 s[44:45], s[34:35], 0x24
+; GFX1032-DPP-NEXT:    s_load_dwordx2 s[52:53], s[34:35], 0x24
 ; GFX1032-DPP-NEXT:    v_max_f64 v[41:42], v[3:4], v[3:4]
 ; GFX1032-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT:    global_load_dwordx2 v[1:2], v0, s[44:45]
+; GFX1032-DPP-NEXT:    global_load_dwordx2 v[1:2], v0, s[52:53]
 ; GFX1032-DPP-NEXT:  .LBB11_2: ; %atomicrmw.start
 ; GFX1032-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1032-DPP-NEXT:    s_waitcnt vmcnt(0)
@@ -8664,38 +8664,38 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau
 ; GFX1032-DPP-NEXT:    s_getpc_b64 s[0:1]
 ; GFX1032-DPP-NEXT:    s_add_u32 s0, s0, __atomic_compare_exchange at gotpcrel32@lo+4
 ; GFX1032-DPP-NEXT:    s_addc_u32 s1, s1, __atomic_compare_exchange at gotpcrel32@hi+12
-; GFX1032-DPP-NEXT:    buffer_store_dword v2, off, s[48:51], 0 offset:4
-; GFX1032-DPP-NEXT:    buffer_store_dword v1, off, s[48:51], 0
+; GFX1032-DPP-NEXT:    buffer_store_dword v2, off, s[64:67], 0 offset:4
+; GFX1032-DPP-NEXT:    buffer_store_dword v1, off, s[64:67], 0
 ; GFX1032-DPP-NEXT:    s_load_dwordx2 s[16:17], s[0:1], 0x0
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v31, v40
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v0, 8
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v1, 0
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v2, s44
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v2, s52
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v5, 8
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v7, 0
-; GFX1032-DPP-NEXT:    s_mov_b64 s[0:1], s[48:49]
-; GFX1032-DPP-NEXT:    s_mov_b64 s[4:5], s[40:41]
+; GFX1032-DPP-NEXT:    s_mov_b64 s[0:1], s[64:65]
+; GFX1032-DPP-NEXT:    s_mov_b64 s[4:5], s[48:49]
 ; GFX1032-DPP-NEXT:    s_mov_b64 s[6:7], s[38:39]
 ; GFX1032-DPP-NEXT:    s_mov_b64 s[10:11], s[36:37]
-; GFX1032-DPP-NEXT:    s_mov_b32 s12, s43
-; GFX1032-DPP-NEXT:    s_mov_b32 s13, s42
+; GFX1032-DPP-NEXT:    s_mov_b32 s12, s51
+; GFX1032-DPP-NEXT:    s_mov_b32 s13, s50
 ; GFX1032-DPP-NEXT:    s_mov_b32 s14, s33
-; GFX1032-DPP-NEXT:    s_mov_b64 s[2:3], s[50:51]
+; GFX1032-DPP-NEXT:    s_mov_b64 s[2:3], s[66:67]
 ; GFX1032-DPP-NEXT:    v_min_f64 v[3:4], v[3:4], v[41:42]
-; GFX1032-DPP-NEXT:    buffer_store_dword v4, off, s[48:51], 0 offset:12
-; GFX1032-DPP-NEXT:    buffer_store_dword v3, off, s[48:51], 0 offset:8
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v3, s45
+; GFX1032-DPP-NEXT:    buffer_store_dword v4, off, s[64:67], 0 offset:12
+; GFX1032-DPP-NEXT:    buffer_store_dword v3, off, s[64:67], 0 offset:8
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v3, s53
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX1032-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1032-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GFX1032-DPP-NEXT:    s_clause 0x1
-; GFX1032-DPP-NEXT:    buffer_load_dword v1, off, s[48:51], 0
-; GFX1032-DPP-NEXT:    buffer_load_dword v2, off, s[48:51], 0 offset:4
+; GFX1032-DPP-NEXT:    buffer_load_dword v1, off, s[64:67], 0
+; GFX1032-DPP-NEXT:    buffer_load_dword v2, off, s[64:67], 0 offset:4
 ; GFX1032-DPP-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX1032-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX1032-DPP-NEXT:    s_or_b32 s46, vcc_lo, s46
-; GFX1032-DPP-NEXT:    s_andn2_b32 exec_lo, exec_lo, s46
+; GFX1032-DPP-NEXT:    s_or_b32 s54, vcc_lo, s54
+; GFX1032-DPP-NEXT:    s_andn2_b32 exec_lo, exec_lo, s54
 ; GFX1032-DPP-NEXT:    s_cbranch_execnz .LBB11_2
 ; GFX1032-DPP-NEXT:  .LBB11_3:
 ; GFX1032-DPP-NEXT:    s_endpgm
@@ -8703,11 +8703,11 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau
 ; GFX1164-DPP-LABEL: global_atomic_fmin_double_uni_address_div_value_default_scope_unsafe:
 ; GFX1164-DPP:       ; %bb.0:
 ; GFX1164-DPP-NEXT:    s_mov_b64 s[34:35], s[4:5]
-; GFX1164-DPP-NEXT:    s_mov_b32 s43, s8
+; GFX1164-DPP-NEXT:    s_mov_b32 s51, s8
 ; GFX1164-DPP-NEXT:    s_add_u32 s8, s34, 44
-; GFX1164-DPP-NEXT:    s_mov_b32 s42, s9
+; GFX1164-DPP-NEXT:    s_mov_b32 s50, s9
 ; GFX1164-DPP-NEXT:    s_addc_u32 s9, s35, 0
-; GFX1164-DPP-NEXT:    s_mov_b64 s[40:41], s[0:1]
+; GFX1164-DPP-NEXT:    s_mov_b64 s[48:49], s[0:1]
 ; GFX1164-DPP-NEXT:    s_getpc_b64 s[0:1]
 ; GFX1164-DPP-NEXT:    s_add_u32 s0, s0, div.double.value at gotpcrel32@lo+4
 ; GFX1164-DPP-NEXT:    s_addc_u32 s1, s1, div.double.value at gotpcrel32@hi+12
@@ -8715,11 +8715,11 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau
 ; GFX1164-DPP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
 ; GFX1164-DPP-NEXT:    s_mov_b32 s33, s10
 ; GFX1164-DPP-NEXT:    s_mov_b64 s[36:37], s[6:7]
-; GFX1164-DPP-NEXT:    s_mov_b64 s[4:5], s[40:41]
+; GFX1164-DPP-NEXT:    s_mov_b64 s[4:5], s[48:49]
 ; GFX1164-DPP-NEXT:    s_mov_b64 s[6:7], s[2:3]
 ; GFX1164-DPP-NEXT:    s_mov_b64 s[10:11], s[36:37]
-; GFX1164-DPP-NEXT:    s_mov_b32 s12, s43
-; GFX1164-DPP-NEXT:    s_mov_b32 s13, s42
+; GFX1164-DPP-NEXT:    s_mov_b32 s12, s51
+; GFX1164-DPP-NEXT:    s_mov_b32 s13, s50
 ; GFX1164-DPP-NEXT:    s_mov_b32 s14, s33
 ; GFX1164-DPP-NEXT:    s_mov_b32 s32, 32
 ; GFX1164-DPP-NEXT:    v_mov_b32_e32 v40, v0
@@ -8787,11 +8787,11 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau
 ; GFX1164-DPP-NEXT:    v_cmpx_eq_u32_e32 0, v1
 ; GFX1164-DPP-NEXT:    s_cbranch_execz .LBB11_3
 ; GFX1164-DPP-NEXT:  ; %bb.1:
-; GFX1164-DPP-NEXT:    s_load_b64 s[44:45], s[34:35], 0x24
+; GFX1164-DPP-NEXT:    s_load_b64 s[52:53], s[34:35], 0x24
 ; GFX1164-DPP-NEXT:    v_max_f64 v[41:42], v[3:4], v[3:4]
-; GFX1164-DPP-NEXT:    s_mov_b64 s[46:47], 0
+; GFX1164-DPP-NEXT:    s_mov_b64 s[54:55], 0
 ; GFX1164-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT:    global_load_b64 v[1:2], v0, s[44:45]
+; GFX1164-DPP-NEXT:    global_load_b64 v[1:2], v0, s[52:53]
 ; GFX1164-DPP-NEXT:    s_set_inst_prefetch_distance 0x1
 ; GFX1164-DPP-NEXT:    .p2align 6
 ; GFX1164-DPP-NEXT:  .LBB11_2: ; %atomicrmw.start
@@ -8809,18 +8809,18 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau
 ; GFX1164-DPP-NEXT:    v_mov_b32_e32 v5, 8
 ; GFX1164-DPP-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX1164-DPP-NEXT:    v_mov_b32_e32 v7, 0
-; GFX1164-DPP-NEXT:    s_mov_b64 s[4:5], s[40:41]
+; GFX1164-DPP-NEXT:    s_mov_b64 s[4:5], s[48:49]
 ; GFX1164-DPP-NEXT:    s_mov_b64 s[6:7], s[38:39]
 ; GFX1164-DPP-NEXT:    s_mov_b64 s[10:11], s[36:37]
-; GFX1164-DPP-NEXT:    s_mov_b32 s12, s43
-; GFX1164-DPP-NEXT:    s_mov_b32 s13, s42
+; GFX1164-DPP-NEXT:    s_mov_b32 s12, s51
+; GFX1164-DPP-NEXT:    s_mov_b32 s13, s50
 ; GFX1164-DPP-NEXT:    s_mov_b32 s14, s33
 ; GFX1164-DPP-NEXT:    v_min_f64 v[3:4], v[3:4], v[41:42]
 ; GFX1164-DPP-NEXT:    scratch_store_b64 off, v[1:2], off
 ; GFX1164-DPP-NEXT:    v_mov_b32_e32 v1, 0
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v2, s44
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v2, s52
 ; GFX1164-DPP-NEXT:    scratch_store_b64 off, v[3:4], off offset:8
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v3, s45
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v3, s53
 ; GFX1164-DPP-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX1164-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1164-DPP-NEXT:    s_swappc_b64 s[30:31], s[0:1]
@@ -8828,8 +8828,8 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau
 ; GFX1164-DPP-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
 ; GFX1164-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX1164-DPP-NEXT:    s_or_b64 s[46:47], vcc, s[46:47]
-; GFX1164-DPP-NEXT:    s_and_not1_b64 exec, exec, s[46:47]
+; GFX1164-DPP-NEXT:    s_or_b64 s[54:55], vcc, s[54:55]
+; GFX1164-DPP-NEXT:    s_and_not1_b64 exec, exec, s[54:55]
 ; GFX1164-DPP-NEXT:    s_cbranch_execnz .LBB11_2
 ; GFX1164-DPP-NEXT:  .LBB11_3:
 ; GFX1164-DPP-NEXT:    s_set_inst_prefetch_distance 0x2
@@ -8838,7 +8838,7 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau
 ; GFX1132-DPP-LABEL: global_atomic_fmin_double_uni_address_div_value_default_scope_unsafe:
 ; GFX1132-DPP:       ; %bb.0:
 ; GFX1132-DPP-NEXT:    s_mov_b64 s[34:35], s[4:5]
-; GFX1132-DPP-NEXT:    s_mov_b64 s[40:41], s[0:1]
+; GFX1132-DPP-NEXT:    s_mov_b64 s[48:49], s[0:1]
 ; GFX1132-DPP-NEXT:    s_add_u32 s8, s34, 44
 ; GFX1132-DPP-NEXT:    s_addc_u32 s9, s35, 0
 ; GFX1132-DPP-NEXT:    s_getpc_b64 s[0:1]
@@ -8847,9 +8847,9 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau
 ; GFX1132-DPP-NEXT:    v_mov_b32_e32 v31, v0
 ; GFX1132-DPP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
 ; GFX1132-DPP-NEXT:    s_mov_b64 s[36:37], s[6:7]
-; GFX1132-DPP-NEXT:    s_mov_b32 s42, s14
-; GFX1132-DPP-NEXT:    s_mov_b32 s43, s13
-; GFX1132-DPP-NEXT:    s_mov_b64 s[4:5], s[40:41]
+; GFX1132-DPP-NEXT:    s_mov_b32 s50, s14
+; GFX1132-DPP-NEXT:    s_mov_b32 s51, s13
+; GFX1132-DPP-NEXT:    s_mov_b64 s[4:5], s[48:49]
 ; GFX1132-DPP-NEXT:    s_mov_b64 s[6:7], s[2:3]
 ; GFX1132-DPP-NEXT:    s_mov_b64 s[10:11], s[36:37]
 ; GFX1132-DPP-NEXT:    s_mov_b32 s12, s13
@@ -8904,16 +8904,16 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau
 ; GFX1132-DPP-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v3, v8
 ; GFX1132-DPP-NEXT:    v_mbcnt_lo_u32_b32 v1, exec_lo, 0
 ; GFX1132-DPP-NEXT:    v_mov_b32_e32 v4, v9
-; GFX1132-DPP-NEXT:    s_mov_b32 s46, 0
+; GFX1132-DPP-NEXT:    s_mov_b32 s54, 0
 ; GFX1132-DPP-NEXT:    s_mov_b32 s0, exec_lo
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX1132-DPP-NEXT:    v_cmpx_eq_u32_e32 0, v1
 ; GFX1132-DPP-NEXT:    s_cbranch_execz .LBB11_3
 ; GFX1132-DPP-NEXT:  ; %bb.1:
-; GFX1132-DPP-NEXT:    s_load_b64 s[44:45], s[34:35], 0x24
+; GFX1132-DPP-NEXT:    s_load_b64 s[52:53], s[34:35], 0x24
 ; GFX1132-DPP-NEXT:    v_max_f64 v[41:42], v[3:4], v[3:4]
 ; GFX1132-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT:    global_load_b64 v[1:2], v0, s[44:45]
+; GFX1132-DPP-NEXT:    global_load_b64 v[1:2], v0, s[52:53]
 ; GFX1132-DPP-NEXT:    s_set_inst_prefetch_distance 0x1
 ; GFX1132-DPP-NEXT:    .p2align 6
 ; GFX1132-DPP-NEXT:  .LBB11_2: ; %atomicrmw.start
@@ -8929,26 +8929,26 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau
 ; GFX1132-DPP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
 ; GFX1132-DPP-NEXT:    v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0
 ; GFX1132-DPP-NEXT:    v_mov_b32_e32 v7, 0
-; GFX1132-DPP-NEXT:    s_mov_b64 s[4:5], s[40:41]
+; GFX1132-DPP-NEXT:    s_mov_b64 s[4:5], s[48:49]
 ; GFX1132-DPP-NEXT:    s_mov_b64 s[6:7], s[38:39]
 ; GFX1132-DPP-NEXT:    s_mov_b64 s[10:11], s[36:37]
-; GFX1132-DPP-NEXT:    s_mov_b32 s12, s43
-; GFX1132-DPP-NEXT:    s_mov_b32 s13, s42
+; GFX1132-DPP-NEXT:    s_mov_b32 s12, s51
+; GFX1132-DPP-NEXT:    s_mov_b32 s13, s50
 ; GFX1132-DPP-NEXT:    s_mov_b32 s14, s33
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_4)
 ; GFX1132-DPP-NEXT:    v_min_f64 v[3:4], v[3:4], v[41:42]
 ; GFX1132-DPP-NEXT:    scratch_store_b64 off, v[1:2], off
-; GFX1132-DPP-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s44
+; GFX1132-DPP-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s52
 ; GFX1132-DPP-NEXT:    scratch_store_b64 off, v[3:4], off offset:8
-; GFX1132-DPP-NEXT:    v_dual_mov_b32 v3, s45 :: v_dual_mov_b32 v4, 0
+; GFX1132-DPP-NEXT:    v_dual_mov_b32 v3, s53 :: v_dual_mov_b32 v4, 0
 ; GFX1132-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132-DPP-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX1132-DPP-NEXT:    scratch_load_b64 v[1:2], off, off
 ; GFX1132-DPP-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
 ; GFX1132-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX1132-DPP-NEXT:    s_or_b32 s46, vcc_lo, s46
-; GFX1132-DPP-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s46
+; GFX1132-DPP-NEXT:    s_or_b32 s54, vcc_lo, s54
+; GFX1132-DPP-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s54
 ; GFX1132-DPP-NEXT:    s_cbranch_execnz .LBB11_2
 ; GFX1132-DPP-NEXT:  .LBB11_3:
 ; GFX1132-DPP-NEXT:    s_set_inst_prefetch_distance 0x2

diff  --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll
index 7126680525b87..d575605f102b7 100644
--- a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll
+++ b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll
@@ -821,12 +821,12 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
 ; GFX7LESS-DPP-LABEL: global_atomic_fsub_uni_address_div_value_agent_scope_align4_unsafe:
 ; GFX7LESS-DPP:       ; %bb.0:
 ; GFX7LESS-DPP-NEXT:    s_mov_b32 s32, 0
-; GFX7LESS-DPP-NEXT:    s_mov_b32 s40, SCRATCH_RSRC_DWORD0
-; GFX7LESS-DPP-NEXT:    s_mov_b32 s41, SCRATCH_RSRC_DWORD1
-; GFX7LESS-DPP-NEXT:    s_mov_b32 s42, -1
-; GFX7LESS-DPP-NEXT:    s_mov_b32 s43, 0xe8f000
-; GFX7LESS-DPP-NEXT:    s_add_u32 s40, s40, s11
-; GFX7LESS-DPP-NEXT:    s_addc_u32 s41, s41, 0
+; GFX7LESS-DPP-NEXT:    s_mov_b32 s48, SCRATCH_RSRC_DWORD0
+; GFX7LESS-DPP-NEXT:    s_mov_b32 s49, SCRATCH_RSRC_DWORD1
+; GFX7LESS-DPP-NEXT:    s_mov_b32 s50, -1
+; GFX7LESS-DPP-NEXT:    s_mov_b32 s51, 0xe8f000
+; GFX7LESS-DPP-NEXT:    s_add_u32 s48, s48, s11
+; GFX7LESS-DPP-NEXT:    s_addc_u32 s49, s49, 0
 ; GFX7LESS-DPP-NEXT:    s_mov_b32 s14, s10
 ; GFX7LESS-DPP-NEXT:    s_mov_b32 s13, s9
 ; GFX7LESS-DPP-NEXT:    s_mov_b32 s12, s8
@@ -846,8 +846,8 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
 ; GFX7LESS-DPP-NEXT:    v_or_b32_e32 v31, v0, v2
 ; GFX7LESS-DPP-NEXT:    s_mov_b64 s[4:5], s[0:1]
 ; GFX7LESS-DPP-NEXT:    s_mov_b64 s[6:7], s[2:3]
-; GFX7LESS-DPP-NEXT:    s_mov_b64 s[0:1], s[40:41]
-; GFX7LESS-DPP-NEXT:    s_mov_b64 s[2:3], s[42:43]
+; GFX7LESS-DPP-NEXT:    s_mov_b64 s[0:1], s[48:49]
+; GFX7LESS-DPP-NEXT:    s_mov_b64 s[2:3], s[50:51]
 ; GFX7LESS-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7LESS-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GFX7LESS-DPP-NEXT:    buffer_load_dword v2, off, s[36:39], 0
@@ -2153,12 +2153,12 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_one_as_scope
 ; GFX7LESS-DPP-LABEL: global_atomic_fsub_uni_address_div_value_one_as_scope_unsafe_structfp:
 ; GFX7LESS-DPP:       ; %bb.0:
 ; GFX7LESS-DPP-NEXT:    s_mov_b32 s32, 0
-; GFX7LESS-DPP-NEXT:    s_mov_b32 s40, SCRATCH_RSRC_DWORD0
-; GFX7LESS-DPP-NEXT:    s_mov_b32 s41, SCRATCH_RSRC_DWORD1
-; GFX7LESS-DPP-NEXT:    s_mov_b32 s42, -1
-; GFX7LESS-DPP-NEXT:    s_mov_b32 s43, 0xe8f000
-; GFX7LESS-DPP-NEXT:    s_add_u32 s40, s40, s11
-; GFX7LESS-DPP-NEXT:    s_addc_u32 s41, s41, 0
+; GFX7LESS-DPP-NEXT:    s_mov_b32 s48, SCRATCH_RSRC_DWORD0
+; GFX7LESS-DPP-NEXT:    s_mov_b32 s49, SCRATCH_RSRC_DWORD1
+; GFX7LESS-DPP-NEXT:    s_mov_b32 s50, -1
+; GFX7LESS-DPP-NEXT:    s_mov_b32 s51, 0xe8f000
+; GFX7LESS-DPP-NEXT:    s_add_u32 s48, s48, s11
+; GFX7LESS-DPP-NEXT:    s_addc_u32 s49, s49, 0
 ; GFX7LESS-DPP-NEXT:    s_mov_b32 s14, s10
 ; GFX7LESS-DPP-NEXT:    s_mov_b32 s13, s9
 ; GFX7LESS-DPP-NEXT:    s_mov_b32 s12, s8
@@ -2178,8 +2178,8 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_one_as_scope
 ; GFX7LESS-DPP-NEXT:    v_or_b32_e32 v31, v0, v2
 ; GFX7LESS-DPP-NEXT:    s_mov_b64 s[4:5], s[0:1]
 ; GFX7LESS-DPP-NEXT:    s_mov_b64 s[6:7], s[2:3]
-; GFX7LESS-DPP-NEXT:    s_mov_b64 s[0:1], s[40:41]
-; GFX7LESS-DPP-NEXT:    s_mov_b64 s[2:3], s[42:43]
+; GFX7LESS-DPP-NEXT:    s_mov_b64 s[0:1], s[48:49]
+; GFX7LESS-DPP-NEXT:    s_mov_b64 s[2:3], s[50:51]
 ; GFX7LESS-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7LESS-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GFX7LESS-DPP-NEXT:    buffer_load_dword v2, off, s[36:39], 0
@@ -3485,12 +3485,12 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
 ; GFX7LESS-DPP-LABEL: global_atomic_fsub_uni_address_div_value_agent_scope_unsafe:
 ; GFX7LESS-DPP:       ; %bb.0:
 ; GFX7LESS-DPP-NEXT:    s_mov_b32 s32, 0
-; GFX7LESS-DPP-NEXT:    s_mov_b32 s40, SCRATCH_RSRC_DWORD0
-; GFX7LESS-DPP-NEXT:    s_mov_b32 s41, SCRATCH_RSRC_DWORD1
-; GFX7LESS-DPP-NEXT:    s_mov_b32 s42, -1
-; GFX7LESS-DPP-NEXT:    s_mov_b32 s43, 0xe8f000
-; GFX7LESS-DPP-NEXT:    s_add_u32 s40, s40, s11
-; GFX7LESS-DPP-NEXT:    s_addc_u32 s41, s41, 0
+; GFX7LESS-DPP-NEXT:    s_mov_b32 s48, SCRATCH_RSRC_DWORD0
+; GFX7LESS-DPP-NEXT:    s_mov_b32 s49, SCRATCH_RSRC_DWORD1
+; GFX7LESS-DPP-NEXT:    s_mov_b32 s50, -1
+; GFX7LESS-DPP-NEXT:    s_mov_b32 s51, 0xe8f000
+; GFX7LESS-DPP-NEXT:    s_add_u32 s48, s48, s11
+; GFX7LESS-DPP-NEXT:    s_addc_u32 s49, s49, 0
 ; GFX7LESS-DPP-NEXT:    s_mov_b32 s14, s10
 ; GFX7LESS-DPP-NEXT:    s_mov_b32 s13, s9
 ; GFX7LESS-DPP-NEXT:    s_mov_b32 s12, s8
@@ -3510,8 +3510,8 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
 ; GFX7LESS-DPP-NEXT:    v_or_b32_e32 v31, v0, v2
 ; GFX7LESS-DPP-NEXT:    s_mov_b64 s[4:5], s[0:1]
 ; GFX7LESS-DPP-NEXT:    s_mov_b64 s[6:7], s[2:3]
-; GFX7LESS-DPP-NEXT:    s_mov_b64 s[0:1], s[40:41]
-; GFX7LESS-DPP-NEXT:    s_mov_b64 s[2:3], s[42:43]
+; GFX7LESS-DPP-NEXT:    s_mov_b64 s[0:1], s[48:49]
+; GFX7LESS-DPP-NEXT:    s_mov_b64 s[2:3], s[50:51]
 ; GFX7LESS-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7LESS-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GFX7LESS-DPP-NEXT:    buffer_load_dword v2, off, s[36:39], 0
@@ -4313,12 +4313,12 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
 ; GFX7LESS-DPP-LABEL: global_atomic_fsub_uni_address_div_value_agent_scope_unsafe_structfp:
 ; GFX7LESS-DPP:       ; %bb.0:
 ; GFX7LESS-DPP-NEXT:    s_mov_b32 s32, 0
-; GFX7LESS-DPP-NEXT:    s_mov_b32 s40, SCRATCH_RSRC_DWORD0
-; GFX7LESS-DPP-NEXT:    s_mov_b32 s41, SCRATCH_RSRC_DWORD1
-; GFX7LESS-DPP-NEXT:    s_mov_b32 s42, -1
-; GFX7LESS-DPP-NEXT:    s_mov_b32 s43, 0xe8f000
-; GFX7LESS-DPP-NEXT:    s_add_u32 s40, s40, s11
-; GFX7LESS-DPP-NEXT:    s_addc_u32 s41, s41, 0
+; GFX7LESS-DPP-NEXT:    s_mov_b32 s48, SCRATCH_RSRC_DWORD0
+; GFX7LESS-DPP-NEXT:    s_mov_b32 s49, SCRATCH_RSRC_DWORD1
+; GFX7LESS-DPP-NEXT:    s_mov_b32 s50, -1
+; GFX7LESS-DPP-NEXT:    s_mov_b32 s51, 0xe8f000
+; GFX7LESS-DPP-NEXT:    s_add_u32 s48, s48, s11
+; GFX7LESS-DPP-NEXT:    s_addc_u32 s49, s49, 0
 ; GFX7LESS-DPP-NEXT:    s_mov_b32 s14, s10
 ; GFX7LESS-DPP-NEXT:    s_mov_b32 s13, s9
 ; GFX7LESS-DPP-NEXT:    s_mov_b32 s12, s8
@@ -4338,8 +4338,8 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
 ; GFX7LESS-DPP-NEXT:    v_or_b32_e32 v31, v0, v2
 ; GFX7LESS-DPP-NEXT:    s_mov_b64 s[4:5], s[0:1]
 ; GFX7LESS-DPP-NEXT:    s_mov_b64 s[6:7], s[2:3]
-; GFX7LESS-DPP-NEXT:    s_mov_b64 s[0:1], s[40:41]
-; GFX7LESS-DPP-NEXT:    s_mov_b64 s[2:3], s[42:43]
+; GFX7LESS-DPP-NEXT:    s_mov_b64 s[0:1], s[48:49]
+; GFX7LESS-DPP-NEXT:    s_mov_b64 s[2:3], s[50:51]
 ; GFX7LESS-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7LESS-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GFX7LESS-DPP-NEXT:    buffer_load_dword v2, off, s[36:39], 0
@@ -5644,12 +5644,12 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_default_scop
 ; GFX7LESS-DPP-LABEL: global_atomic_fsub_uni_address_div_value_default_scope_strictfp:
 ; GFX7LESS-DPP:       ; %bb.0:
 ; GFX7LESS-DPP-NEXT:    s_mov_b32 s32, 0
-; GFX7LESS-DPP-NEXT:    s_mov_b32 s40, SCRATCH_RSRC_DWORD0
-; GFX7LESS-DPP-NEXT:    s_mov_b32 s41, SCRATCH_RSRC_DWORD1
-; GFX7LESS-DPP-NEXT:    s_mov_b32 s42, -1
-; GFX7LESS-DPP-NEXT:    s_mov_b32 s43, 0xe8f000
-; GFX7LESS-DPP-NEXT:    s_add_u32 s40, s40, s11
-; GFX7LESS-DPP-NEXT:    s_addc_u32 s41, s41, 0
+; GFX7LESS-DPP-NEXT:    s_mov_b32 s48, SCRATCH_RSRC_DWORD0
+; GFX7LESS-DPP-NEXT:    s_mov_b32 s49, SCRATCH_RSRC_DWORD1
+; GFX7LESS-DPP-NEXT:    s_mov_b32 s50, -1
+; GFX7LESS-DPP-NEXT:    s_mov_b32 s51, 0xe8f000
+; GFX7LESS-DPP-NEXT:    s_add_u32 s48, s48, s11
+; GFX7LESS-DPP-NEXT:    s_addc_u32 s49, s49, 0
 ; GFX7LESS-DPP-NEXT:    s_mov_b32 s14, s10
 ; GFX7LESS-DPP-NEXT:    s_mov_b32 s13, s9
 ; GFX7LESS-DPP-NEXT:    s_mov_b32 s12, s8
@@ -5669,8 +5669,8 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_default_scop
 ; GFX7LESS-DPP-NEXT:    v_or_b32_e32 v31, v0, v2
 ; GFX7LESS-DPP-NEXT:    s_mov_b64 s[4:5], s[0:1]
 ; GFX7LESS-DPP-NEXT:    s_mov_b64 s[6:7], s[2:3]
-; GFX7LESS-DPP-NEXT:    s_mov_b64 s[0:1], s[40:41]
-; GFX7LESS-DPP-NEXT:    s_mov_b64 s[2:3], s[42:43]
+; GFX7LESS-DPP-NEXT:    s_mov_b64 s[0:1], s[48:49]
+; GFX7LESS-DPP-NEXT:    s_mov_b64 s[2:3], s[50:51]
 ; GFX7LESS-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7LESS-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GFX7LESS-DPP-NEXT:    buffer_load_dword v2, off, s[36:39], 0
@@ -6075,14 +6075,14 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent
 ; GFX7LESS-LABEL: global_atomic_fsub_double_uni_address_uni_value_agent_scope_unsafe:
 ; GFX7LESS:       ; %bb.0:
 ; GFX7LESS-NEXT:    s_movk_i32 s32, 0x800
-; GFX7LESS-NEXT:    s_mov_b32 s48, SCRATCH_RSRC_DWORD0
-; GFX7LESS-NEXT:    s_mov_b32 s49, SCRATCH_RSRC_DWORD1
-; GFX7LESS-NEXT:    s_mov_b32 s50, -1
-; GFX7LESS-NEXT:    s_mov_b32 s51, 0xe8f000
-; GFX7LESS-NEXT:    s_add_u32 s48, s48, s11
-; GFX7LESS-NEXT:    s_addc_u32 s49, s49, 0
+; GFX7LESS-NEXT:    s_mov_b32 s64, SCRATCH_RSRC_DWORD0
+; GFX7LESS-NEXT:    s_mov_b32 s65, SCRATCH_RSRC_DWORD1
+; GFX7LESS-NEXT:    s_mov_b32 s66, -1
+; GFX7LESS-NEXT:    s_mov_b32 s67, 0xe8f000
+; GFX7LESS-NEXT:    s_add_u32 s64, s64, s11
+; GFX7LESS-NEXT:    s_addc_u32 s65, s65, 0
 ; GFX7LESS-NEXT:    s_mov_b64 s[38:39], s[2:3]
-; GFX7LESS-NEXT:    s_mov_b64 s[40:41], s[0:1]
+; GFX7LESS-NEXT:    s_mov_b64 s[48:49], s[0:1]
 ; GFX7LESS-NEXT:    s_mov_b64 s[0:1], exec
 ; GFX7LESS-NEXT:    v_mbcnt_lo_u32_b32_e64 v3, s0, 0
 ; GFX7LESS-NEXT:    v_mbcnt_hi_u32_b32_e32 v3, s1, v3
@@ -6091,16 +6091,16 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent
 ; GFX7LESS-NEXT:    s_cbranch_execz .LBB9_3
 ; GFX7LESS-NEXT:  ; %bb.1:
 ; GFX7LESS-NEXT:    s_mov_b32 s33, s10
-; GFX7LESS-NEXT:    s_mov_b32 s42, s9
-; GFX7LESS-NEXT:    s_mov_b32 s43, s8
+; GFX7LESS-NEXT:    s_mov_b32 s50, s9
+; GFX7LESS-NEXT:    s_mov_b32 s51, s8
 ; GFX7LESS-NEXT:    s_mov_b64 s[34:35], s[6:7]
 ; GFX7LESS-NEXT:    s_mov_b64 s[36:37], s[4:5]
-; GFX7LESS-NEXT:    s_load_dwordx2 s[44:45], s[4:5], 0x9
+; GFX7LESS-NEXT:    s_load_dwordx2 s[52:53], s[4:5], 0x9
 ; GFX7LESS-NEXT:    s_bcnt1_i32_b64 s2, s[0:1]
-; GFX7LESS-NEXT:    s_mov_b64 s[46:47], 0
+; GFX7LESS-NEXT:    s_mov_b64 s[54:55], 0
 ; GFX7LESS-NEXT:    v_lshlrev_b32_e32 v3, 20, v2
 ; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7LESS-NEXT:    s_load_dwordx2 s[0:1], s[44:45], 0x0
+; GFX7LESS-NEXT:    s_load_dwordx2 s[0:1], s[52:53], 0x0
 ; GFX7LESS-NEXT:    v_lshlrev_b32_e32 v4, 10, v1
 ; GFX7LESS-NEXT:    v_cvt_f64_u32_e32 v[1:2], s2
 ; GFX7LESS-NEXT:    v_or_b32_e32 v4, v0, v4
@@ -6113,11 +6113,11 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent
 ; GFX7LESS-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX7LESS-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7LESS-NEXT:    v_add_f64 v[2:3], v[0:1], -v[41:42]
-; GFX7LESS-NEXT:    buffer_store_dword v1, off, s[48:51], 0 offset:4
-; GFX7LESS-NEXT:    buffer_store_dword v0, off, s[48:51], 0
+; GFX7LESS-NEXT:    buffer_store_dword v1, off, s[64:67], 0 offset:4
+; GFX7LESS-NEXT:    buffer_store_dword v0, off, s[64:67], 0
 ; GFX7LESS-NEXT:    s_add_u32 s8, s36, 44
-; GFX7LESS-NEXT:    buffer_store_dword v3, off, s[48:51], 0 offset:12
-; GFX7LESS-NEXT:    buffer_store_dword v2, off, s[48:51], 0 offset:8
+; GFX7LESS-NEXT:    buffer_store_dword v3, off, s[64:67], 0 offset:12
+; GFX7LESS-NEXT:    buffer_store_dword v2, off, s[64:67], 0 offset:8
 ; GFX7LESS-NEXT:    s_addc_u32 s9, s37, 0
 ; GFX7LESS-NEXT:    s_getpc_b64 s[0:1]
 ; GFX7LESS-NEXT:    s_add_u32 s0, s0, __atomic_compare_exchange at gotpcrel32@lo+4
@@ -6130,42 +6130,42 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent
 ; GFX7LESS-NEXT:    v_mov_b32_e32 v5, 8
 ; GFX7LESS-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX7LESS-NEXT:    v_mov_b32_e32 v7, 0
-; GFX7LESS-NEXT:    s_mov_b64 s[4:5], s[40:41]
+; GFX7LESS-NEXT:    s_mov_b64 s[4:5], s[48:49]
 ; GFX7LESS-NEXT:    s_mov_b64 s[6:7], s[38:39]
 ; GFX7LESS-NEXT:    s_mov_b64 s[10:11], s[34:35]
-; GFX7LESS-NEXT:    s_mov_b32 s12, s43
-; GFX7LESS-NEXT:    s_mov_b32 s13, s42
+; GFX7LESS-NEXT:    s_mov_b32 s12, s51
+; GFX7LESS-NEXT:    s_mov_b32 s13, s50
 ; GFX7LESS-NEXT:    s_mov_b32 s14, s33
 ; GFX7LESS-NEXT:    v_mov_b32_e32 v31, v40
-; GFX7LESS-NEXT:    s_mov_b64 s[0:1], s[48:49]
-; GFX7LESS-NEXT:    s_mov_b64 s[2:3], s[50:51]
+; GFX7LESS-NEXT:    s_mov_b64 s[0:1], s[64:65]
+; GFX7LESS-NEXT:    s_mov_b64 s[2:3], s[66:67]
 ; GFX7LESS-NEXT:    s_waitcnt expcnt(0)
-; GFX7LESS-NEXT:    v_mov_b32_e32 v2, s44
-; GFX7LESS-NEXT:    v_mov_b32_e32 v3, s45
+; GFX7LESS-NEXT:    v_mov_b32_e32 v2, s52
+; GFX7LESS-NEXT:    v_mov_b32_e32 v3, s53
 ; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7LESS-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GFX7LESS-NEXT:    v_and_b32_e32 v2, 1, v0
-; GFX7LESS-NEXT:    buffer_load_dword v0, off, s[48:51], 0
-; GFX7LESS-NEXT:    buffer_load_dword v1, off, s[48:51], 0 offset:4
+; GFX7LESS-NEXT:    buffer_load_dword v0, off, s[64:67], 0
+; GFX7LESS-NEXT:    buffer_load_dword v1, off, s[64:67], 0 offset:4
 ; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v2
-; GFX7LESS-NEXT:    s_or_b64 s[46:47], vcc, s[46:47]
-; GFX7LESS-NEXT:    s_andn2_b64 exec, exec, s[46:47]
+; GFX7LESS-NEXT:    s_or_b64 s[54:55], vcc, s[54:55]
+; GFX7LESS-NEXT:    s_andn2_b64 exec, exec, s[54:55]
 ; GFX7LESS-NEXT:    s_cbranch_execnz .LBB9_2
 ; GFX7LESS-NEXT:  .LBB9_3:
 ; GFX7LESS-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: global_atomic_fsub_double_uni_address_uni_value_agent_scope_unsafe:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_mov_b32 s48, SCRATCH_RSRC_DWORD0
-; GFX9-NEXT:    s_mov_b32 s49, SCRATCH_RSRC_DWORD1
-; GFX9-NEXT:    s_mov_b32 s50, -1
-; GFX9-NEXT:    s_mov_b64 s[40:41], s[0:1]
+; GFX9-NEXT:    s_mov_b32 s64, SCRATCH_RSRC_DWORD0
+; GFX9-NEXT:    s_mov_b32 s65, SCRATCH_RSRC_DWORD1
+; GFX9-NEXT:    s_mov_b32 s66, -1
+; GFX9-NEXT:    s_mov_b64 s[48:49], s[0:1]
 ; GFX9-NEXT:    s_mov_b64 s[0:1], exec
-; GFX9-NEXT:    s_mov_b32 s51, 0xe00000
+; GFX9-NEXT:    s_mov_b32 s67, 0xe00000
 ; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v3, s0, 0
-; GFX9-NEXT:    s_add_u32 s48, s48, s11
+; GFX9-NEXT:    s_add_u32 s64, s64, s11
 ; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v3, s1, v3
-; GFX9-NEXT:    s_addc_u32 s49, s49, 0
+; GFX9-NEXT:    s_addc_u32 s65, s65, 0
 ; GFX9-NEXT:    s_mov_b64 s[38:39], s[2:3]
 ; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
 ; GFX9-NEXT:    s_movk_i32 s32, 0x800
@@ -6173,21 +6173,21 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent
 ; GFX9-NEXT:    s_cbranch_execz .LBB9_3
 ; GFX9-NEXT:  ; %bb.1:
 ; GFX9-NEXT:    s_bcnt1_i32_b64 s0, s[0:1]
-; GFX9-NEXT:    s_load_dwordx2 s[44:45], s[4:5], 0x24
+; GFX9-NEXT:    s_load_dwordx2 s[52:53], s[4:5], 0x24
 ; GFX9-NEXT:    v_cvt_f64_u32_e32 v[3:4], s0
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
 ; GFX9-NEXT:    s_mov_b32 s33, s10
 ; GFX9-NEXT:    v_mul_f64 v[41:42], v[3:4], 4.0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[44:45], 0x0
-; GFX9-NEXT:    s_mov_b32 s42, s9
-; GFX9-NEXT:    s_mov_b32 s43, s8
+; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[52:53], 0x0
+; GFX9-NEXT:    s_mov_b32 s50, s9
+; GFX9-NEXT:    s_mov_b32 s51, s8
 ; GFX9-NEXT:    s_mov_b64 s[34:35], s[6:7]
 ; GFX9-NEXT:    s_mov_b64 s[36:37], s[4:5]
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v4, s1
-; GFX9-NEXT:    s_mov_b64 s[46:47], 0
+; GFX9-NEXT:    s_mov_b64 s[54:55], 0
 ; GFX9-NEXT:    v_mov_b32_e32 v3, s0
 ; GFX9-NEXT:    v_or3_b32 v40, v0, v1, v2
 ; GFX9-NEXT:  .LBB9_2: ; %atomicrmw.start
@@ -6200,68 +6200,68 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent
 ; GFX9-NEXT:    s_add_u32 s0, s0, __atomic_compare_exchange at gotpcrel32@lo+4
 ; GFX9-NEXT:    s_addc_u32 s1, s1, __atomic_compare_exchange at gotpcrel32@hi+12
 ; GFX9-NEXT:    s_load_dwordx2 s[16:17], s[0:1], 0x0
-; GFX9-NEXT:    s_mov_b64 s[0:1], s[48:49]
-; GFX9-NEXT:    buffer_store_dword v4, off, s[48:51], 0 offset:4
-; GFX9-NEXT:    buffer_store_dword v3, off, s[48:51], 0
-; GFX9-NEXT:    s_mov_b64 s[4:5], s[40:41]
-; GFX9-NEXT:    buffer_store_dword v1, off, s[48:51], 0 offset:12
-; GFX9-NEXT:    buffer_store_dword v0, off, s[48:51], 0 offset:8
+; GFX9-NEXT:    s_mov_b64 s[0:1], s[64:65]
+; GFX9-NEXT:    buffer_store_dword v4, off, s[64:67], 0 offset:4
+; GFX9-NEXT:    buffer_store_dword v3, off, s[64:67], 0
+; GFX9-NEXT:    s_mov_b64 s[4:5], s[48:49]
+; GFX9-NEXT:    buffer_store_dword v1, off, s[64:67], 0 offset:12
+; GFX9-NEXT:    buffer_store_dword v0, off, s[64:67], 0 offset:8
 ; GFX9-NEXT:    s_mov_b64 s[6:7], s[38:39]
 ; GFX9-NEXT:    s_mov_b64 s[10:11], s[34:35]
-; GFX9-NEXT:    s_mov_b32 s12, s43
-; GFX9-NEXT:    s_mov_b32 s13, s42
+; GFX9-NEXT:    s_mov_b32 s12, s51
+; GFX9-NEXT:    s_mov_b32 s13, s50
 ; GFX9-NEXT:    s_mov_b32 s14, s33
 ; GFX9-NEXT:    v_mov_b32_e32 v31, v40
-; GFX9-NEXT:    s_mov_b64 s[2:3], s[50:51]
+; GFX9-NEXT:    s_mov_b64 s[2:3], s[66:67]
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 8
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0
-; GFX9-NEXT:    v_mov_b32_e32 v2, s44
-; GFX9-NEXT:    v_mov_b32_e32 v3, s45
+; GFX9-NEXT:    v_mov_b32_e32 v2, s52
+; GFX9-NEXT:    v_mov_b32_e32 v3, s53
 ; GFX9-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v5, 8
 ; GFX9-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v7, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[16:17]
-; GFX9-NEXT:    buffer_load_dword v3, off, s[48:51], 0
-; GFX9-NEXT:    buffer_load_dword v4, off, s[48:51], 0 offset:4
+; GFX9-NEXT:    buffer_load_dword v3, off, s[64:67], 0
+; GFX9-NEXT:    buffer_load_dword v4, off, s[64:67], 0 offset:4
 ; GFX9-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX9-NEXT:    s_or_b64 s[46:47], vcc, s[46:47]
-; GFX9-NEXT:    s_andn2_b64 exec, exec, s[46:47]
+; GFX9-NEXT:    s_or_b64 s[54:55], vcc, s[54:55]
+; GFX9-NEXT:    s_andn2_b64 exec, exec, s[54:55]
 ; GFX9-NEXT:    s_cbranch_execnz .LBB9_2
 ; GFX9-NEXT:  .LBB9_3:
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX1064-LABEL: global_atomic_fsub_double_uni_address_uni_value_agent_scope_unsafe:
 ; GFX1064:       ; %bb.0:
-; GFX1064-NEXT:    s_mov_b32 s48, SCRATCH_RSRC_DWORD0
-; GFX1064-NEXT:    s_mov_b32 s49, SCRATCH_RSRC_DWORD1
-; GFX1064-NEXT:    s_mov_b32 s50, -1
-; GFX1064-NEXT:    s_mov_b32 s51, 0x31e16000
-; GFX1064-NEXT:    s_add_u32 s48, s48, s11
+; GFX1064-NEXT:    s_mov_b32 s64, SCRATCH_RSRC_DWORD0
+; GFX1064-NEXT:    s_mov_b32 s65, SCRATCH_RSRC_DWORD1
+; GFX1064-NEXT:    s_mov_b32 s66, -1
+; GFX1064-NEXT:    s_mov_b32 s67, 0x31e16000
+; GFX1064-NEXT:    s_add_u32 s64, s64, s11
 ; GFX1064-NEXT:    s_mov_b32 s33, s10
 ; GFX1064-NEXT:    s_mov_b64 s[10:11], exec
-; GFX1064-NEXT:    s_addc_u32 s49, s49, 0
+; GFX1064-NEXT:    s_addc_u32 s65, s65, 0
 ; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v3, s10, 0
-; GFX1064-NEXT:    s_mov_b64 s[40:41], s[0:1]
+; GFX1064-NEXT:    s_mov_b64 s[48:49], s[0:1]
 ; GFX1064-NEXT:    s_movk_i32 s32, 0x800
 ; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v3, s11, v3
 ; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
 ; GFX1064-NEXT:    s_and_saveexec_b64 s[0:1], vcc
 ; GFX1064-NEXT:    s_cbranch_execz .LBB9_3
 ; GFX1064-NEXT:  ; %bb.1:
-; GFX1064-NEXT:    s_load_dwordx2 s[44:45], s[4:5], 0x24
+; GFX1064-NEXT:    s_load_dwordx2 s[52:53], s[4:5], 0x24
 ; GFX1064-NEXT:    s_bcnt1_i32_b64 s0, s[10:11]
-; GFX1064-NEXT:    s_mov_b32 s42, s9
+; GFX1064-NEXT:    s_mov_b32 s50, s9
 ; GFX1064-NEXT:    v_cvt_f64_u32_e32 v[3:4], s0
-; GFX1064-NEXT:    s_mov_b32 s43, s8
+; GFX1064-NEXT:    s_mov_b32 s51, s8
 ; GFX1064-NEXT:    s_mov_b64 s[34:35], s[6:7]
 ; GFX1064-NEXT:    s_mov_b64 s[36:37], s[4:5]
 ; GFX1064-NEXT:    s_mov_b64 s[38:39], s[2:3]
-; GFX1064-NEXT:    s_mov_b64 s[46:47], 0
+; GFX1064-NEXT:    s_mov_b64 s[54:55], 0
 ; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[44:45], 0x0
+; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[52:53], 0x0
 ; GFX1064-NEXT:    v_mul_f64 v[41:42], v[3:4], 4.0
 ; GFX1064-NEXT:    v_lshlrev_b32_e32 v3, 20, v2
 ; GFX1064-NEXT:    v_lshlrev_b32_e32 v4, 10, v1
@@ -6278,69 +6278,69 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent
 ; GFX1064-NEXT:    s_getpc_b64 s[0:1]
 ; GFX1064-NEXT:    s_add_u32 s0, s0, __atomic_compare_exchange at gotpcrel32@lo+4
 ; GFX1064-NEXT:    s_addc_u32 s1, s1, __atomic_compare_exchange at gotpcrel32@hi+12
-; GFX1064-NEXT:    buffer_store_dword v2, off, s[48:51], 0 offset:4
-; GFX1064-NEXT:    buffer_store_dword v1, off, s[48:51], 0
+; GFX1064-NEXT:    buffer_store_dword v2, off, s[64:67], 0 offset:4
+; GFX1064-NEXT:    buffer_store_dword v1, off, s[64:67], 0
 ; GFX1064-NEXT:    s_load_dwordx2 s[16:17], s[0:1], 0x0
 ; GFX1064-NEXT:    v_mov_b32_e32 v31, v40
 ; GFX1064-NEXT:    v_mov_b32_e32 v0, 8
 ; GFX1064-NEXT:    v_mov_b32_e32 v1, 0
-; GFX1064-NEXT:    v_mov_b32_e32 v2, s44
+; GFX1064-NEXT:    v_mov_b32_e32 v2, s52
 ; GFX1064-NEXT:    v_mov_b32_e32 v5, 8
 ; GFX1064-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX1064-NEXT:    v_mov_b32_e32 v7, 0
-; GFX1064-NEXT:    s_mov_b64 s[0:1], s[48:49]
-; GFX1064-NEXT:    s_mov_b64 s[4:5], s[40:41]
+; GFX1064-NEXT:    s_mov_b64 s[0:1], s[64:65]
+; GFX1064-NEXT:    s_mov_b64 s[4:5], s[48:49]
 ; GFX1064-NEXT:    s_mov_b64 s[6:7], s[38:39]
 ; GFX1064-NEXT:    s_mov_b64 s[10:11], s[34:35]
-; GFX1064-NEXT:    s_mov_b32 s12, s43
-; GFX1064-NEXT:    s_mov_b32 s13, s42
+; GFX1064-NEXT:    s_mov_b32 s12, s51
+; GFX1064-NEXT:    s_mov_b32 s13, s50
 ; GFX1064-NEXT:    s_mov_b32 s14, s33
-; GFX1064-NEXT:    s_mov_b64 s[2:3], s[50:51]
-; GFX1064-NEXT:    buffer_store_dword v4, off, s[48:51], 0 offset:12
-; GFX1064-NEXT:    buffer_store_dword v3, off, s[48:51], 0 offset:8
-; GFX1064-NEXT:    v_mov_b32_e32 v3, s45
+; GFX1064-NEXT:    s_mov_b64 s[2:3], s[66:67]
+; GFX1064-NEXT:    buffer_store_dword v4, off, s[64:67], 0 offset:12
+; GFX1064-NEXT:    buffer_store_dword v3, off, s[64:67], 0 offset:8
+; GFX1064-NEXT:    v_mov_b32_e32 v3, s53
 ; GFX1064-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1064-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GFX1064-NEXT:    s_clause 0x1
-; GFX1064-NEXT:    buffer_load_dword v1, off, s[48:51], 0
-; GFX1064-NEXT:    buffer_load_dword v2, off, s[48:51], 0 offset:4
+; GFX1064-NEXT:    buffer_load_dword v1, off, s[64:67], 0
+; GFX1064-NEXT:    buffer_load_dword v2, off, s[64:67], 0 offset:4
 ; GFX1064-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX1064-NEXT:    s_or_b64 s[46:47], vcc, s[46:47]
-; GFX1064-NEXT:    s_andn2_b64 exec, exec, s[46:47]
+; GFX1064-NEXT:    s_or_b64 s[54:55], vcc, s[54:55]
+; GFX1064-NEXT:    s_andn2_b64 exec, exec, s[54:55]
 ; GFX1064-NEXT:    s_cbranch_execnz .LBB9_2
 ; GFX1064-NEXT:  .LBB9_3:
 ; GFX1064-NEXT:    s_endpgm
 ;
 ; GFX1032-LABEL: global_atomic_fsub_double_uni_address_uni_value_agent_scope_unsafe:
 ; GFX1032:       ; %bb.0:
-; GFX1032-NEXT:    s_mov_b32 s42, s9
+; GFX1032-NEXT:    s_mov_b32 s50, s9
 ; GFX1032-NEXT:    s_mov_b32 s9, exec_lo
-; GFX1032-NEXT:    s_mov_b32 s48, SCRATCH_RSRC_DWORD0
+; GFX1032-NEXT:    s_mov_b32 s64, SCRATCH_RSRC_DWORD0
 ; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v3, s9, 0
-; GFX1032-NEXT:    s_mov_b32 s49, SCRATCH_RSRC_DWORD1
-; GFX1032-NEXT:    s_mov_b32 s50, -1
-; GFX1032-NEXT:    s_mov_b32 s51, 0x31c16000
-; GFX1032-NEXT:    s_add_u32 s48, s48, s11
+; GFX1032-NEXT:    s_mov_b32 s65, SCRATCH_RSRC_DWORD1
+; GFX1032-NEXT:    s_mov_b32 s66, -1
+; GFX1032-NEXT:    s_mov_b32 s67, 0x31c16000
+; GFX1032-NEXT:    s_add_u32 s64, s64, s11
 ; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v3
-; GFX1032-NEXT:    s_addc_u32 s49, s49, 0
-; GFX1032-NEXT:    s_mov_b64 s[40:41], s[0:1]
-; GFX1032-NEXT:    s_mov_b32 s46, 0
+; GFX1032-NEXT:    s_addc_u32 s65, s65, 0
+; GFX1032-NEXT:    s_mov_b64 s[48:49], s[0:1]
+; GFX1032-NEXT:    s_mov_b32 s54, 0
 ; GFX1032-NEXT:    s_movk_i32 s32, 0x400
 ; GFX1032-NEXT:    s_and_saveexec_b32 s0, vcc_lo
 ; GFX1032-NEXT:    s_cbranch_execz .LBB9_3
 ; GFX1032-NEXT:  ; %bb.1:
-; GFX1032-NEXT:    s_load_dwordx2 s[44:45], s[4:5], 0x24
+; GFX1032-NEXT:    s_load_dwordx2 s[52:53], s[4:5], 0x24
 ; GFX1032-NEXT:    s_bcnt1_i32_b32 s0, s9
 ; GFX1032-NEXT:    s_mov_b32 s33, s10
 ; GFX1032-NEXT:    v_cvt_f64_u32_e32 v[3:4], s0
-; GFX1032-NEXT:    s_mov_b32 s43, s8
+; GFX1032-NEXT:    s_mov_b32 s51, s8
 ; GFX1032-NEXT:    s_mov_b64 s[34:35], s[6:7]
 ; GFX1032-NEXT:    s_mov_b64 s[36:37], s[4:5]
 ; GFX1032-NEXT:    s_mov_b64 s[38:39], s[2:3]
 ; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[44:45], 0x0
+; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[52:53], 0x0
 ; GFX1032-NEXT:    v_mul_f64 v[41:42], v[3:4], 4.0
 ; GFX1032-NEXT:    v_lshlrev_b32_e32 v3, 20, v2
 ; GFX1032-NEXT:    v_lshlrev_b32_e32 v4, 10, v1
@@ -6357,37 +6357,37 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent
 ; GFX1032-NEXT:    s_getpc_b64 s[0:1]
 ; GFX1032-NEXT:    s_add_u32 s0, s0, __atomic_compare_exchange at gotpcrel32@lo+4
 ; GFX1032-NEXT:    s_addc_u32 s1, s1, __atomic_compare_exchange at gotpcrel32@hi+12
-; GFX1032-NEXT:    buffer_store_dword v2, off, s[48:51], 0 offset:4
-; GFX1032-NEXT:    buffer_store_dword v1, off, s[48:51], 0
+; GFX1032-NEXT:    buffer_store_dword v2, off, s[64:67], 0 offset:4
+; GFX1032-NEXT:    buffer_store_dword v1, off, s[64:67], 0
 ; GFX1032-NEXT:    s_load_dwordx2 s[16:17], s[0:1], 0x0
 ; GFX1032-NEXT:    v_mov_b32_e32 v31, v40
 ; GFX1032-NEXT:    v_mov_b32_e32 v0, 8
 ; GFX1032-NEXT:    v_mov_b32_e32 v1, 0
-; GFX1032-NEXT:    v_mov_b32_e32 v2, s44
+; GFX1032-NEXT:    v_mov_b32_e32 v2, s52
 ; GFX1032-NEXT:    v_mov_b32_e32 v5, 8
 ; GFX1032-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX1032-NEXT:    v_mov_b32_e32 v7, 0
-; GFX1032-NEXT:    s_mov_b64 s[0:1], s[48:49]
-; GFX1032-NEXT:    s_mov_b64 s[4:5], s[40:41]
+; GFX1032-NEXT:    s_mov_b64 s[0:1], s[64:65]
+; GFX1032-NEXT:    s_mov_b64 s[4:5], s[48:49]
 ; GFX1032-NEXT:    s_mov_b64 s[6:7], s[38:39]
 ; GFX1032-NEXT:    s_mov_b64 s[10:11], s[34:35]
-; GFX1032-NEXT:    s_mov_b32 s12, s43
-; GFX1032-NEXT:    s_mov_b32 s13, s42
+; GFX1032-NEXT:    s_mov_b32 s12, s51
+; GFX1032-NEXT:    s_mov_b32 s13, s50
 ; GFX1032-NEXT:    s_mov_b32 s14, s33
-; GFX1032-NEXT:    s_mov_b64 s[2:3], s[50:51]
-; GFX1032-NEXT:    buffer_store_dword v4, off, s[48:51], 0 offset:12
-; GFX1032-NEXT:    buffer_store_dword v3, off, s[48:51], 0 offset:8
-; GFX1032-NEXT:    v_mov_b32_e32 v3, s45
+; GFX1032-NEXT:    s_mov_b64 s[2:3], s[66:67]
+; GFX1032-NEXT:    buffer_store_dword v4, off, s[64:67], 0 offset:12
+; GFX1032-NEXT:    buffer_store_dword v3, off, s[64:67], 0 offset:8
+; GFX1032-NEXT:    v_mov_b32_e32 v3, s53
 ; GFX1032-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1032-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GFX1032-NEXT:    s_clause 0x1
-; GFX1032-NEXT:    buffer_load_dword v1, off, s[48:51], 0
-; GFX1032-NEXT:    buffer_load_dword v2, off, s[48:51], 0 offset:4
+; GFX1032-NEXT:    buffer_load_dword v1, off, s[64:67], 0
+; GFX1032-NEXT:    buffer_load_dword v2, off, s[64:67], 0 offset:4
 ; GFX1032-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX1032-NEXT:    s_or_b32 s46, vcc_lo, s46
-; GFX1032-NEXT:    s_andn2_b32 exec_lo, exec_lo, s46
+; GFX1032-NEXT:    s_or_b32 s54, vcc_lo, s54
+; GFX1032-NEXT:    s_andn2_b32 exec_lo, exec_lo, s54
 ; GFX1032-NEXT:    s_cbranch_execnz .LBB9_2
 ; GFX1032-NEXT:  .LBB9_3:
 ; GFX1032-NEXT:    s_endpgm
@@ -6398,7 +6398,7 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent
 ; GFX1164-NEXT:    s_mov_b64 s[10:11], exec
 ; GFX1164-NEXT:    v_mov_b32_e32 v40, v0
 ; GFX1164-NEXT:    v_mbcnt_lo_u32_b32 v0, s10, 0
-; GFX1164-NEXT:    s_mov_b64 s[40:41], s[0:1]
+; GFX1164-NEXT:    s_mov_b64 s[48:49], s[0:1]
 ; GFX1164-NEXT:    s_mov_b32 s32, 32
 ; GFX1164-NEXT:    s_mov_b64 s[0:1], exec
 ; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
@@ -6407,16 +6407,16 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent
 ; GFX1164-NEXT:    s_cbranch_execz .LBB9_3
 ; GFX1164-NEXT:  ; %bb.1:
 ; GFX1164-NEXT:    s_bcnt1_i32_b64 s0, s[10:11]
-; GFX1164-NEXT:    s_load_b64 s[44:45], s[4:5], 0x24
+; GFX1164-NEXT:    s_load_b64 s[52:53], s[4:5], 0x24
 ; GFX1164-NEXT:    v_cvt_f64_u32_e32 v[0:1], s0
-; GFX1164-NEXT:    s_mov_b32 s42, s9
-; GFX1164-NEXT:    s_mov_b32 s43, s8
+; GFX1164-NEXT:    s_mov_b32 s50, s9
+; GFX1164-NEXT:    s_mov_b32 s51, s8
 ; GFX1164-NEXT:    s_mov_b64 s[34:35], s[6:7]
 ; GFX1164-NEXT:    s_mov_b64 s[36:37], s[4:5]
 ; GFX1164-NEXT:    s_mov_b64 s[38:39], s[2:3]
-; GFX1164-NEXT:    s_mov_b64 s[46:47], 0
+; GFX1164-NEXT:    s_mov_b64 s[54:55], 0
 ; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT:    s_load_b64 s[0:1], s[44:45], 0x0
+; GFX1164-NEXT:    s_load_b64 s[0:1], s[52:53], 0x0
 ; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1164-NEXT:    v_mul_f64 v[41:42], v[0:1], 4.0
 ; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
@@ -6439,18 +6439,18 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent
 ; GFX1164-NEXT:    v_mov_b32_e32 v31, v40
 ; GFX1164-NEXT:    v_mov_b32_e32 v0, 8
 ; GFX1164-NEXT:    v_mov_b32_e32 v1, 0
-; GFX1164-NEXT:    v_mov_b32_e32 v2, s44
+; GFX1164-NEXT:    v_mov_b32_e32 v2, s52
 ; GFX1164-NEXT:    v_mov_b32_e32 v5, 8
 ; GFX1164-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX1164-NEXT:    v_mov_b32_e32 v7, 0
-; GFX1164-NEXT:    s_mov_b64 s[4:5], s[40:41]
+; GFX1164-NEXT:    s_mov_b64 s[4:5], s[48:49]
 ; GFX1164-NEXT:    s_mov_b64 s[6:7], s[38:39]
 ; GFX1164-NEXT:    s_mov_b64 s[10:11], s[34:35]
-; GFX1164-NEXT:    s_mov_b32 s12, s43
-; GFX1164-NEXT:    s_mov_b32 s13, s42
+; GFX1164-NEXT:    s_mov_b32 s12, s51
+; GFX1164-NEXT:    s_mov_b32 s13, s50
 ; GFX1164-NEXT:    s_mov_b32 s14, s33
 ; GFX1164-NEXT:    scratch_store_b64 off, v[3:4], off offset:8
-; GFX1164-NEXT:    v_mov_b32_e32 v3, s45
+; GFX1164-NEXT:    v_mov_b32_e32 v3, s53
 ; GFX1164-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1164-NEXT:    s_swappc_b64 s[30:31], s[0:1]
@@ -6458,8 +6458,8 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent
 ; GFX1164-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
 ; GFX1164-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX1164-NEXT:    s_or_b64 s[46:47], vcc, s[46:47]
-; GFX1164-NEXT:    s_and_not1_b64 exec, exec, s[46:47]
+; GFX1164-NEXT:    s_or_b64 s[54:55], vcc, s[54:55]
+; GFX1164-NEXT:    s_and_not1_b64 exec, exec, s[54:55]
 ; GFX1164-NEXT:    s_cbranch_execnz .LBB9_2
 ; GFX1164-NEXT:  .LBB9_3:
 ; GFX1164-NEXT:    s_set_inst_prefetch_distance 0x2
@@ -6470,24 +6470,24 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent
 ; GFX1132-NEXT:    s_mov_b32 s8, exec_lo
 ; GFX1132-NEXT:    v_mov_b32_e32 v40, v0
 ; GFX1132-NEXT:    v_mbcnt_lo_u32_b32 v0, s8, 0
-; GFX1132-NEXT:    s_mov_b64 s[40:41], s[0:1]
-; GFX1132-NEXT:    s_mov_b32 s46, 0
+; GFX1132-NEXT:    s_mov_b64 s[48:49], s[0:1]
+; GFX1132-NEXT:    s_mov_b32 s54, 0
 ; GFX1132-NEXT:    s_mov_b32 s32, 32
 ; GFX1132-NEXT:    s_mov_b32 s0, exec_lo
 ; GFX1132-NEXT:    v_cmpx_eq_u32_e32 0, v0
 ; GFX1132-NEXT:    s_cbranch_execz .LBB9_3
 ; GFX1132-NEXT:  ; %bb.1:
 ; GFX1132-NEXT:    s_bcnt1_i32_b32 s0, s8
-; GFX1132-NEXT:    s_load_b64 s[44:45], s[4:5], 0x24
+; GFX1132-NEXT:    s_load_b64 s[52:53], s[4:5], 0x24
 ; GFX1132-NEXT:    v_cvt_f64_u32_e32 v[0:1], s0
 ; GFX1132-NEXT:    s_mov_b32 s33, s15
-; GFX1132-NEXT:    s_mov_b32 s42, s14
-; GFX1132-NEXT:    s_mov_b32 s43, s13
+; GFX1132-NEXT:    s_mov_b32 s50, s14
+; GFX1132-NEXT:    s_mov_b32 s51, s13
 ; GFX1132-NEXT:    s_mov_b64 s[34:35], s[6:7]
 ; GFX1132-NEXT:    s_mov_b64 s[36:37], s[4:5]
 ; GFX1132-NEXT:    s_mov_b64 s[38:39], s[2:3]
 ; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT:    s_load_b64 s[0:1], s[44:45], 0x0
+; GFX1132-NEXT:    s_load_b64 s[0:1], s[52:53], 0x0
 ; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1132-NEXT:    v_mul_f64 v[41:42], v[0:1], 4.0
 ; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
@@ -6507,25 +6507,25 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent
 ; GFX1132-NEXT:    scratch_store_b64 off, v[1:2], off
 ; GFX1132-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
 ; GFX1132-NEXT:    v_dual_mov_b32 v31, v40 :: v_dual_mov_b32 v0, 8
-; GFX1132-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s44
+; GFX1132-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s52
 ; GFX1132-NEXT:    v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0
 ; GFX1132-NEXT:    v_mov_b32_e32 v7, 0
-; GFX1132-NEXT:    s_mov_b64 s[4:5], s[40:41]
+; GFX1132-NEXT:    s_mov_b64 s[4:5], s[48:49]
 ; GFX1132-NEXT:    s_mov_b64 s[6:7], s[38:39]
 ; GFX1132-NEXT:    s_mov_b64 s[10:11], s[34:35]
-; GFX1132-NEXT:    s_mov_b32 s12, s43
-; GFX1132-NEXT:    s_mov_b32 s13, s42
+; GFX1132-NEXT:    s_mov_b32 s12, s51
+; GFX1132-NEXT:    s_mov_b32 s13, s50
 ; GFX1132-NEXT:    s_mov_b32 s14, s33
 ; GFX1132-NEXT:    scratch_store_b64 off, v[3:4], off offset:8
-; GFX1132-NEXT:    v_dual_mov_b32 v3, s45 :: v_dual_mov_b32 v4, 0
+; GFX1132-NEXT:    v_dual_mov_b32 v3, s53 :: v_dual_mov_b32 v4, 0
 ; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX1132-NEXT:    scratch_load_b64 v[1:2], off, off
 ; GFX1132-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
 ; GFX1132-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX1132-NEXT:    s_or_b32 s46, vcc_lo, s46
-; GFX1132-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s46
+; GFX1132-NEXT:    s_or_b32 s54, vcc_lo, s54
+; GFX1132-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s54
 ; GFX1132-NEXT:    s_cbranch_execnz .LBB9_2
 ; GFX1132-NEXT:  .LBB9_3:
 ; GFX1132-NEXT:    s_set_inst_prefetch_distance 0x2
@@ -6534,14 +6534,14 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent
 ; GFX7LESS-DPP-LABEL: global_atomic_fsub_double_uni_address_uni_value_agent_scope_unsafe:
 ; GFX7LESS-DPP:       ; %bb.0:
 ; GFX7LESS-DPP-NEXT:    s_movk_i32 s32, 0x800
-; GFX7LESS-DPP-NEXT:    s_mov_b32 s48, SCRATCH_RSRC_DWORD0
-; GFX7LESS-DPP-NEXT:    s_mov_b32 s49, SCRATCH_RSRC_DWORD1
-; GFX7LESS-DPP-NEXT:    s_mov_b32 s50, -1
-; GFX7LESS-DPP-NEXT:    s_mov_b32 s51, 0xe8f000
-; GFX7LESS-DPP-NEXT:    s_add_u32 s48, s48, s11
-; GFX7LESS-DPP-NEXT:    s_addc_u32 s49, s49, 0
+; GFX7LESS-DPP-NEXT:    s_mov_b32 s64, SCRATCH_RSRC_DWORD0
+; GFX7LESS-DPP-NEXT:    s_mov_b32 s65, SCRATCH_RSRC_DWORD1
+; GFX7LESS-DPP-NEXT:    s_mov_b32 s66, -1
+; GFX7LESS-DPP-NEXT:    s_mov_b32 s67, 0xe8f000
+; GFX7LESS-DPP-NEXT:    s_add_u32 s64, s64, s11
+; GFX7LESS-DPP-NEXT:    s_addc_u32 s65, s65, 0
 ; GFX7LESS-DPP-NEXT:    s_mov_b64 s[38:39], s[2:3]
-; GFX7LESS-DPP-NEXT:    s_mov_b64 s[40:41], s[0:1]
+; GFX7LESS-DPP-NEXT:    s_mov_b64 s[48:49], s[0:1]
 ; GFX7LESS-DPP-NEXT:    s_mov_b64 s[0:1], exec
 ; GFX7LESS-DPP-NEXT:    v_mbcnt_lo_u32_b32_e64 v3, s0, 0
 ; GFX7LESS-DPP-NEXT:    v_mbcnt_hi_u32_b32_e32 v3, s1, v3
@@ -6550,16 +6550,16 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent
 ; GFX7LESS-DPP-NEXT:    s_cbranch_execz .LBB9_3
 ; GFX7LESS-DPP-NEXT:  ; %bb.1:
 ; GFX7LESS-DPP-NEXT:    s_mov_b32 s33, s10
-; GFX7LESS-DPP-NEXT:    s_mov_b32 s42, s9
-; GFX7LESS-DPP-NEXT:    s_mov_b32 s43, s8
+; GFX7LESS-DPP-NEXT:    s_mov_b32 s50, s9
+; GFX7LESS-DPP-NEXT:    s_mov_b32 s51, s8
 ; GFX7LESS-DPP-NEXT:    s_mov_b64 s[34:35], s[6:7]
 ; GFX7LESS-DPP-NEXT:    s_mov_b64 s[36:37], s[4:5]
-; GFX7LESS-DPP-NEXT:    s_load_dwordx2 s[44:45], s[4:5], 0x9
+; GFX7LESS-DPP-NEXT:    s_load_dwordx2 s[52:53], s[4:5], 0x9
 ; GFX7LESS-DPP-NEXT:    s_bcnt1_i32_b64 s2, s[0:1]
-; GFX7LESS-DPP-NEXT:    s_mov_b64 s[46:47], 0
+; GFX7LESS-DPP-NEXT:    s_mov_b64 s[54:55], 0
 ; GFX7LESS-DPP-NEXT:    v_lshlrev_b32_e32 v3, 20, v2
 ; GFX7LESS-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7LESS-DPP-NEXT:    s_load_dwordx2 s[0:1], s[44:45], 0x0
+; GFX7LESS-DPP-NEXT:    s_load_dwordx2 s[0:1], s[52:53], 0x0
 ; GFX7LESS-DPP-NEXT:    v_lshlrev_b32_e32 v4, 10, v1
 ; GFX7LESS-DPP-NEXT:    v_cvt_f64_u32_e32 v[1:2], s2
 ; GFX7LESS-DPP-NEXT:    v_or_b32_e32 v4, v0, v4
@@ -6572,11 +6572,11 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent
 ; GFX7LESS-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX7LESS-DPP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7LESS-DPP-NEXT:    v_add_f64 v[2:3], v[0:1], -v[41:42]
-; GFX7LESS-DPP-NEXT:    buffer_store_dword v1, off, s[48:51], 0 offset:4
-; GFX7LESS-DPP-NEXT:    buffer_store_dword v0, off, s[48:51], 0
+; GFX7LESS-DPP-NEXT:    buffer_store_dword v1, off, s[64:67], 0 offset:4
+; GFX7LESS-DPP-NEXT:    buffer_store_dword v0, off, s[64:67], 0
 ; GFX7LESS-DPP-NEXT:    s_add_u32 s8, s36, 44
-; GFX7LESS-DPP-NEXT:    buffer_store_dword v3, off, s[48:51], 0 offset:12
-; GFX7LESS-DPP-NEXT:    buffer_store_dword v2, off, s[48:51], 0 offset:8
+; GFX7LESS-DPP-NEXT:    buffer_store_dword v3, off, s[64:67], 0 offset:12
+; GFX7LESS-DPP-NEXT:    buffer_store_dword v2, off, s[64:67], 0 offset:8
 ; GFX7LESS-DPP-NEXT:    s_addc_u32 s9, s37, 0
 ; GFX7LESS-DPP-NEXT:    s_getpc_b64 s[0:1]
 ; GFX7LESS-DPP-NEXT:    s_add_u32 s0, s0, __atomic_compare_exchange at gotpcrel32@lo+4
@@ -6589,42 +6589,42 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent
 ; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v5, 8
 ; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v7, 0
-; GFX7LESS-DPP-NEXT:    s_mov_b64 s[4:5], s[40:41]
+; GFX7LESS-DPP-NEXT:    s_mov_b64 s[4:5], s[48:49]
 ; GFX7LESS-DPP-NEXT:    s_mov_b64 s[6:7], s[38:39]
 ; GFX7LESS-DPP-NEXT:    s_mov_b64 s[10:11], s[34:35]
-; GFX7LESS-DPP-NEXT:    s_mov_b32 s12, s43
-; GFX7LESS-DPP-NEXT:    s_mov_b32 s13, s42
+; GFX7LESS-DPP-NEXT:    s_mov_b32 s12, s51
+; GFX7LESS-DPP-NEXT:    s_mov_b32 s13, s50
 ; GFX7LESS-DPP-NEXT:    s_mov_b32 s14, s33
 ; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v31, v40
-; GFX7LESS-DPP-NEXT:    s_mov_b64 s[0:1], s[48:49]
-; GFX7LESS-DPP-NEXT:    s_mov_b64 s[2:3], s[50:51]
+; GFX7LESS-DPP-NEXT:    s_mov_b64 s[0:1], s[64:65]
+; GFX7LESS-DPP-NEXT:    s_mov_b64 s[2:3], s[66:67]
 ; GFX7LESS-DPP-NEXT:    s_waitcnt expcnt(0)
-; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v2, s44
-; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v3, s45
+; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v2, s52
+; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v3, s53
 ; GFX7LESS-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7LESS-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GFX7LESS-DPP-NEXT:    v_and_b32_e32 v2, 1, v0
-; GFX7LESS-DPP-NEXT:    buffer_load_dword v0, off, s[48:51], 0
-; GFX7LESS-DPP-NEXT:    buffer_load_dword v1, off, s[48:51], 0 offset:4
+; GFX7LESS-DPP-NEXT:    buffer_load_dword v0, off, s[64:67], 0
+; GFX7LESS-DPP-NEXT:    buffer_load_dword v1, off, s[64:67], 0 offset:4
 ; GFX7LESS-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v2
-; GFX7LESS-DPP-NEXT:    s_or_b64 s[46:47], vcc, s[46:47]
-; GFX7LESS-DPP-NEXT:    s_andn2_b64 exec, exec, s[46:47]
+; GFX7LESS-DPP-NEXT:    s_or_b64 s[54:55], vcc, s[54:55]
+; GFX7LESS-DPP-NEXT:    s_andn2_b64 exec, exec, s[54:55]
 ; GFX7LESS-DPP-NEXT:    s_cbranch_execnz .LBB9_2
 ; GFX7LESS-DPP-NEXT:  .LBB9_3:
 ; GFX7LESS-DPP-NEXT:    s_endpgm
 ;
 ; GFX9-DPP-LABEL: global_atomic_fsub_double_uni_address_uni_value_agent_scope_unsafe:
 ; GFX9-DPP:       ; %bb.0:
-; GFX9-DPP-NEXT:    s_mov_b32 s48, SCRATCH_RSRC_DWORD0
-; GFX9-DPP-NEXT:    s_mov_b32 s49, SCRATCH_RSRC_DWORD1
-; GFX9-DPP-NEXT:    s_mov_b32 s50, -1
-; GFX9-DPP-NEXT:    s_mov_b64 s[40:41], s[0:1]
+; GFX9-DPP-NEXT:    s_mov_b32 s64, SCRATCH_RSRC_DWORD0
+; GFX9-DPP-NEXT:    s_mov_b32 s65, SCRATCH_RSRC_DWORD1
+; GFX9-DPP-NEXT:    s_mov_b32 s66, -1
+; GFX9-DPP-NEXT:    s_mov_b64 s[48:49], s[0:1]
 ; GFX9-DPP-NEXT:    s_mov_b64 s[0:1], exec
-; GFX9-DPP-NEXT:    s_mov_b32 s51, 0xe00000
+; GFX9-DPP-NEXT:    s_mov_b32 s67, 0xe00000
 ; GFX9-DPP-NEXT:    v_mbcnt_lo_u32_b32 v3, s0, 0
-; GFX9-DPP-NEXT:    s_add_u32 s48, s48, s11
+; GFX9-DPP-NEXT:    s_add_u32 s64, s64, s11
 ; GFX9-DPP-NEXT:    v_mbcnt_hi_u32_b32 v3, s1, v3
-; GFX9-DPP-NEXT:    s_addc_u32 s49, s49, 0
+; GFX9-DPP-NEXT:    s_addc_u32 s65, s65, 0
 ; GFX9-DPP-NEXT:    s_mov_b64 s[38:39], s[2:3]
 ; GFX9-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
 ; GFX9-DPP-NEXT:    s_movk_i32 s32, 0x800
@@ -6632,21 +6632,21 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent
 ; GFX9-DPP-NEXT:    s_cbranch_execz .LBB9_3
 ; GFX9-DPP-NEXT:  ; %bb.1:
 ; GFX9-DPP-NEXT:    s_bcnt1_i32_b64 s0, s[0:1]
-; GFX9-DPP-NEXT:    s_load_dwordx2 s[44:45], s[4:5], 0x24
+; GFX9-DPP-NEXT:    s_load_dwordx2 s[52:53], s[4:5], 0x24
 ; GFX9-DPP-NEXT:    v_cvt_f64_u32_e32 v[3:4], s0
 ; GFX9-DPP-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
 ; GFX9-DPP-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
 ; GFX9-DPP-NEXT:    s_mov_b32 s33, s10
 ; GFX9-DPP-NEXT:    v_mul_f64 v[41:42], v[3:4], 4.0
 ; GFX9-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-DPP-NEXT:    s_load_dwordx2 s[0:1], s[44:45], 0x0
-; GFX9-DPP-NEXT:    s_mov_b32 s42, s9
-; GFX9-DPP-NEXT:    s_mov_b32 s43, s8
+; GFX9-DPP-NEXT:    s_load_dwordx2 s[0:1], s[52:53], 0x0
+; GFX9-DPP-NEXT:    s_mov_b32 s50, s9
+; GFX9-DPP-NEXT:    s_mov_b32 s51, s8
 ; GFX9-DPP-NEXT:    s_mov_b64 s[34:35], s[6:7]
 ; GFX9-DPP-NEXT:    s_mov_b64 s[36:37], s[4:5]
 ; GFX9-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-DPP-NEXT:    v_mov_b32_e32 v4, s1
-; GFX9-DPP-NEXT:    s_mov_b64 s[46:47], 0
+; GFX9-DPP-NEXT:    s_mov_b64 s[54:55], 0
 ; GFX9-DPP-NEXT:    v_mov_b32_e32 v3, s0
 ; GFX9-DPP-NEXT:    v_or3_b32 v40, v0, v1, v2
 ; GFX9-DPP-NEXT:  .LBB9_2: ; %atomicrmw.start
@@ -6659,68 +6659,68 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent
 ; GFX9-DPP-NEXT:    s_add_u32 s0, s0, __atomic_compare_exchange at gotpcrel32@lo+4
 ; GFX9-DPP-NEXT:    s_addc_u32 s1, s1, __atomic_compare_exchange at gotpcrel32@hi+12
 ; GFX9-DPP-NEXT:    s_load_dwordx2 s[16:17], s[0:1], 0x0
-; GFX9-DPP-NEXT:    s_mov_b64 s[0:1], s[48:49]
-; GFX9-DPP-NEXT:    buffer_store_dword v4, off, s[48:51], 0 offset:4
-; GFX9-DPP-NEXT:    buffer_store_dword v3, off, s[48:51], 0
-; GFX9-DPP-NEXT:    s_mov_b64 s[4:5], s[40:41]
-; GFX9-DPP-NEXT:    buffer_store_dword v1, off, s[48:51], 0 offset:12
-; GFX9-DPP-NEXT:    buffer_store_dword v0, off, s[48:51], 0 offset:8
+; GFX9-DPP-NEXT:    s_mov_b64 s[0:1], s[64:65]
+; GFX9-DPP-NEXT:    buffer_store_dword v4, off, s[64:67], 0 offset:4
+; GFX9-DPP-NEXT:    buffer_store_dword v3, off, s[64:67], 0
+; GFX9-DPP-NEXT:    s_mov_b64 s[4:5], s[48:49]
+; GFX9-DPP-NEXT:    buffer_store_dword v1, off, s[64:67], 0 offset:12
+; GFX9-DPP-NEXT:    buffer_store_dword v0, off, s[64:67], 0 offset:8
 ; GFX9-DPP-NEXT:    s_mov_b64 s[6:7], s[38:39]
 ; GFX9-DPP-NEXT:    s_mov_b64 s[10:11], s[34:35]
-; GFX9-DPP-NEXT:    s_mov_b32 s12, s43
-; GFX9-DPP-NEXT:    s_mov_b32 s13, s42
+; GFX9-DPP-NEXT:    s_mov_b32 s12, s51
+; GFX9-DPP-NEXT:    s_mov_b32 s13, s50
 ; GFX9-DPP-NEXT:    s_mov_b32 s14, s33
 ; GFX9-DPP-NEXT:    v_mov_b32_e32 v31, v40
-; GFX9-DPP-NEXT:    s_mov_b64 s[2:3], s[50:51]
+; GFX9-DPP-NEXT:    s_mov_b64 s[2:3], s[66:67]
 ; GFX9-DPP-NEXT:    v_mov_b32_e32 v0, 8
 ; GFX9-DPP-NEXT:    v_mov_b32_e32 v1, 0
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v2, s44
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v3, s45
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v2, s52
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v3, s53
 ; GFX9-DPP-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX9-DPP-NEXT:    v_mov_b32_e32 v5, 8
 ; GFX9-DPP-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX9-DPP-NEXT:    v_mov_b32_e32 v7, 0
 ; GFX9-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
-; GFX9-DPP-NEXT:    buffer_load_dword v3, off, s[48:51], 0
-; GFX9-DPP-NEXT:    buffer_load_dword v4, off, s[48:51], 0 offset:4
+; GFX9-DPP-NEXT:    buffer_load_dword v3, off, s[64:67], 0
+; GFX9-DPP-NEXT:    buffer_load_dword v4, off, s[64:67], 0 offset:4
 ; GFX9-DPP-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX9-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX9-DPP-NEXT:    s_or_b64 s[46:47], vcc, s[46:47]
-; GFX9-DPP-NEXT:    s_andn2_b64 exec, exec, s[46:47]
+; GFX9-DPP-NEXT:    s_or_b64 s[54:55], vcc, s[54:55]
+; GFX9-DPP-NEXT:    s_andn2_b64 exec, exec, s[54:55]
 ; GFX9-DPP-NEXT:    s_cbranch_execnz .LBB9_2
 ; GFX9-DPP-NEXT:  .LBB9_3:
 ; GFX9-DPP-NEXT:    s_endpgm
 ;
 ; GFX1064-DPP-LABEL: global_atomic_fsub_double_uni_address_uni_value_agent_scope_unsafe:
 ; GFX1064-DPP:       ; %bb.0:
-; GFX1064-DPP-NEXT:    s_mov_b32 s48, SCRATCH_RSRC_DWORD0
-; GFX1064-DPP-NEXT:    s_mov_b32 s49, SCRATCH_RSRC_DWORD1
-; GFX1064-DPP-NEXT:    s_mov_b32 s50, -1
-; GFX1064-DPP-NEXT:    s_mov_b32 s51, 0x31e16000
-; GFX1064-DPP-NEXT:    s_add_u32 s48, s48, s11
+; GFX1064-DPP-NEXT:    s_mov_b32 s64, SCRATCH_RSRC_DWORD0
+; GFX1064-DPP-NEXT:    s_mov_b32 s65, SCRATCH_RSRC_DWORD1
+; GFX1064-DPP-NEXT:    s_mov_b32 s66, -1
+; GFX1064-DPP-NEXT:    s_mov_b32 s67, 0x31e16000
+; GFX1064-DPP-NEXT:    s_add_u32 s64, s64, s11
 ; GFX1064-DPP-NEXT:    s_mov_b32 s33, s10
 ; GFX1064-DPP-NEXT:    s_mov_b64 s[10:11], exec
-; GFX1064-DPP-NEXT:    s_addc_u32 s49, s49, 0
+; GFX1064-DPP-NEXT:    s_addc_u32 s65, s65, 0
 ; GFX1064-DPP-NEXT:    v_mbcnt_lo_u32_b32 v3, s10, 0
-; GFX1064-DPP-NEXT:    s_mov_b64 s[40:41], s[0:1]
+; GFX1064-DPP-NEXT:    s_mov_b64 s[48:49], s[0:1]
 ; GFX1064-DPP-NEXT:    s_movk_i32 s32, 0x800
 ; GFX1064-DPP-NEXT:    v_mbcnt_hi_u32_b32 v3, s11, v3
 ; GFX1064-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
 ; GFX1064-DPP-NEXT:    s_and_saveexec_b64 s[0:1], vcc
 ; GFX1064-DPP-NEXT:    s_cbranch_execz .LBB9_3
 ; GFX1064-DPP-NEXT:  ; %bb.1:
-; GFX1064-DPP-NEXT:    s_load_dwordx2 s[44:45], s[4:5], 0x24
+; GFX1064-DPP-NEXT:    s_load_dwordx2 s[52:53], s[4:5], 0x24
 ; GFX1064-DPP-NEXT:    s_bcnt1_i32_b64 s0, s[10:11]
-; GFX1064-DPP-NEXT:    s_mov_b32 s42, s9
+; GFX1064-DPP-NEXT:    s_mov_b32 s50, s9
 ; GFX1064-DPP-NEXT:    v_cvt_f64_u32_e32 v[3:4], s0
-; GFX1064-DPP-NEXT:    s_mov_b32 s43, s8
+; GFX1064-DPP-NEXT:    s_mov_b32 s51, s8
 ; GFX1064-DPP-NEXT:    s_mov_b64 s[34:35], s[6:7]
 ; GFX1064-DPP-NEXT:    s_mov_b64 s[36:37], s[4:5]
 ; GFX1064-DPP-NEXT:    s_mov_b64 s[38:39], s[2:3]
-; GFX1064-DPP-NEXT:    s_mov_b64 s[46:47], 0
+; GFX1064-DPP-NEXT:    s_mov_b64 s[54:55], 0
 ; GFX1064-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1064-DPP-NEXT:    s_load_dwordx2 s[0:1], s[44:45], 0x0
+; GFX1064-DPP-NEXT:    s_load_dwordx2 s[0:1], s[52:53], 0x0
 ; GFX1064-DPP-NEXT:    v_mul_f64 v[41:42], v[3:4], 4.0
 ; GFX1064-DPP-NEXT:    v_lshlrev_b32_e32 v3, 20, v2
 ; GFX1064-DPP-NEXT:    v_lshlrev_b32_e32 v4, 10, v1
@@ -6737,69 +6737,69 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent
 ; GFX1064-DPP-NEXT:    s_getpc_b64 s[0:1]
 ; GFX1064-DPP-NEXT:    s_add_u32 s0, s0, __atomic_compare_exchange at gotpcrel32@lo+4
 ; GFX1064-DPP-NEXT:    s_addc_u32 s1, s1, __atomic_compare_exchange at gotpcrel32@hi+12
-; GFX1064-DPP-NEXT:    buffer_store_dword v2, off, s[48:51], 0 offset:4
-; GFX1064-DPP-NEXT:    buffer_store_dword v1, off, s[48:51], 0
+; GFX1064-DPP-NEXT:    buffer_store_dword v2, off, s[64:67], 0 offset:4
+; GFX1064-DPP-NEXT:    buffer_store_dword v1, off, s[64:67], 0
 ; GFX1064-DPP-NEXT:    s_load_dwordx2 s[16:17], s[0:1], 0x0
 ; GFX1064-DPP-NEXT:    v_mov_b32_e32 v31, v40
 ; GFX1064-DPP-NEXT:    v_mov_b32_e32 v0, 8
 ; GFX1064-DPP-NEXT:    v_mov_b32_e32 v1, 0
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v2, s44
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v2, s52
 ; GFX1064-DPP-NEXT:    v_mov_b32_e32 v5, 8
 ; GFX1064-DPP-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX1064-DPP-NEXT:    v_mov_b32_e32 v7, 0
-; GFX1064-DPP-NEXT:    s_mov_b64 s[0:1], s[48:49]
-; GFX1064-DPP-NEXT:    s_mov_b64 s[4:5], s[40:41]
+; GFX1064-DPP-NEXT:    s_mov_b64 s[0:1], s[64:65]
+; GFX1064-DPP-NEXT:    s_mov_b64 s[4:5], s[48:49]
 ; GFX1064-DPP-NEXT:    s_mov_b64 s[6:7], s[38:39]
 ; GFX1064-DPP-NEXT:    s_mov_b64 s[10:11], s[34:35]
-; GFX1064-DPP-NEXT:    s_mov_b32 s12, s43
-; GFX1064-DPP-NEXT:    s_mov_b32 s13, s42
+; GFX1064-DPP-NEXT:    s_mov_b32 s12, s51
+; GFX1064-DPP-NEXT:    s_mov_b32 s13, s50
 ; GFX1064-DPP-NEXT:    s_mov_b32 s14, s33
-; GFX1064-DPP-NEXT:    s_mov_b64 s[2:3], s[50:51]
-; GFX1064-DPP-NEXT:    buffer_store_dword v4, off, s[48:51], 0 offset:12
-; GFX1064-DPP-NEXT:    buffer_store_dword v3, off, s[48:51], 0 offset:8
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v3, s45
+; GFX1064-DPP-NEXT:    s_mov_b64 s[2:3], s[66:67]
+; GFX1064-DPP-NEXT:    buffer_store_dword v4, off, s[64:67], 0 offset:12
+; GFX1064-DPP-NEXT:    buffer_store_dword v3, off, s[64:67], 0 offset:8
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v3, s53
 ; GFX1064-DPP-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX1064-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1064-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GFX1064-DPP-NEXT:    s_clause 0x1
-; GFX1064-DPP-NEXT:    buffer_load_dword v1, off, s[48:51], 0
-; GFX1064-DPP-NEXT:    buffer_load_dword v2, off, s[48:51], 0 offset:4
+; GFX1064-DPP-NEXT:    buffer_load_dword v1, off, s[64:67], 0
+; GFX1064-DPP-NEXT:    buffer_load_dword v2, off, s[64:67], 0 offset:4
 ; GFX1064-DPP-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX1064-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX1064-DPP-NEXT:    s_or_b64 s[46:47], vcc, s[46:47]
-; GFX1064-DPP-NEXT:    s_andn2_b64 exec, exec, s[46:47]
+; GFX1064-DPP-NEXT:    s_or_b64 s[54:55], vcc, s[54:55]
+; GFX1064-DPP-NEXT:    s_andn2_b64 exec, exec, s[54:55]
 ; GFX1064-DPP-NEXT:    s_cbranch_execnz .LBB9_2
 ; GFX1064-DPP-NEXT:  .LBB9_3:
 ; GFX1064-DPP-NEXT:    s_endpgm
 ;
 ; GFX1032-DPP-LABEL: global_atomic_fsub_double_uni_address_uni_value_agent_scope_unsafe:
 ; GFX1032-DPP:       ; %bb.0:
-; GFX1032-DPP-NEXT:    s_mov_b32 s42, s9
+; GFX1032-DPP-NEXT:    s_mov_b32 s50, s9
 ; GFX1032-DPP-NEXT:    s_mov_b32 s9, exec_lo
-; GFX1032-DPP-NEXT:    s_mov_b32 s48, SCRATCH_RSRC_DWORD0
+; GFX1032-DPP-NEXT:    s_mov_b32 s64, SCRATCH_RSRC_DWORD0
 ; GFX1032-DPP-NEXT:    v_mbcnt_lo_u32_b32 v3, s9, 0
-; GFX1032-DPP-NEXT:    s_mov_b32 s49, SCRATCH_RSRC_DWORD1
-; GFX1032-DPP-NEXT:    s_mov_b32 s50, -1
-; GFX1032-DPP-NEXT:    s_mov_b32 s51, 0x31c16000
-; GFX1032-DPP-NEXT:    s_add_u32 s48, s48, s11
+; GFX1032-DPP-NEXT:    s_mov_b32 s65, SCRATCH_RSRC_DWORD1
+; GFX1032-DPP-NEXT:    s_mov_b32 s66, -1
+; GFX1032-DPP-NEXT:    s_mov_b32 s67, 0x31c16000
+; GFX1032-DPP-NEXT:    s_add_u32 s64, s64, s11
 ; GFX1032-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v3
-; GFX1032-DPP-NEXT:    s_addc_u32 s49, s49, 0
-; GFX1032-DPP-NEXT:    s_mov_b64 s[40:41], s[0:1]
-; GFX1032-DPP-NEXT:    s_mov_b32 s46, 0
+; GFX1032-DPP-NEXT:    s_addc_u32 s65, s65, 0
+; GFX1032-DPP-NEXT:    s_mov_b64 s[48:49], s[0:1]
+; GFX1032-DPP-NEXT:    s_mov_b32 s54, 0
 ; GFX1032-DPP-NEXT:    s_movk_i32 s32, 0x400
 ; GFX1032-DPP-NEXT:    s_and_saveexec_b32 s0, vcc_lo
 ; GFX1032-DPP-NEXT:    s_cbranch_execz .LBB9_3
 ; GFX1032-DPP-NEXT:  ; %bb.1:
-; GFX1032-DPP-NEXT:    s_load_dwordx2 s[44:45], s[4:5], 0x24
+; GFX1032-DPP-NEXT:    s_load_dwordx2 s[52:53], s[4:5], 0x24
 ; GFX1032-DPP-NEXT:    s_bcnt1_i32_b32 s0, s9
 ; GFX1032-DPP-NEXT:    s_mov_b32 s33, s10
 ; GFX1032-DPP-NEXT:    v_cvt_f64_u32_e32 v[3:4], s0
-; GFX1032-DPP-NEXT:    s_mov_b32 s43, s8
+; GFX1032-DPP-NEXT:    s_mov_b32 s51, s8
 ; GFX1032-DPP-NEXT:    s_mov_b64 s[34:35], s[6:7]
 ; GFX1032-DPP-NEXT:    s_mov_b64 s[36:37], s[4:5]
 ; GFX1032-DPP-NEXT:    s_mov_b64 s[38:39], s[2:3]
 ; GFX1032-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT:    s_load_dwordx2 s[0:1], s[44:45], 0x0
+; GFX1032-DPP-NEXT:    s_load_dwordx2 s[0:1], s[52:53], 0x0
 ; GFX1032-DPP-NEXT:    v_mul_f64 v[41:42], v[3:4], 4.0
 ; GFX1032-DPP-NEXT:    v_lshlrev_b32_e32 v3, 20, v2
 ; GFX1032-DPP-NEXT:    v_lshlrev_b32_e32 v4, 10, v1
@@ -6816,37 +6816,37 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent
 ; GFX1032-DPP-NEXT:    s_getpc_b64 s[0:1]
 ; GFX1032-DPP-NEXT:    s_add_u32 s0, s0, __atomic_compare_exchange at gotpcrel32@lo+4
 ; GFX1032-DPP-NEXT:    s_addc_u32 s1, s1, __atomic_compare_exchange at gotpcrel32@hi+12
-; GFX1032-DPP-NEXT:    buffer_store_dword v2, off, s[48:51], 0 offset:4
-; GFX1032-DPP-NEXT:    buffer_store_dword v1, off, s[48:51], 0
+; GFX1032-DPP-NEXT:    buffer_store_dword v2, off, s[64:67], 0 offset:4
+; GFX1032-DPP-NEXT:    buffer_store_dword v1, off, s[64:67], 0
 ; GFX1032-DPP-NEXT:    s_load_dwordx2 s[16:17], s[0:1], 0x0
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v31, v40
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v0, 8
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v1, 0
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v2, s44
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v2, s52
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v5, 8
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v7, 0
-; GFX1032-DPP-NEXT:    s_mov_b64 s[0:1], s[48:49]
-; GFX1032-DPP-NEXT:    s_mov_b64 s[4:5], s[40:41]
+; GFX1032-DPP-NEXT:    s_mov_b64 s[0:1], s[64:65]
+; GFX1032-DPP-NEXT:    s_mov_b64 s[4:5], s[48:49]
 ; GFX1032-DPP-NEXT:    s_mov_b64 s[6:7], s[38:39]
 ; GFX1032-DPP-NEXT:    s_mov_b64 s[10:11], s[34:35]
-; GFX1032-DPP-NEXT:    s_mov_b32 s12, s43
-; GFX1032-DPP-NEXT:    s_mov_b32 s13, s42
+; GFX1032-DPP-NEXT:    s_mov_b32 s12, s51
+; GFX1032-DPP-NEXT:    s_mov_b32 s13, s50
 ; GFX1032-DPP-NEXT:    s_mov_b32 s14, s33
-; GFX1032-DPP-NEXT:    s_mov_b64 s[2:3], s[50:51]
-; GFX1032-DPP-NEXT:    buffer_store_dword v4, off, s[48:51], 0 offset:12
-; GFX1032-DPP-NEXT:    buffer_store_dword v3, off, s[48:51], 0 offset:8
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v3, s45
+; GFX1032-DPP-NEXT:    s_mov_b64 s[2:3], s[66:67]
+; GFX1032-DPP-NEXT:    buffer_store_dword v4, off, s[64:67], 0 offset:12
+; GFX1032-DPP-NEXT:    buffer_store_dword v3, off, s[64:67], 0 offset:8
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v3, s53
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX1032-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1032-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GFX1032-DPP-NEXT:    s_clause 0x1
-; GFX1032-DPP-NEXT:    buffer_load_dword v1, off, s[48:51], 0
-; GFX1032-DPP-NEXT:    buffer_load_dword v2, off, s[48:51], 0 offset:4
+; GFX1032-DPP-NEXT:    buffer_load_dword v1, off, s[64:67], 0
+; GFX1032-DPP-NEXT:    buffer_load_dword v2, off, s[64:67], 0 offset:4
 ; GFX1032-DPP-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX1032-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX1032-DPP-NEXT:    s_or_b32 s46, vcc_lo, s46
-; GFX1032-DPP-NEXT:    s_andn2_b32 exec_lo, exec_lo, s46
+; GFX1032-DPP-NEXT:    s_or_b32 s54, vcc_lo, s54
+; GFX1032-DPP-NEXT:    s_andn2_b32 exec_lo, exec_lo, s54
 ; GFX1032-DPP-NEXT:    s_cbranch_execnz .LBB9_2
 ; GFX1032-DPP-NEXT:  .LBB9_3:
 ; GFX1032-DPP-NEXT:    s_endpgm
@@ -6857,7 +6857,7 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent
 ; GFX1164-DPP-NEXT:    s_mov_b64 s[10:11], exec
 ; GFX1164-DPP-NEXT:    v_mov_b32_e32 v40, v0
 ; GFX1164-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, s10, 0
-; GFX1164-DPP-NEXT:    s_mov_b64 s[40:41], s[0:1]
+; GFX1164-DPP-NEXT:    s_mov_b64 s[48:49], s[0:1]
 ; GFX1164-DPP-NEXT:    s_mov_b32 s32, 32
 ; GFX1164-DPP-NEXT:    s_mov_b64 s[0:1], exec
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
@@ -6866,16 +6866,16 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent
 ; GFX1164-DPP-NEXT:    s_cbranch_execz .LBB9_3
 ; GFX1164-DPP-NEXT:  ; %bb.1:
 ; GFX1164-DPP-NEXT:    s_bcnt1_i32_b64 s0, s[10:11]
-; GFX1164-DPP-NEXT:    s_load_b64 s[44:45], s[4:5], 0x24
+; GFX1164-DPP-NEXT:    s_load_b64 s[52:53], s[4:5], 0x24
 ; GFX1164-DPP-NEXT:    v_cvt_f64_u32_e32 v[0:1], s0
-; GFX1164-DPP-NEXT:    s_mov_b32 s42, s9
-; GFX1164-DPP-NEXT:    s_mov_b32 s43, s8
+; GFX1164-DPP-NEXT:    s_mov_b32 s50, s9
+; GFX1164-DPP-NEXT:    s_mov_b32 s51, s8
 ; GFX1164-DPP-NEXT:    s_mov_b64 s[34:35], s[6:7]
 ; GFX1164-DPP-NEXT:    s_mov_b64 s[36:37], s[4:5]
 ; GFX1164-DPP-NEXT:    s_mov_b64 s[38:39], s[2:3]
-; GFX1164-DPP-NEXT:    s_mov_b64 s[46:47], 0
+; GFX1164-DPP-NEXT:    s_mov_b64 s[54:55], 0
 ; GFX1164-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT:    s_load_b64 s[0:1], s[44:45], 0x0
+; GFX1164-DPP-NEXT:    s_load_b64 s[0:1], s[52:53], 0x0
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1164-DPP-NEXT:    v_mul_f64 v[41:42], v[0:1], 4.0
 ; GFX1164-DPP-NEXT:    s_waitcnt lgkmcnt(0)
@@ -6898,18 +6898,18 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent
 ; GFX1164-DPP-NEXT:    v_mov_b32_e32 v31, v40
 ; GFX1164-DPP-NEXT:    v_mov_b32_e32 v0, 8
 ; GFX1164-DPP-NEXT:    v_mov_b32_e32 v1, 0
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v2, s44
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v2, s52
 ; GFX1164-DPP-NEXT:    v_mov_b32_e32 v5, 8
 ; GFX1164-DPP-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX1164-DPP-NEXT:    v_mov_b32_e32 v7, 0
-; GFX1164-DPP-NEXT:    s_mov_b64 s[4:5], s[40:41]
+; GFX1164-DPP-NEXT:    s_mov_b64 s[4:5], s[48:49]
 ; GFX1164-DPP-NEXT:    s_mov_b64 s[6:7], s[38:39]
 ; GFX1164-DPP-NEXT:    s_mov_b64 s[10:11], s[34:35]
-; GFX1164-DPP-NEXT:    s_mov_b32 s12, s43
-; GFX1164-DPP-NEXT:    s_mov_b32 s13, s42
+; GFX1164-DPP-NEXT:    s_mov_b32 s12, s51
+; GFX1164-DPP-NEXT:    s_mov_b32 s13, s50
 ; GFX1164-DPP-NEXT:    s_mov_b32 s14, s33
 ; GFX1164-DPP-NEXT:    scratch_store_b64 off, v[3:4], off offset:8
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v3, s45
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v3, s53
 ; GFX1164-DPP-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX1164-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1164-DPP-NEXT:    s_swappc_b64 s[30:31], s[0:1]
@@ -6917,8 +6917,8 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent
 ; GFX1164-DPP-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
 ; GFX1164-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX1164-DPP-NEXT:    s_or_b64 s[46:47], vcc, s[46:47]
-; GFX1164-DPP-NEXT:    s_and_not1_b64 exec, exec, s[46:47]
+; GFX1164-DPP-NEXT:    s_or_b64 s[54:55], vcc, s[54:55]
+; GFX1164-DPP-NEXT:    s_and_not1_b64 exec, exec, s[54:55]
 ; GFX1164-DPP-NEXT:    s_cbranch_execnz .LBB9_2
 ; GFX1164-DPP-NEXT:  .LBB9_3:
 ; GFX1164-DPP-NEXT:    s_set_inst_prefetch_distance 0x2
@@ -6929,24 +6929,24 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent
 ; GFX1132-DPP-NEXT:    s_mov_b32 s8, exec_lo
 ; GFX1132-DPP-NEXT:    v_mov_b32_e32 v40, v0
 ; GFX1132-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, s8, 0
-; GFX1132-DPP-NEXT:    s_mov_b64 s[40:41], s[0:1]
-; GFX1132-DPP-NEXT:    s_mov_b32 s46, 0
+; GFX1132-DPP-NEXT:    s_mov_b64 s[48:49], s[0:1]
+; GFX1132-DPP-NEXT:    s_mov_b32 s54, 0
 ; GFX1132-DPP-NEXT:    s_mov_b32 s32, 32
 ; GFX1132-DPP-NEXT:    s_mov_b32 s0, exec_lo
 ; GFX1132-DPP-NEXT:    v_cmpx_eq_u32_e32 0, v0
 ; GFX1132-DPP-NEXT:    s_cbranch_execz .LBB9_3
 ; GFX1132-DPP-NEXT:  ; %bb.1:
 ; GFX1132-DPP-NEXT:    s_bcnt1_i32_b32 s0, s8
-; GFX1132-DPP-NEXT:    s_load_b64 s[44:45], s[4:5], 0x24
+; GFX1132-DPP-NEXT:    s_load_b64 s[52:53], s[4:5], 0x24
 ; GFX1132-DPP-NEXT:    v_cvt_f64_u32_e32 v[0:1], s0
 ; GFX1132-DPP-NEXT:    s_mov_b32 s33, s15
-; GFX1132-DPP-NEXT:    s_mov_b32 s42, s14
-; GFX1132-DPP-NEXT:    s_mov_b32 s43, s13
+; GFX1132-DPP-NEXT:    s_mov_b32 s50, s14
+; GFX1132-DPP-NEXT:    s_mov_b32 s51, s13
 ; GFX1132-DPP-NEXT:    s_mov_b64 s[34:35], s[6:7]
 ; GFX1132-DPP-NEXT:    s_mov_b64 s[36:37], s[4:5]
 ; GFX1132-DPP-NEXT:    s_mov_b64 s[38:39], s[2:3]
 ; GFX1132-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT:    s_load_b64 s[0:1], s[44:45], 0x0
+; GFX1132-DPP-NEXT:    s_load_b64 s[0:1], s[52:53], 0x0
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1132-DPP-NEXT:    v_mul_f64 v[41:42], v[0:1], 4.0
 ; GFX1132-DPP-NEXT:    s_waitcnt lgkmcnt(0)
@@ -6966,25 +6966,25 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent
 ; GFX1132-DPP-NEXT:    scratch_store_b64 off, v[1:2], off
 ; GFX1132-DPP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
 ; GFX1132-DPP-NEXT:    v_dual_mov_b32 v31, v40 :: v_dual_mov_b32 v0, 8
-; GFX1132-DPP-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s44
+; GFX1132-DPP-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s52
 ; GFX1132-DPP-NEXT:    v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0
 ; GFX1132-DPP-NEXT:    v_mov_b32_e32 v7, 0
-; GFX1132-DPP-NEXT:    s_mov_b64 s[4:5], s[40:41]
+; GFX1132-DPP-NEXT:    s_mov_b64 s[4:5], s[48:49]
 ; GFX1132-DPP-NEXT:    s_mov_b64 s[6:7], s[38:39]
 ; GFX1132-DPP-NEXT:    s_mov_b64 s[10:11], s[34:35]
-; GFX1132-DPP-NEXT:    s_mov_b32 s12, s43
-; GFX1132-DPP-NEXT:    s_mov_b32 s13, s42
+; GFX1132-DPP-NEXT:    s_mov_b32 s12, s51
+; GFX1132-DPP-NEXT:    s_mov_b32 s13, s50
 ; GFX1132-DPP-NEXT:    s_mov_b32 s14, s33
 ; GFX1132-DPP-NEXT:    scratch_store_b64 off, v[3:4], off offset:8
-; GFX1132-DPP-NEXT:    v_dual_mov_b32 v3, s45 :: v_dual_mov_b32 v4, 0
+; GFX1132-DPP-NEXT:    v_dual_mov_b32 v3, s53 :: v_dual_mov_b32 v4, 0
 ; GFX1132-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132-DPP-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX1132-DPP-NEXT:    scratch_load_b64 v[1:2], off, off
 ; GFX1132-DPP-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
 ; GFX1132-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX1132-DPP-NEXT:    s_or_b32 s46, vcc_lo, s46
-; GFX1132-DPP-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s46
+; GFX1132-DPP-NEXT:    s_or_b32 s54, vcc_lo, s54
+; GFX1132-DPP-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s54
 ; GFX1132-DPP-NEXT:    s_cbranch_execnz .LBB9_2
 ; GFX1132-DPP-NEXT:  .LBB9_3:
 ; GFX1132-DPP-NEXT:    s_set_inst_prefetch_distance 0x2
@@ -6997,19 +6997,19 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
 ; GFX7LESS-LABEL: global_atomic_fsub_double_uni_address_div_value_agent_scope_align4_unsafe:
 ; GFX7LESS:       ; %bb.0:
 ; GFX7LESS-NEXT:    s_movk_i32 s32, 0x800
-; GFX7LESS-NEXT:    s_mov_b32 s48, SCRATCH_RSRC_DWORD0
-; GFX7LESS-NEXT:    s_mov_b32 s49, SCRATCH_RSRC_DWORD1
-; GFX7LESS-NEXT:    s_mov_b32 s50, -1
-; GFX7LESS-NEXT:    s_mov_b32 s51, 0xe8f000
-; GFX7LESS-NEXT:    s_add_u32 s48, s48, s11
-; GFX7LESS-NEXT:    s_addc_u32 s49, s49, 0
+; GFX7LESS-NEXT:    s_mov_b32 s64, SCRATCH_RSRC_DWORD0
+; GFX7LESS-NEXT:    s_mov_b32 s65, SCRATCH_RSRC_DWORD1
+; GFX7LESS-NEXT:    s_mov_b32 s66, -1
+; GFX7LESS-NEXT:    s_mov_b32 s67, 0xe8f000
+; GFX7LESS-NEXT:    s_add_u32 s64, s64, s11
+; GFX7LESS-NEXT:    s_addc_u32 s65, s65, 0
 ; GFX7LESS-NEXT:    s_mov_b32 s33, s10
-; GFX7LESS-NEXT:    s_mov_b32 s42, s9
-; GFX7LESS-NEXT:    s_mov_b32 s43, s8
+; GFX7LESS-NEXT:    s_mov_b32 s50, s9
+; GFX7LESS-NEXT:    s_mov_b32 s51, s8
 ; GFX7LESS-NEXT:    s_mov_b64 s[34:35], s[6:7]
 ; GFX7LESS-NEXT:    s_mov_b64 s[36:37], s[4:5]
 ; GFX7LESS-NEXT:    s_mov_b64 s[38:39], s[2:3]
-; GFX7LESS-NEXT:    s_mov_b64 s[40:41], s[0:1]
+; GFX7LESS-NEXT:    s_mov_b64 s[48:49], s[0:1]
 ; GFX7LESS-NEXT:    s_add_u32 s8, s36, 44
 ; GFX7LESS-NEXT:    s_addc_u32 s9, s37, 0
 ; GFX7LESS-NEXT:    s_getpc_b64 s[0:1]
@@ -7020,15 +7020,15 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
 ; GFX7LESS-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
 ; GFX7LESS-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GFX7LESS-NEXT:    v_or_b32_e32 v40, v0, v2
-; GFX7LESS-NEXT:    s_mov_b64 s[4:5], s[40:41]
+; GFX7LESS-NEXT:    s_mov_b64 s[4:5], s[48:49]
 ; GFX7LESS-NEXT:    s_mov_b64 s[6:7], s[2:3]
 ; GFX7LESS-NEXT:    s_mov_b64 s[10:11], s[34:35]
-; GFX7LESS-NEXT:    s_mov_b32 s12, s43
-; GFX7LESS-NEXT:    s_mov_b32 s13, s42
+; GFX7LESS-NEXT:    s_mov_b32 s12, s51
+; GFX7LESS-NEXT:    s_mov_b32 s13, s50
 ; GFX7LESS-NEXT:    s_mov_b32 s14, s33
 ; GFX7LESS-NEXT:    v_mov_b32_e32 v31, v40
-; GFX7LESS-NEXT:    s_mov_b64 s[0:1], s[48:49]
-; GFX7LESS-NEXT:    s_mov_b64 s[2:3], s[50:51]
+; GFX7LESS-NEXT:    s_mov_b64 s[0:1], s[64:65]
+; GFX7LESS-NEXT:    s_mov_b64 s[2:3], s[66:67]
 ; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7LESS-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GFX7LESS-NEXT:    s_mov_b64 s[0:1], exec
@@ -7053,21 +7053,21 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
 ; GFX7LESS-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
 ; GFX7LESS-NEXT:    s_cbranch_execz .LBB10_5
 ; GFX7LESS-NEXT:  ; %bb.3:
-; GFX7LESS-NEXT:    s_load_dwordx2 s[44:45], s[36:37], 0x9
-; GFX7LESS-NEXT:    s_mov_b32 s47, 0xf000
-; GFX7LESS-NEXT:    s_mov_b32 s46, -1
+; GFX7LESS-NEXT:    s_load_dwordx2 s[52:53], s[36:37], 0x9
+; GFX7LESS-NEXT:    s_mov_b32 s55, 0xf000
+; GFX7LESS-NEXT:    s_mov_b32 s54, -1
 ; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7LESS-NEXT:    buffer_load_dwordx2 v[0:1], off, s[44:47], 0
-; GFX7LESS-NEXT:    s_mov_b64 s[46:47], 0
+; GFX7LESS-NEXT:    buffer_load_dwordx2 v[0:1], off, s[52:55], 0
+; GFX7LESS-NEXT:    s_mov_b64 s[54:55], 0
 ; GFX7LESS-NEXT:  .LBB10_4: ; %atomicrmw.start
 ; GFX7LESS-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX7LESS-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7LESS-NEXT:    v_add_f64 v[2:3], v[0:1], -v[41:42]
-; GFX7LESS-NEXT:    buffer_store_dword v1, off, s[48:51], 0 offset:4
-; GFX7LESS-NEXT:    buffer_store_dword v0, off, s[48:51], 0
+; GFX7LESS-NEXT:    buffer_store_dword v1, off, s[64:67], 0 offset:4
+; GFX7LESS-NEXT:    buffer_store_dword v0, off, s[64:67], 0
 ; GFX7LESS-NEXT:    s_add_u32 s8, s36, 44
-; GFX7LESS-NEXT:    buffer_store_dword v3, off, s[48:51], 0 offset:12
-; GFX7LESS-NEXT:    buffer_store_dword v2, off, s[48:51], 0 offset:8
+; GFX7LESS-NEXT:    buffer_store_dword v3, off, s[64:67], 0 offset:12
+; GFX7LESS-NEXT:    buffer_store_dword v2, off, s[64:67], 0 offset:8
 ; GFX7LESS-NEXT:    s_addc_u32 s9, s37, 0
 ; GFX7LESS-NEXT:    s_getpc_b64 s[0:1]
 ; GFX7LESS-NEXT:    s_add_u32 s0, s0, __atomic_compare_exchange at gotpcrel32@lo+4
@@ -7080,44 +7080,44 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
 ; GFX7LESS-NEXT:    v_mov_b32_e32 v5, 8
 ; GFX7LESS-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX7LESS-NEXT:    v_mov_b32_e32 v7, 0
-; GFX7LESS-NEXT:    s_mov_b64 s[4:5], s[40:41]
+; GFX7LESS-NEXT:    s_mov_b64 s[4:5], s[48:49]
 ; GFX7LESS-NEXT:    s_mov_b64 s[6:7], s[38:39]
 ; GFX7LESS-NEXT:    s_mov_b64 s[10:11], s[34:35]
-; GFX7LESS-NEXT:    s_mov_b32 s12, s43
-; GFX7LESS-NEXT:    s_mov_b32 s13, s42
+; GFX7LESS-NEXT:    s_mov_b32 s12, s51
+; GFX7LESS-NEXT:    s_mov_b32 s13, s50
 ; GFX7LESS-NEXT:    s_mov_b32 s14, s33
 ; GFX7LESS-NEXT:    v_mov_b32_e32 v31, v40
-; GFX7LESS-NEXT:    s_mov_b64 s[0:1], s[48:49]
-; GFX7LESS-NEXT:    s_mov_b64 s[2:3], s[50:51]
+; GFX7LESS-NEXT:    s_mov_b64 s[0:1], s[64:65]
+; GFX7LESS-NEXT:    s_mov_b64 s[2:3], s[66:67]
 ; GFX7LESS-NEXT:    s_waitcnt expcnt(0)
-; GFX7LESS-NEXT:    v_mov_b32_e32 v2, s44
-; GFX7LESS-NEXT:    v_mov_b32_e32 v3, s45
+; GFX7LESS-NEXT:    v_mov_b32_e32 v2, s52
+; GFX7LESS-NEXT:    v_mov_b32_e32 v3, s53
 ; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7LESS-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GFX7LESS-NEXT:    v_and_b32_e32 v2, 1, v0
-; GFX7LESS-NEXT:    buffer_load_dword v0, off, s[48:51], 0
-; GFX7LESS-NEXT:    buffer_load_dword v1, off, s[48:51], 0 offset:4
+; GFX7LESS-NEXT:    buffer_load_dword v0, off, s[64:67], 0
+; GFX7LESS-NEXT:    buffer_load_dword v1, off, s[64:67], 0 offset:4
 ; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v2
-; GFX7LESS-NEXT:    s_or_b64 s[46:47], vcc, s[46:47]
-; GFX7LESS-NEXT:    s_andn2_b64 exec, exec, s[46:47]
+; GFX7LESS-NEXT:    s_or_b64 s[54:55], vcc, s[54:55]
+; GFX7LESS-NEXT:    s_andn2_b64 exec, exec, s[54:55]
 ; GFX7LESS-NEXT:    s_cbranch_execnz .LBB10_4
 ; GFX7LESS-NEXT:  .LBB10_5:
 ; GFX7LESS-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: global_atomic_fsub_double_uni_address_div_value_agent_scope_align4_unsafe:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_mov_b32 s48, SCRATCH_RSRC_DWORD0
-; GFX9-NEXT:    s_mov_b32 s49, SCRATCH_RSRC_DWORD1
-; GFX9-NEXT:    s_mov_b32 s50, -1
-; GFX9-NEXT:    s_mov_b32 s51, 0xe00000
-; GFX9-NEXT:    s_add_u32 s48, s48, s11
-; GFX9-NEXT:    s_addc_u32 s49, s49, 0
+; GFX9-NEXT:    s_mov_b32 s64, SCRATCH_RSRC_DWORD0
+; GFX9-NEXT:    s_mov_b32 s65, SCRATCH_RSRC_DWORD1
+; GFX9-NEXT:    s_mov_b32 s66, -1
+; GFX9-NEXT:    s_mov_b32 s67, 0xe00000
+; GFX9-NEXT:    s_add_u32 s64, s64, s11
+; GFX9-NEXT:    s_addc_u32 s65, s65, 0
 ; GFX9-NEXT:    s_mov_b64 s[36:37], s[4:5]
-; GFX9-NEXT:    s_mov_b32 s43, s8
+; GFX9-NEXT:    s_mov_b32 s51, s8
 ; GFX9-NEXT:    s_add_u32 s8, s36, 44
-; GFX9-NEXT:    s_mov_b32 s42, s9
+; GFX9-NEXT:    s_mov_b32 s50, s9
 ; GFX9-NEXT:    s_addc_u32 s9, s37, 0
-; GFX9-NEXT:    s_mov_b64 s[40:41], s[0:1]
+; GFX9-NEXT:    s_mov_b64 s[48:49], s[0:1]
 ; GFX9-NEXT:    s_getpc_b64 s[0:1]
 ; GFX9-NEXT:    s_add_u32 s0, s0, div.float.value at gotpcrel32@lo+4
 ; GFX9-NEXT:    s_addc_u32 s1, s1, div.float.value at gotpcrel32@hi+12
@@ -7129,14 +7129,14 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
 ; GFX9-NEXT:    s_mov_b64 s[38:39], s[2:3]
 ; GFX9-NEXT:    v_or3_b32 v40, v0, v1, v2
 ; GFX9-NEXT:    s_mov_b64 s[6:7], s[2:3]
-; GFX9-NEXT:    s_mov_b64 s[0:1], s[48:49]
-; GFX9-NEXT:    s_mov_b64 s[4:5], s[40:41]
+; GFX9-NEXT:    s_mov_b64 s[0:1], s[64:65]
+; GFX9-NEXT:    s_mov_b64 s[4:5], s[48:49]
 ; GFX9-NEXT:    s_mov_b64 s[10:11], s[34:35]
-; GFX9-NEXT:    s_mov_b32 s12, s43
-; GFX9-NEXT:    s_mov_b32 s13, s42
+; GFX9-NEXT:    s_mov_b32 s12, s51
+; GFX9-NEXT:    s_mov_b32 s13, s50
 ; GFX9-NEXT:    s_mov_b32 s14, s33
 ; GFX9-NEXT:    v_mov_b32_e32 v31, v40
-; GFX9-NEXT:    s_mov_b64 s[2:3], s[50:51]
+; GFX9-NEXT:    s_mov_b64 s[2:3], s[66:67]
 ; GFX9-NEXT:    s_movk_i32 s32, 0x800
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[16:17]
@@ -7161,11 +7161,11 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
 ; GFX9-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
 ; GFX9-NEXT:    s_cbranch_execz .LBB10_5
 ; GFX9-NEXT:  ; %bb.3:
-; GFX9-NEXT:    s_load_dwordx2 s[44:45], s[36:37], 0x24
+; GFX9-NEXT:    s_load_dwordx2 s[52:53], s[36:37], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
-; GFX9-NEXT:    s_mov_b64 s[46:47], 0
+; GFX9-NEXT:    s_mov_b64 s[54:55], 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dwordx2 v[1:2], v0, s[44:45]
+; GFX9-NEXT:    global_load_dwordx2 v[1:2], v0, s[52:53]
 ; GFX9-NEXT:  .LBB10_4: ; %atomicrmw.start
 ; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
@@ -7176,53 +7176,53 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
 ; GFX9-NEXT:    s_add_u32 s0, s0, __atomic_compare_exchange at gotpcrel32@lo+4
 ; GFX9-NEXT:    s_addc_u32 s1, s1, __atomic_compare_exchange at gotpcrel32@hi+12
 ; GFX9-NEXT:    s_load_dwordx2 s[16:17], s[0:1], 0x0
-; GFX9-NEXT:    s_mov_b64 s[0:1], s[48:49]
-; GFX9-NEXT:    buffer_store_dword v2, off, s[48:51], 0 offset:4
-; GFX9-NEXT:    buffer_store_dword v1, off, s[48:51], 0
-; GFX9-NEXT:    s_mov_b64 s[4:5], s[40:41]
-; GFX9-NEXT:    buffer_store_dword v4, off, s[48:51], 0 offset:12
-; GFX9-NEXT:    buffer_store_dword v3, off, s[48:51], 0 offset:8
+; GFX9-NEXT:    s_mov_b64 s[0:1], s[64:65]
+; GFX9-NEXT:    buffer_store_dword v2, off, s[64:67], 0 offset:4
+; GFX9-NEXT:    buffer_store_dword v1, off, s[64:67], 0
+; GFX9-NEXT:    s_mov_b64 s[4:5], s[48:49]
+; GFX9-NEXT:    buffer_store_dword v4, off, s[64:67], 0 offset:12
+; GFX9-NEXT:    buffer_store_dword v3, off, s[64:67], 0 offset:8
 ; GFX9-NEXT:    s_mov_b64 s[6:7], s[38:39]
 ; GFX9-NEXT:    s_mov_b64 s[10:11], s[34:35]
-; GFX9-NEXT:    s_mov_b32 s12, s43
-; GFX9-NEXT:    s_mov_b32 s13, s42
+; GFX9-NEXT:    s_mov_b32 s12, s51
+; GFX9-NEXT:    s_mov_b32 s13, s50
 ; GFX9-NEXT:    s_mov_b32 s14, s33
 ; GFX9-NEXT:    v_mov_b32_e32 v31, v40
-; GFX9-NEXT:    s_mov_b64 s[2:3], s[50:51]
+; GFX9-NEXT:    s_mov_b64 s[2:3], s[66:67]
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 8
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0
-; GFX9-NEXT:    v_mov_b32_e32 v2, s44
-; GFX9-NEXT:    v_mov_b32_e32 v3, s45
+; GFX9-NEXT:    v_mov_b32_e32 v2, s52
+; GFX9-NEXT:    v_mov_b32_e32 v3, s53
 ; GFX9-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v5, 8
 ; GFX9-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v7, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[16:17]
-; GFX9-NEXT:    buffer_load_dword v1, off, s[48:51], 0
-; GFX9-NEXT:    buffer_load_dword v2, off, s[48:51], 0 offset:4
+; GFX9-NEXT:    buffer_load_dword v1, off, s[64:67], 0
+; GFX9-NEXT:    buffer_load_dword v2, off, s[64:67], 0 offset:4
 ; GFX9-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX9-NEXT:    s_or_b64 s[46:47], vcc, s[46:47]
-; GFX9-NEXT:    s_andn2_b64 exec, exec, s[46:47]
+; GFX9-NEXT:    s_or_b64 s[54:55], vcc, s[54:55]
+; GFX9-NEXT:    s_andn2_b64 exec, exec, s[54:55]
 ; GFX9-NEXT:    s_cbranch_execnz .LBB10_4
 ; GFX9-NEXT:  .LBB10_5:
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX1064-LABEL: global_atomic_fsub_double_uni_address_div_value_agent_scope_align4_unsafe:
 ; GFX1064:       ; %bb.0:
-; GFX1064-NEXT:    s_mov_b32 s48, SCRATCH_RSRC_DWORD0
-; GFX1064-NEXT:    s_mov_b32 s49, SCRATCH_RSRC_DWORD1
-; GFX1064-NEXT:    s_mov_b32 s50, -1
-; GFX1064-NEXT:    s_mov_b32 s51, 0x31e16000
-; GFX1064-NEXT:    s_add_u32 s48, s48, s11
+; GFX1064-NEXT:    s_mov_b32 s64, SCRATCH_RSRC_DWORD0
+; GFX1064-NEXT:    s_mov_b32 s65, SCRATCH_RSRC_DWORD1
+; GFX1064-NEXT:    s_mov_b32 s66, -1
+; GFX1064-NEXT:    s_mov_b32 s67, 0x31e16000
+; GFX1064-NEXT:    s_add_u32 s64, s64, s11
 ; GFX1064-NEXT:    s_mov_b64 s[34:35], s[4:5]
-; GFX1064-NEXT:    s_addc_u32 s49, s49, 0
-; GFX1064-NEXT:    s_mov_b32 s43, s8
+; GFX1064-NEXT:    s_addc_u32 s65, s65, 0
+; GFX1064-NEXT:    s_mov_b32 s51, s8
 ; GFX1064-NEXT:    s_add_u32 s8, s34, 44
-; GFX1064-NEXT:    s_mov_b32 s42, s9
+; GFX1064-NEXT:    s_mov_b32 s50, s9
 ; GFX1064-NEXT:    s_addc_u32 s9, s35, 0
-; GFX1064-NEXT:    s_mov_b64 s[40:41], s[0:1]
+; GFX1064-NEXT:    s_mov_b64 s[48:49], s[0:1]
 ; GFX1064-NEXT:    s_getpc_b64 s[0:1]
 ; GFX1064-NEXT:    s_add_u32 s0, s0, div.float.value at gotpcrel32@lo+4
 ; GFX1064-NEXT:    s_addc_u32 s1, s1, div.float.value at gotpcrel32@hi+12
@@ -7234,14 +7234,14 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
 ; GFX1064-NEXT:    s_mov_b64 s[38:39], s[2:3]
 ; GFX1064-NEXT:    s_mov_b64 s[6:7], s[2:3]
 ; GFX1064-NEXT:    v_or3_b32 v40, v0, v1, v2
-; GFX1064-NEXT:    s_mov_b64 s[0:1], s[48:49]
-; GFX1064-NEXT:    s_mov_b64 s[4:5], s[40:41]
+; GFX1064-NEXT:    s_mov_b64 s[0:1], s[64:65]
+; GFX1064-NEXT:    s_mov_b64 s[4:5], s[48:49]
 ; GFX1064-NEXT:    s_mov_b64 s[10:11], s[36:37]
-; GFX1064-NEXT:    s_mov_b32 s12, s43
+; GFX1064-NEXT:    s_mov_b32 s12, s51
 ; GFX1064-NEXT:    v_mov_b32_e32 v31, v40
-; GFX1064-NEXT:    s_mov_b32 s13, s42
+; GFX1064-NEXT:    s_mov_b32 s13, s50
 ; GFX1064-NEXT:    s_mov_b32 s14, s33
-; GFX1064-NEXT:    s_mov_b64 s[2:3], s[50:51]
+; GFX1064-NEXT:    s_mov_b64 s[2:3], s[66:67]
 ; GFX1064-NEXT:    s_movk_i32 s32, 0x800
 ; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1064-NEXT:    s_swappc_b64 s[30:31], s[16:17]
@@ -7266,11 +7266,11 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
 ; GFX1064-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
 ; GFX1064-NEXT:    s_cbranch_execz .LBB10_5
 ; GFX1064-NEXT:  ; %bb.3:
-; GFX1064-NEXT:    s_load_dwordx2 s[44:45], s[34:35], 0x24
+; GFX1064-NEXT:    s_load_dwordx2 s[52:53], s[34:35], 0x24
 ; GFX1064-NEXT:    v_mov_b32_e32 v0, 0
-; GFX1064-NEXT:    s_mov_b64 s[46:47], 0
+; GFX1064-NEXT:    s_mov_b64 s[54:55], 0
 ; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT:    global_load_dwordx2 v[1:2], v0, s[44:45]
+; GFX1064-NEXT:    global_load_dwordx2 v[1:2], v0, s[52:53]
 ; GFX1064-NEXT:  .LBB10_4: ; %atomicrmw.start
 ; GFX1064-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1064-NEXT:    s_waitcnt vmcnt(0)
@@ -7280,55 +7280,55 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
 ; GFX1064-NEXT:    s_getpc_b64 s[0:1]
 ; GFX1064-NEXT:    s_add_u32 s0, s0, __atomic_compare_exchange at gotpcrel32@lo+4
 ; GFX1064-NEXT:    s_addc_u32 s1, s1, __atomic_compare_exchange at gotpcrel32@hi+12
-; GFX1064-NEXT:    buffer_store_dword v2, off, s[48:51], 0 offset:4
-; GFX1064-NEXT:    buffer_store_dword v1, off, s[48:51], 0
+; GFX1064-NEXT:    buffer_store_dword v2, off, s[64:67], 0 offset:4
+; GFX1064-NEXT:    buffer_store_dword v1, off, s[64:67], 0
 ; GFX1064-NEXT:    s_load_dwordx2 s[16:17], s[0:1], 0x0
 ; GFX1064-NEXT:    v_mov_b32_e32 v31, v40
 ; GFX1064-NEXT:    v_mov_b32_e32 v0, 8
 ; GFX1064-NEXT:    v_mov_b32_e32 v1, 0
-; GFX1064-NEXT:    v_mov_b32_e32 v2, s44
+; GFX1064-NEXT:    v_mov_b32_e32 v2, s52
 ; GFX1064-NEXT:    v_mov_b32_e32 v5, 8
 ; GFX1064-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX1064-NEXT:    v_mov_b32_e32 v7, 0
-; GFX1064-NEXT:    s_mov_b64 s[0:1], s[48:49]
-; GFX1064-NEXT:    s_mov_b64 s[4:5], s[40:41]
+; GFX1064-NEXT:    s_mov_b64 s[0:1], s[64:65]
+; GFX1064-NEXT:    s_mov_b64 s[4:5], s[48:49]
 ; GFX1064-NEXT:    s_mov_b64 s[6:7], s[38:39]
 ; GFX1064-NEXT:    s_mov_b64 s[10:11], s[36:37]
-; GFX1064-NEXT:    s_mov_b32 s12, s43
-; GFX1064-NEXT:    s_mov_b32 s13, s42
+; GFX1064-NEXT:    s_mov_b32 s12, s51
+; GFX1064-NEXT:    s_mov_b32 s13, s50
 ; GFX1064-NEXT:    s_mov_b32 s14, s33
-; GFX1064-NEXT:    s_mov_b64 s[2:3], s[50:51]
-; GFX1064-NEXT:    buffer_store_dword v4, off, s[48:51], 0 offset:12
-; GFX1064-NEXT:    buffer_store_dword v3, off, s[48:51], 0 offset:8
-; GFX1064-NEXT:    v_mov_b32_e32 v3, s45
+; GFX1064-NEXT:    s_mov_b64 s[2:3], s[66:67]
+; GFX1064-NEXT:    buffer_store_dword v4, off, s[64:67], 0 offset:12
+; GFX1064-NEXT:    buffer_store_dword v3, off, s[64:67], 0 offset:8
+; GFX1064-NEXT:    v_mov_b32_e32 v3, s53
 ; GFX1064-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1064-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GFX1064-NEXT:    s_clause 0x1
-; GFX1064-NEXT:    buffer_load_dword v1, off, s[48:51], 0
-; GFX1064-NEXT:    buffer_load_dword v2, off, s[48:51], 0 offset:4
+; GFX1064-NEXT:    buffer_load_dword v1, off, s[64:67], 0
+; GFX1064-NEXT:    buffer_load_dword v2, off, s[64:67], 0 offset:4
 ; GFX1064-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX1064-NEXT:    s_or_b64 s[46:47], vcc, s[46:47]
-; GFX1064-NEXT:    s_andn2_b64 exec, exec, s[46:47]
+; GFX1064-NEXT:    s_or_b64 s[54:55], vcc, s[54:55]
+; GFX1064-NEXT:    s_andn2_b64 exec, exec, s[54:55]
 ; GFX1064-NEXT:    s_cbranch_execnz .LBB10_4
 ; GFX1064-NEXT:  .LBB10_5:
 ; GFX1064-NEXT:    s_endpgm
 ;
 ; GFX1032-LABEL: global_atomic_fsub_double_uni_address_div_value_agent_scope_align4_unsafe:
 ; GFX1032:       ; %bb.0:
-; GFX1032-NEXT:    s_mov_b32 s48, SCRATCH_RSRC_DWORD0
-; GFX1032-NEXT:    s_mov_b32 s49, SCRATCH_RSRC_DWORD1
-; GFX1032-NEXT:    s_mov_b32 s50, -1
-; GFX1032-NEXT:    s_mov_b32 s51, 0x31c16000
-; GFX1032-NEXT:    s_add_u32 s48, s48, s11
+; GFX1032-NEXT:    s_mov_b32 s64, SCRATCH_RSRC_DWORD0
+; GFX1032-NEXT:    s_mov_b32 s65, SCRATCH_RSRC_DWORD1
+; GFX1032-NEXT:    s_mov_b32 s66, -1
+; GFX1032-NEXT:    s_mov_b32 s67, 0x31c16000
+; GFX1032-NEXT:    s_add_u32 s64, s64, s11
 ; GFX1032-NEXT:    s_mov_b64 s[34:35], s[4:5]
-; GFX1032-NEXT:    s_addc_u32 s49, s49, 0
-; GFX1032-NEXT:    s_mov_b32 s43, s8
+; GFX1032-NEXT:    s_addc_u32 s65, s65, 0
+; GFX1032-NEXT:    s_mov_b32 s51, s8
 ; GFX1032-NEXT:    s_add_u32 s8, s34, 44
-; GFX1032-NEXT:    s_mov_b32 s42, s9
+; GFX1032-NEXT:    s_mov_b32 s50, s9
 ; GFX1032-NEXT:    s_addc_u32 s9, s35, 0
-; GFX1032-NEXT:    s_mov_b64 s[40:41], s[0:1]
+; GFX1032-NEXT:    s_mov_b64 s[48:49], s[0:1]
 ; GFX1032-NEXT:    s_getpc_b64 s[0:1]
 ; GFX1032-NEXT:    s_add_u32 s0, s0, div.float.value at gotpcrel32@lo+4
 ; GFX1032-NEXT:    s_addc_u32 s1, s1, div.float.value at gotpcrel32@hi+12
@@ -7340,14 +7340,14 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
 ; GFX1032-NEXT:    s_mov_b64 s[38:39], s[2:3]
 ; GFX1032-NEXT:    s_mov_b64 s[6:7], s[2:3]
 ; GFX1032-NEXT:    v_or3_b32 v40, v0, v1, v2
-; GFX1032-NEXT:    s_mov_b64 s[0:1], s[48:49]
-; GFX1032-NEXT:    s_mov_b64 s[4:5], s[40:41]
+; GFX1032-NEXT:    s_mov_b64 s[0:1], s[64:65]
+; GFX1032-NEXT:    s_mov_b64 s[4:5], s[48:49]
 ; GFX1032-NEXT:    s_mov_b64 s[10:11], s[36:37]
-; GFX1032-NEXT:    s_mov_b32 s12, s43
+; GFX1032-NEXT:    s_mov_b32 s12, s51
 ; GFX1032-NEXT:    v_mov_b32_e32 v31, v40
-; GFX1032-NEXT:    s_mov_b32 s13, s42
+; GFX1032-NEXT:    s_mov_b32 s13, s50
 ; GFX1032-NEXT:    s_mov_b32 s14, s33
-; GFX1032-NEXT:    s_mov_b64 s[2:3], s[50:51]
+; GFX1032-NEXT:    s_mov_b64 s[2:3], s[66:67]
 ; GFX1032-NEXT:    s_movk_i32 s32, 0x400
 ; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1032-NEXT:    s_swappc_b64 s[30:31], s[16:17]
@@ -7366,16 +7366,16 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
 ; GFX1032-NEXT:    s_cbranch_scc1 .LBB10_1
 ; GFX1032-NEXT:  ; %bb.2: ; %ComputeEnd
 ; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1032-NEXT:    s_mov_b32 s46, 0
+; GFX1032-NEXT:    s_mov_b32 s54, 0
 ; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX1032-NEXT:    s_and_saveexec_b32 s0, vcc_lo
 ; GFX1032-NEXT:    s_xor_b32 s0, exec_lo, s0
 ; GFX1032-NEXT:    s_cbranch_execz .LBB10_5
 ; GFX1032-NEXT:  ; %bb.3:
-; GFX1032-NEXT:    s_load_dwordx2 s[44:45], s[34:35], 0x24
+; GFX1032-NEXT:    s_load_dwordx2 s[52:53], s[34:35], 0x24
 ; GFX1032-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT:    global_load_dwordx2 v[1:2], v0, s[44:45]
+; GFX1032-NEXT:    global_load_dwordx2 v[1:2], v0, s[52:53]
 ; GFX1032-NEXT:  .LBB10_4: ; %atomicrmw.start
 ; GFX1032-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1032-NEXT:    s_waitcnt vmcnt(0)
@@ -7385,37 +7385,37 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
 ; GFX1032-NEXT:    s_getpc_b64 s[0:1]
 ; GFX1032-NEXT:    s_add_u32 s0, s0, __atomic_compare_exchange at gotpcrel32@lo+4
 ; GFX1032-NEXT:    s_addc_u32 s1, s1, __atomic_compare_exchange at gotpcrel32@hi+12
-; GFX1032-NEXT:    buffer_store_dword v2, off, s[48:51], 0 offset:4
-; GFX1032-NEXT:    buffer_store_dword v1, off, s[48:51], 0
+; GFX1032-NEXT:    buffer_store_dword v2, off, s[64:67], 0 offset:4
+; GFX1032-NEXT:    buffer_store_dword v1, off, s[64:67], 0
 ; GFX1032-NEXT:    s_load_dwordx2 s[16:17], s[0:1], 0x0
 ; GFX1032-NEXT:    v_mov_b32_e32 v31, v40
 ; GFX1032-NEXT:    v_mov_b32_e32 v0, 8
 ; GFX1032-NEXT:    v_mov_b32_e32 v1, 0
-; GFX1032-NEXT:    v_mov_b32_e32 v2, s44
+; GFX1032-NEXT:    v_mov_b32_e32 v2, s52
 ; GFX1032-NEXT:    v_mov_b32_e32 v5, 8
 ; GFX1032-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX1032-NEXT:    v_mov_b32_e32 v7, 0
-; GFX1032-NEXT:    s_mov_b64 s[0:1], s[48:49]
-; GFX1032-NEXT:    s_mov_b64 s[4:5], s[40:41]
+; GFX1032-NEXT:    s_mov_b64 s[0:1], s[64:65]
+; GFX1032-NEXT:    s_mov_b64 s[4:5], s[48:49]
 ; GFX1032-NEXT:    s_mov_b64 s[6:7], s[38:39]
 ; GFX1032-NEXT:    s_mov_b64 s[10:11], s[36:37]
-; GFX1032-NEXT:    s_mov_b32 s12, s43
-; GFX1032-NEXT:    s_mov_b32 s13, s42
+; GFX1032-NEXT:    s_mov_b32 s12, s51
+; GFX1032-NEXT:    s_mov_b32 s13, s50
 ; GFX1032-NEXT:    s_mov_b32 s14, s33
-; GFX1032-NEXT:    s_mov_b64 s[2:3], s[50:51]
-; GFX1032-NEXT:    buffer_store_dword v4, off, s[48:51], 0 offset:12
-; GFX1032-NEXT:    buffer_store_dword v3, off, s[48:51], 0 offset:8
-; GFX1032-NEXT:    v_mov_b32_e32 v3, s45
+; GFX1032-NEXT:    s_mov_b64 s[2:3], s[66:67]
+; GFX1032-NEXT:    buffer_store_dword v4, off, s[64:67], 0 offset:12
+; GFX1032-NEXT:    buffer_store_dword v3, off, s[64:67], 0 offset:8
+; GFX1032-NEXT:    v_mov_b32_e32 v3, s53
 ; GFX1032-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1032-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GFX1032-NEXT:    s_clause 0x1
-; GFX1032-NEXT:    buffer_load_dword v1, off, s[48:51], 0
-; GFX1032-NEXT:    buffer_load_dword v2, off, s[48:51], 0 offset:4
+; GFX1032-NEXT:    buffer_load_dword v1, off, s[64:67], 0
+; GFX1032-NEXT:    buffer_load_dword v2, off, s[64:67], 0 offset:4
 ; GFX1032-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX1032-NEXT:    s_or_b32 s46, vcc_lo, s46
-; GFX1032-NEXT:    s_andn2_b32 exec_lo, exec_lo, s46
+; GFX1032-NEXT:    s_or_b32 s54, vcc_lo, s54
+; GFX1032-NEXT:    s_andn2_b32 exec_lo, exec_lo, s54
 ; GFX1032-NEXT:    s_cbranch_execnz .LBB10_4
 ; GFX1032-NEXT:  .LBB10_5:
 ; GFX1032-NEXT:    s_endpgm
@@ -7423,11 +7423,11 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
 ; GFX1164-LABEL: global_atomic_fsub_double_uni_address_div_value_agent_scope_align4_unsafe:
 ; GFX1164:       ; %bb.0:
 ; GFX1164-NEXT:    s_mov_b64 s[34:35], s[4:5]
-; GFX1164-NEXT:    s_mov_b32 s43, s8
+; GFX1164-NEXT:    s_mov_b32 s51, s8
 ; GFX1164-NEXT:    s_add_u32 s8, s34, 44
-; GFX1164-NEXT:    s_mov_b32 s42, s9
+; GFX1164-NEXT:    s_mov_b32 s50, s9
 ; GFX1164-NEXT:    s_addc_u32 s9, s35, 0
-; GFX1164-NEXT:    s_mov_b64 s[40:41], s[0:1]
+; GFX1164-NEXT:    s_mov_b64 s[48:49], s[0:1]
 ; GFX1164-NEXT:    s_getpc_b64 s[0:1]
 ; GFX1164-NEXT:    s_add_u32 s0, s0, div.float.value at gotpcrel32@lo+4
 ; GFX1164-NEXT:    s_addc_u32 s1, s1, div.float.value at gotpcrel32@hi+12
@@ -7435,11 +7435,11 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
 ; GFX1164-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
 ; GFX1164-NEXT:    s_mov_b32 s33, s10
 ; GFX1164-NEXT:    s_mov_b64 s[36:37], s[6:7]
-; GFX1164-NEXT:    s_mov_b64 s[4:5], s[40:41]
+; GFX1164-NEXT:    s_mov_b64 s[4:5], s[48:49]
 ; GFX1164-NEXT:    s_mov_b64 s[6:7], s[2:3]
 ; GFX1164-NEXT:    s_mov_b64 s[10:11], s[36:37]
-; GFX1164-NEXT:    s_mov_b32 s12, s43
-; GFX1164-NEXT:    s_mov_b32 s13, s42
+; GFX1164-NEXT:    s_mov_b32 s12, s51
+; GFX1164-NEXT:    s_mov_b32 s13, s50
 ; GFX1164-NEXT:    s_mov_b32 s14, s33
 ; GFX1164-NEXT:    s_mov_b32 s32, 32
 ; GFX1164-NEXT:    v_mov_b32_e32 v40, v0
@@ -7471,11 +7471,11 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
 ; GFX1164-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
 ; GFX1164-NEXT:    s_cbranch_execz .LBB10_5
 ; GFX1164-NEXT:  ; %bb.3:
-; GFX1164-NEXT:    s_load_b64 s[44:45], s[34:35], 0x24
+; GFX1164-NEXT:    s_load_b64 s[52:53], s[34:35], 0x24
 ; GFX1164-NEXT:    v_mov_b32_e32 v0, 0
-; GFX1164-NEXT:    s_mov_b64 s[46:47], 0
+; GFX1164-NEXT:    s_mov_b64 s[54:55], 0
 ; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT:    global_load_b64 v[1:2], v0, s[44:45]
+; GFX1164-NEXT:    global_load_b64 v[1:2], v0, s[52:53]
 ; GFX1164-NEXT:    s_set_inst_prefetch_distance 0x1
 ; GFX1164-NEXT:    .p2align 6
 ; GFX1164-NEXT:  .LBB10_4: ; %atomicrmw.start
@@ -7492,18 +7492,18 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
 ; GFX1164-NEXT:    v_mov_b32_e32 v31, v40
 ; GFX1164-NEXT:    v_mov_b32_e32 v0, 8
 ; GFX1164-NEXT:    v_mov_b32_e32 v1, 0
-; GFX1164-NEXT:    v_mov_b32_e32 v2, s44
+; GFX1164-NEXT:    v_mov_b32_e32 v2, s52
 ; GFX1164-NEXT:    v_mov_b32_e32 v5, 8
 ; GFX1164-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX1164-NEXT:    v_mov_b32_e32 v7, 0
-; GFX1164-NEXT:    s_mov_b64 s[4:5], s[40:41]
+; GFX1164-NEXT:    s_mov_b64 s[4:5], s[48:49]
 ; GFX1164-NEXT:    s_mov_b64 s[6:7], s[38:39]
 ; GFX1164-NEXT:    s_mov_b64 s[10:11], s[36:37]
-; GFX1164-NEXT:    s_mov_b32 s12, s43
-; GFX1164-NEXT:    s_mov_b32 s13, s42
+; GFX1164-NEXT:    s_mov_b32 s12, s51
+; GFX1164-NEXT:    s_mov_b32 s13, s50
 ; GFX1164-NEXT:    s_mov_b32 s14, s33
 ; GFX1164-NEXT:    scratch_store_b64 off, v[3:4], off offset:8
-; GFX1164-NEXT:    v_mov_b32_e32 v3, s45
+; GFX1164-NEXT:    v_mov_b32_e32 v3, s53
 ; GFX1164-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1164-NEXT:    s_swappc_b64 s[30:31], s[0:1]
@@ -7511,8 +7511,8 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
 ; GFX1164-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
 ; GFX1164-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX1164-NEXT:    s_or_b64 s[46:47], vcc, s[46:47]
-; GFX1164-NEXT:    s_and_not1_b64 exec, exec, s[46:47]
+; GFX1164-NEXT:    s_or_b64 s[54:55], vcc, s[54:55]
+; GFX1164-NEXT:    s_and_not1_b64 exec, exec, s[54:55]
 ; GFX1164-NEXT:    s_cbranch_execnz .LBB10_4
 ; GFX1164-NEXT:  .LBB10_5:
 ; GFX1164-NEXT:    s_set_inst_prefetch_distance 0x2
@@ -7521,7 +7521,7 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
 ; GFX1132-LABEL: global_atomic_fsub_double_uni_address_div_value_agent_scope_align4_unsafe:
 ; GFX1132:       ; %bb.0:
 ; GFX1132-NEXT:    s_mov_b64 s[34:35], s[4:5]
-; GFX1132-NEXT:    s_mov_b64 s[40:41], s[0:1]
+; GFX1132-NEXT:    s_mov_b64 s[48:49], s[0:1]
 ; GFX1132-NEXT:    s_add_u32 s8, s34, 44
 ; GFX1132-NEXT:    s_addc_u32 s9, s35, 0
 ; GFX1132-NEXT:    s_getpc_b64 s[0:1]
@@ -7530,9 +7530,9 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
 ; GFX1132-NEXT:    v_mov_b32_e32 v31, v0
 ; GFX1132-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
 ; GFX1132-NEXT:    s_mov_b64 s[36:37], s[6:7]
-; GFX1132-NEXT:    s_mov_b32 s42, s14
-; GFX1132-NEXT:    s_mov_b32 s43, s13
-; GFX1132-NEXT:    s_mov_b64 s[4:5], s[40:41]
+; GFX1132-NEXT:    s_mov_b32 s50, s14
+; GFX1132-NEXT:    s_mov_b32 s51, s13
+; GFX1132-NEXT:    s_mov_b64 s[4:5], s[48:49]
 ; GFX1132-NEXT:    s_mov_b64 s[6:7], s[2:3]
 ; GFX1132-NEXT:    s_mov_b64 s[10:11], s[36:37]
 ; GFX1132-NEXT:    s_mov_b32 s12, s13
@@ -7561,17 +7561,17 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
 ; GFX1132-NEXT:    s_cbranch_scc1 .LBB10_1
 ; GFX1132-NEXT:  ; %bb.2: ; %ComputeEnd
 ; GFX1132-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1132-NEXT:    s_mov_b32 s46, 0
+; GFX1132-NEXT:    s_mov_b32 s54, 0
 ; GFX1132-NEXT:    s_mov_b32 s0, exec_lo
 ; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1132-NEXT:    v_cmpx_eq_u32_e32 0, v0
 ; GFX1132-NEXT:    s_xor_b32 s0, exec_lo, s0
 ; GFX1132-NEXT:    s_cbranch_execz .LBB10_5
 ; GFX1132-NEXT:  ; %bb.3:
-; GFX1132-NEXT:    s_load_b64 s[44:45], s[34:35], 0x24
+; GFX1132-NEXT:    s_load_b64 s[52:53], s[34:35], 0x24
 ; GFX1132-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT:    global_load_b64 v[1:2], v0, s[44:45]
+; GFX1132-NEXT:    global_load_b64 v[1:2], v0, s[52:53]
 ; GFX1132-NEXT:    s_set_inst_prefetch_distance 0x1
 ; GFX1132-NEXT:    .p2align 6
 ; GFX1132-NEXT:  .LBB10_4: ; %atomicrmw.start
@@ -7586,25 +7586,25 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
 ; GFX1132-NEXT:    scratch_store_b64 off, v[1:2], off
 ; GFX1132-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
 ; GFX1132-NEXT:    v_dual_mov_b32 v31, v40 :: v_dual_mov_b32 v0, 8
-; GFX1132-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s44
+; GFX1132-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s52
 ; GFX1132-NEXT:    v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0
 ; GFX1132-NEXT:    v_mov_b32_e32 v7, 0
-; GFX1132-NEXT:    s_mov_b64 s[4:5], s[40:41]
+; GFX1132-NEXT:    s_mov_b64 s[4:5], s[48:49]
 ; GFX1132-NEXT:    s_mov_b64 s[6:7], s[38:39]
 ; GFX1132-NEXT:    s_mov_b64 s[10:11], s[36:37]
-; GFX1132-NEXT:    s_mov_b32 s12, s43
-; GFX1132-NEXT:    s_mov_b32 s13, s42
+; GFX1132-NEXT:    s_mov_b32 s12, s51
+; GFX1132-NEXT:    s_mov_b32 s13, s50
 ; GFX1132-NEXT:    s_mov_b32 s14, s33
 ; GFX1132-NEXT:    scratch_store_b64 off, v[3:4], off offset:8
-; GFX1132-NEXT:    v_dual_mov_b32 v3, s45 :: v_dual_mov_b32 v4, 0
+; GFX1132-NEXT:    v_dual_mov_b32 v3, s53 :: v_dual_mov_b32 v4, 0
 ; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX1132-NEXT:    scratch_load_b64 v[1:2], off, off
 ; GFX1132-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
 ; GFX1132-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX1132-NEXT:    s_or_b32 s46, vcc_lo, s46
-; GFX1132-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s46
+; GFX1132-NEXT:    s_or_b32 s54, vcc_lo, s54
+; GFX1132-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s54
 ; GFX1132-NEXT:    s_cbranch_execnz .LBB10_4
 ; GFX1132-NEXT:  .LBB10_5:
 ; GFX1132-NEXT:    s_set_inst_prefetch_distance 0x2
@@ -7613,22 +7613,22 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
 ; GFX7LESS-DPP-LABEL: global_atomic_fsub_double_uni_address_div_value_agent_scope_align4_unsafe:
 ; GFX7LESS-DPP:       ; %bb.0:
 ; GFX7LESS-DPP-NEXT:    s_movk_i32 s32, 0x800
-; GFX7LESS-DPP-NEXT:    s_mov_b32 s48, SCRATCH_RSRC_DWORD0
-; GFX7LESS-DPP-NEXT:    s_mov_b32 s49, SCRATCH_RSRC_DWORD1
-; GFX7LESS-DPP-NEXT:    s_mov_b32 s50, -1
-; GFX7LESS-DPP-NEXT:    s_mov_b32 s51, 0xe8f000
-; GFX7LESS-DPP-NEXT:    s_add_u32 s48, s48, s11
-; GFX7LESS-DPP-NEXT:    s_addc_u32 s49, s49, 0
+; GFX7LESS-DPP-NEXT:    s_mov_b32 s64, SCRATCH_RSRC_DWORD0
+; GFX7LESS-DPP-NEXT:    s_mov_b32 s65, SCRATCH_RSRC_DWORD1
+; GFX7LESS-DPP-NEXT:    s_mov_b32 s66, -1
+; GFX7LESS-DPP-NEXT:    s_mov_b32 s67, 0xe8f000
+; GFX7LESS-DPP-NEXT:    s_add_u32 s64, s64, s11
+; GFX7LESS-DPP-NEXT:    s_addc_u32 s65, s65, 0
 ; GFX7LESS-DPP-NEXT:    s_mov_b32 s33, s10
-; GFX7LESS-DPP-NEXT:    s_mov_b32 s42, s9
-; GFX7LESS-DPP-NEXT:    s_mov_b32 s43, s8
+; GFX7LESS-DPP-NEXT:    s_mov_b32 s50, s9
+; GFX7LESS-DPP-NEXT:    s_mov_b32 s51, s8
 ; GFX7LESS-DPP-NEXT:    s_mov_b64 s[34:35], s[6:7]
 ; GFX7LESS-DPP-NEXT:    s_mov_b64 s[36:37], s[4:5]
 ; GFX7LESS-DPP-NEXT:    s_mov_b64 s[38:39], s[2:3]
-; GFX7LESS-DPP-NEXT:    s_mov_b64 s[40:41], s[0:1]
-; GFX7LESS-DPP-NEXT:    s_load_dwordx2 s[44:45], s[4:5], 0x9
-; GFX7LESS-DPP-NEXT:    s_mov_b32 s47, 0xf000
-; GFX7LESS-DPP-NEXT:    s_mov_b32 s46, -1
+; GFX7LESS-DPP-NEXT:    s_mov_b64 s[48:49], s[0:1]
+; GFX7LESS-DPP-NEXT:    s_load_dwordx2 s[52:53], s[4:5], 0x9
+; GFX7LESS-DPP-NEXT:    s_mov_b32 s55, 0xf000
+; GFX7LESS-DPP-NEXT:    s_mov_b32 s54, -1
 ; GFX7LESS-DPP-NEXT:    s_add_u32 s8, s36, 44
 ; GFX7LESS-DPP-NEXT:    s_addc_u32 s9, s37, 0
 ; GFX7LESS-DPP-NEXT:    s_getpc_b64 s[0:1]
@@ -7639,30 +7639,30 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
 ; GFX7LESS-DPP-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
 ; GFX7LESS-DPP-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GFX7LESS-DPP-NEXT:    v_or_b32_e32 v42, v0, v2
-; GFX7LESS-DPP-NEXT:    s_mov_b64 s[4:5], s[40:41]
+; GFX7LESS-DPP-NEXT:    s_mov_b64 s[4:5], s[48:49]
 ; GFX7LESS-DPP-NEXT:    s_mov_b64 s[6:7], s[2:3]
 ; GFX7LESS-DPP-NEXT:    s_mov_b64 s[10:11], s[34:35]
-; GFX7LESS-DPP-NEXT:    s_mov_b32 s12, s43
-; GFX7LESS-DPP-NEXT:    s_mov_b32 s13, s42
+; GFX7LESS-DPP-NEXT:    s_mov_b32 s12, s51
+; GFX7LESS-DPP-NEXT:    s_mov_b32 s13, s50
 ; GFX7LESS-DPP-NEXT:    s_mov_b32 s14, s33
 ; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v31, v42
-; GFX7LESS-DPP-NEXT:    s_mov_b64 s[0:1], s[48:49]
-; GFX7LESS-DPP-NEXT:    s_mov_b64 s[2:3], s[50:51]
+; GFX7LESS-DPP-NEXT:    s_mov_b64 s[0:1], s[64:65]
+; GFX7LESS-DPP-NEXT:    s_mov_b64 s[2:3], s[66:67]
 ; GFX7LESS-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7LESS-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v40, v0
 ; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v41, v1
-; GFX7LESS-DPP-NEXT:    buffer_load_dwordx2 v[0:1], off, s[44:47], 0
-; GFX7LESS-DPP-NEXT:    s_mov_b64 s[46:47], 0
+; GFX7LESS-DPP-NEXT:    buffer_load_dwordx2 v[0:1], off, s[52:55], 0
+; GFX7LESS-DPP-NEXT:    s_mov_b64 s[54:55], 0
 ; GFX7LESS-DPP-NEXT:  .LBB10_1: ; %atomicrmw.start
 ; GFX7LESS-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX7LESS-DPP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7LESS-DPP-NEXT:    v_add_f64 v[2:3], v[0:1], -v[40:41]
-; GFX7LESS-DPP-NEXT:    buffer_store_dword v1, off, s[48:51], 0 offset:4
-; GFX7LESS-DPP-NEXT:    buffer_store_dword v0, off, s[48:51], 0
+; GFX7LESS-DPP-NEXT:    buffer_store_dword v1, off, s[64:67], 0 offset:4
+; GFX7LESS-DPP-NEXT:    buffer_store_dword v0, off, s[64:67], 0
 ; GFX7LESS-DPP-NEXT:    s_add_u32 s8, s36, 44
-; GFX7LESS-DPP-NEXT:    buffer_store_dword v3, off, s[48:51], 0 offset:12
-; GFX7LESS-DPP-NEXT:    buffer_store_dword v2, off, s[48:51], 0 offset:8
+; GFX7LESS-DPP-NEXT:    buffer_store_dword v3, off, s[64:67], 0 offset:12
+; GFX7LESS-DPP-NEXT:    buffer_store_dword v2, off, s[64:67], 0 offset:8
 ; GFX7LESS-DPP-NEXT:    s_addc_u32 s9, s37, 0
 ; GFX7LESS-DPP-NEXT:    s_getpc_b64 s[0:1]
 ; GFX7LESS-DPP-NEXT:    s_add_u32 s0, s0, __atomic_compare_exchange at gotpcrel32@lo+4
@@ -7675,44 +7675,44 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
 ; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v5, 8
 ; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v7, 0
-; GFX7LESS-DPP-NEXT:    s_mov_b64 s[4:5], s[40:41]
+; GFX7LESS-DPP-NEXT:    s_mov_b64 s[4:5], s[48:49]
 ; GFX7LESS-DPP-NEXT:    s_mov_b64 s[6:7], s[38:39]
 ; GFX7LESS-DPP-NEXT:    s_mov_b64 s[10:11], s[34:35]
-; GFX7LESS-DPP-NEXT:    s_mov_b32 s12, s43
-; GFX7LESS-DPP-NEXT:    s_mov_b32 s13, s42
+; GFX7LESS-DPP-NEXT:    s_mov_b32 s12, s51
+; GFX7LESS-DPP-NEXT:    s_mov_b32 s13, s50
 ; GFX7LESS-DPP-NEXT:    s_mov_b32 s14, s33
 ; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v31, v42
-; GFX7LESS-DPP-NEXT:    s_mov_b64 s[0:1], s[48:49]
-; GFX7LESS-DPP-NEXT:    s_mov_b64 s[2:3], s[50:51]
+; GFX7LESS-DPP-NEXT:    s_mov_b64 s[0:1], s[64:65]
+; GFX7LESS-DPP-NEXT:    s_mov_b64 s[2:3], s[66:67]
 ; GFX7LESS-DPP-NEXT:    s_waitcnt expcnt(0)
-; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v2, s44
-; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v3, s45
+; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v2, s52
+; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v3, s53
 ; GFX7LESS-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7LESS-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GFX7LESS-DPP-NEXT:    v_and_b32_e32 v2, 1, v0
-; GFX7LESS-DPP-NEXT:    buffer_load_dword v0, off, s[48:51], 0
-; GFX7LESS-DPP-NEXT:    buffer_load_dword v1, off, s[48:51], 0 offset:4
+; GFX7LESS-DPP-NEXT:    buffer_load_dword v0, off, s[64:67], 0
+; GFX7LESS-DPP-NEXT:    buffer_load_dword v1, off, s[64:67], 0 offset:4
 ; GFX7LESS-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v2
-; GFX7LESS-DPP-NEXT:    s_or_b64 s[46:47], vcc, s[46:47]
-; GFX7LESS-DPP-NEXT:    s_andn2_b64 exec, exec, s[46:47]
+; GFX7LESS-DPP-NEXT:    s_or_b64 s[54:55], vcc, s[54:55]
+; GFX7LESS-DPP-NEXT:    s_andn2_b64 exec, exec, s[54:55]
 ; GFX7LESS-DPP-NEXT:    s_cbranch_execnz .LBB10_1
 ; GFX7LESS-DPP-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX7LESS-DPP-NEXT:    s_endpgm
 ;
 ; GFX9-DPP-LABEL: global_atomic_fsub_double_uni_address_div_value_agent_scope_align4_unsafe:
 ; GFX9-DPP:       ; %bb.0:
-; GFX9-DPP-NEXT:    s_mov_b32 s52, SCRATCH_RSRC_DWORD0
-; GFX9-DPP-NEXT:    s_mov_b32 s53, SCRATCH_RSRC_DWORD1
-; GFX9-DPP-NEXT:    s_mov_b32 s54, -1
-; GFX9-DPP-NEXT:    s_mov_b32 s55, 0xe00000
-; GFX9-DPP-NEXT:    s_add_u32 s52, s52, s11
-; GFX9-DPP-NEXT:    s_addc_u32 s53, s53, 0
+; GFX9-DPP-NEXT:    s_mov_b32 s68, SCRATCH_RSRC_DWORD0
+; GFX9-DPP-NEXT:    s_mov_b32 s69, SCRATCH_RSRC_DWORD1
+; GFX9-DPP-NEXT:    s_mov_b32 s70, -1
+; GFX9-DPP-NEXT:    s_mov_b32 s71, 0xe00000
+; GFX9-DPP-NEXT:    s_add_u32 s68, s68, s11
+; GFX9-DPP-NEXT:    s_addc_u32 s69, s69, 0
 ; GFX9-DPP-NEXT:    s_mov_b64 s[36:37], s[4:5]
-; GFX9-DPP-NEXT:    s_mov_b32 s43, s8
+; GFX9-DPP-NEXT:    s_mov_b32 s51, s8
 ; GFX9-DPP-NEXT:    s_add_u32 s8, s36, 44
-; GFX9-DPP-NEXT:    s_mov_b32 s42, s9
+; GFX9-DPP-NEXT:    s_mov_b32 s50, s9
 ; GFX9-DPP-NEXT:    s_addc_u32 s9, s37, 0
-; GFX9-DPP-NEXT:    s_mov_b64 s[40:41], s[0:1]
+; GFX9-DPP-NEXT:    s_mov_b64 s[48:49], s[0:1]
 ; GFX9-DPP-NEXT:    s_getpc_b64 s[0:1]
 ; GFX9-DPP-NEXT:    s_add_u32 s0, s0, div.float.value at gotpcrel32@lo+4
 ; GFX9-DPP-NEXT:    s_addc_u32 s1, s1, div.float.value at gotpcrel32@hi+12
@@ -7724,14 +7724,14 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
 ; GFX9-DPP-NEXT:    s_mov_b64 s[38:39], s[2:3]
 ; GFX9-DPP-NEXT:    v_or3_b32 v40, v0, v1, v2
 ; GFX9-DPP-NEXT:    s_mov_b64 s[6:7], s[2:3]
-; GFX9-DPP-NEXT:    s_mov_b64 s[0:1], s[52:53]
-; GFX9-DPP-NEXT:    s_mov_b64 s[4:5], s[40:41]
+; GFX9-DPP-NEXT:    s_mov_b64 s[0:1], s[68:69]
+; GFX9-DPP-NEXT:    s_mov_b64 s[4:5], s[48:49]
 ; GFX9-DPP-NEXT:    s_mov_b64 s[10:11], s[34:35]
-; GFX9-DPP-NEXT:    s_mov_b32 s12, s43
-; GFX9-DPP-NEXT:    s_mov_b32 s13, s42
+; GFX9-DPP-NEXT:    s_mov_b32 s12, s51
+; GFX9-DPP-NEXT:    s_mov_b32 s13, s50
 ; GFX9-DPP-NEXT:    s_mov_b32 s14, s33
 ; GFX9-DPP-NEXT:    v_mov_b32_e32 v31, v40
-; GFX9-DPP-NEXT:    s_mov_b64 s[2:3], s[54:55]
+; GFX9-DPP-NEXT:    s_mov_b64 s[2:3], s[70:71]
 ; GFX9-DPP-NEXT:    s_movk_i32 s32, 0x800
 ; GFX9-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
@@ -7778,74 +7778,74 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
 ; GFX9-DPP-NEXT:    v_mbcnt_lo_u32_b32 v1, exec_lo, 0
 ; GFX9-DPP-NEXT:    v_mbcnt_hi_u32_b32 v1, exec_hi, v1
 ; GFX9-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
-; GFX9-DPP-NEXT:    v_readlane_b32 s45, v9, 63
-; GFX9-DPP-NEXT:    v_readlane_b32 s44, v8, 63
+; GFX9-DPP-NEXT:    v_readlane_b32 s53, v9, 63
+; GFX9-DPP-NEXT:    v_readlane_b32 s52, v8, 63
 ; GFX9-DPP-NEXT:    s_mov_b64 exec, s[0:1]
 ; GFX9-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
 ; GFX9-DPP-NEXT:    s_and_saveexec_b64 s[0:1], vcc
 ; GFX9-DPP-NEXT:    s_cbranch_execz .LBB10_3
 ; GFX9-DPP-NEXT:  ; %bb.1:
-; GFX9-DPP-NEXT:    s_load_dwordx2 s[46:47], s[36:37], 0x24
-; GFX9-DPP-NEXT:    s_mov_b64 s[48:49], 0
+; GFX9-DPP-NEXT:    s_load_dwordx2 s[54:55], s[36:37], 0x24
+; GFX9-DPP-NEXT:    s_mov_b64 s[64:65], 0
 ; GFX9-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-DPP-NEXT:    global_load_dwordx2 v[1:2], v0, s[46:47]
+; GFX9-DPP-NEXT:    global_load_dwordx2 v[1:2], v0, s[54:55]
 ; GFX9-DPP-NEXT:  .LBB10_2: ; %atomicrmw.start
 ; GFX9-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX9-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-DPP-NEXT:    v_add_f64 v[3:4], v[1:2], -s[44:45]
+; GFX9-DPP-NEXT:    v_add_f64 v[3:4], v[1:2], -s[52:53]
 ; GFX9-DPP-NEXT:    s_add_u32 s8, s36, 44
 ; GFX9-DPP-NEXT:    s_addc_u32 s9, s37, 0
 ; GFX9-DPP-NEXT:    s_getpc_b64 s[0:1]
 ; GFX9-DPP-NEXT:    s_add_u32 s0, s0, __atomic_compare_exchange at gotpcrel32@lo+4
 ; GFX9-DPP-NEXT:    s_addc_u32 s1, s1, __atomic_compare_exchange at gotpcrel32@hi+12
 ; GFX9-DPP-NEXT:    s_load_dwordx2 s[16:17], s[0:1], 0x0
-; GFX9-DPP-NEXT:    s_mov_b64 s[0:1], s[52:53]
-; GFX9-DPP-NEXT:    buffer_store_dword v2, off, s[52:55], 0 offset:4
-; GFX9-DPP-NEXT:    buffer_store_dword v1, off, s[52:55], 0
-; GFX9-DPP-NEXT:    s_mov_b64 s[4:5], s[40:41]
-; GFX9-DPP-NEXT:    buffer_store_dword v4, off, s[52:55], 0 offset:12
-; GFX9-DPP-NEXT:    buffer_store_dword v3, off, s[52:55], 0 offset:8
+; GFX9-DPP-NEXT:    s_mov_b64 s[0:1], s[68:69]
+; GFX9-DPP-NEXT:    buffer_store_dword v2, off, s[68:71], 0 offset:4
+; GFX9-DPP-NEXT:    buffer_store_dword v1, off, s[68:71], 0
+; GFX9-DPP-NEXT:    s_mov_b64 s[4:5], s[48:49]
+; GFX9-DPP-NEXT:    buffer_store_dword v4, off, s[68:71], 0 offset:12
+; GFX9-DPP-NEXT:    buffer_store_dword v3, off, s[68:71], 0 offset:8
 ; GFX9-DPP-NEXT:    s_mov_b64 s[6:7], s[38:39]
 ; GFX9-DPP-NEXT:    s_mov_b64 s[10:11], s[34:35]
-; GFX9-DPP-NEXT:    s_mov_b32 s12, s43
-; GFX9-DPP-NEXT:    s_mov_b32 s13, s42
+; GFX9-DPP-NEXT:    s_mov_b32 s12, s51
+; GFX9-DPP-NEXT:    s_mov_b32 s13, s50
 ; GFX9-DPP-NEXT:    s_mov_b32 s14, s33
 ; GFX9-DPP-NEXT:    v_mov_b32_e32 v31, v40
-; GFX9-DPP-NEXT:    s_mov_b64 s[2:3], s[54:55]
+; GFX9-DPP-NEXT:    s_mov_b64 s[2:3], s[70:71]
 ; GFX9-DPP-NEXT:    v_mov_b32_e32 v0, 8
 ; GFX9-DPP-NEXT:    v_mov_b32_e32 v1, 0
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v2, s46
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v3, s47
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v2, s54
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v3, s55
 ; GFX9-DPP-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX9-DPP-NEXT:    v_mov_b32_e32 v5, 8
 ; GFX9-DPP-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX9-DPP-NEXT:    v_mov_b32_e32 v7, 0
 ; GFX9-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
-; GFX9-DPP-NEXT:    buffer_load_dword v1, off, s[52:55], 0
-; GFX9-DPP-NEXT:    buffer_load_dword v2, off, s[52:55], 0 offset:4
+; GFX9-DPP-NEXT:    buffer_load_dword v1, off, s[68:71], 0
+; GFX9-DPP-NEXT:    buffer_load_dword v2, off, s[68:71], 0 offset:4
 ; GFX9-DPP-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX9-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX9-DPP-NEXT:    s_or_b64 s[48:49], vcc, s[48:49]
-; GFX9-DPP-NEXT:    s_andn2_b64 exec, exec, s[48:49]
+; GFX9-DPP-NEXT:    s_or_b64 s[64:65], vcc, s[64:65]
+; GFX9-DPP-NEXT:    s_andn2_b64 exec, exec, s[64:65]
 ; GFX9-DPP-NEXT:    s_cbranch_execnz .LBB10_2
 ; GFX9-DPP-NEXT:  .LBB10_3:
 ; GFX9-DPP-NEXT:    s_endpgm
 ;
 ; GFX1064-DPP-LABEL: global_atomic_fsub_double_uni_address_div_value_agent_scope_align4_unsafe:
 ; GFX1064-DPP:       ; %bb.0:
-; GFX1064-DPP-NEXT:    s_mov_b32 s48, SCRATCH_RSRC_DWORD0
-; GFX1064-DPP-NEXT:    s_mov_b32 s49, SCRATCH_RSRC_DWORD1
-; GFX1064-DPP-NEXT:    s_mov_b32 s50, -1
-; GFX1064-DPP-NEXT:    s_mov_b32 s51, 0x31e16000
-; GFX1064-DPP-NEXT:    s_add_u32 s48, s48, s11
+; GFX1064-DPP-NEXT:    s_mov_b32 s64, SCRATCH_RSRC_DWORD0
+; GFX1064-DPP-NEXT:    s_mov_b32 s65, SCRATCH_RSRC_DWORD1
+; GFX1064-DPP-NEXT:    s_mov_b32 s66, -1
+; GFX1064-DPP-NEXT:    s_mov_b32 s67, 0x31e16000
+; GFX1064-DPP-NEXT:    s_add_u32 s64, s64, s11
 ; GFX1064-DPP-NEXT:    s_mov_b64 s[34:35], s[4:5]
-; GFX1064-DPP-NEXT:    s_addc_u32 s49, s49, 0
-; GFX1064-DPP-NEXT:    s_mov_b32 s43, s8
+; GFX1064-DPP-NEXT:    s_addc_u32 s65, s65, 0
+; GFX1064-DPP-NEXT:    s_mov_b32 s51, s8
 ; GFX1064-DPP-NEXT:    s_add_u32 s8, s34, 44
-; GFX1064-DPP-NEXT:    s_mov_b32 s42, s9
+; GFX1064-DPP-NEXT:    s_mov_b32 s50, s9
 ; GFX1064-DPP-NEXT:    s_addc_u32 s9, s35, 0
-; GFX1064-DPP-NEXT:    s_mov_b64 s[40:41], s[0:1]
+; GFX1064-DPP-NEXT:    s_mov_b64 s[48:49], s[0:1]
 ; GFX1064-DPP-NEXT:    s_getpc_b64 s[0:1]
 ; GFX1064-DPP-NEXT:    s_add_u32 s0, s0, div.float.value at gotpcrel32@lo+4
 ; GFX1064-DPP-NEXT:    s_addc_u32 s1, s1, div.float.value at gotpcrel32@hi+12
@@ -7857,14 +7857,14 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
 ; GFX1064-DPP-NEXT:    s_mov_b64 s[38:39], s[2:3]
 ; GFX1064-DPP-NEXT:    s_mov_b64 s[6:7], s[2:3]
 ; GFX1064-DPP-NEXT:    v_or3_b32 v40, v0, v1, v2
-; GFX1064-DPP-NEXT:    s_mov_b64 s[0:1], s[48:49]
-; GFX1064-DPP-NEXT:    s_mov_b64 s[4:5], s[40:41]
+; GFX1064-DPP-NEXT:    s_mov_b64 s[0:1], s[64:65]
+; GFX1064-DPP-NEXT:    s_mov_b64 s[4:5], s[48:49]
 ; GFX1064-DPP-NEXT:    s_mov_b64 s[10:11], s[36:37]
-; GFX1064-DPP-NEXT:    s_mov_b32 s12, s43
+; GFX1064-DPP-NEXT:    s_mov_b32 s12, s51
 ; GFX1064-DPP-NEXT:    v_mov_b32_e32 v31, v40
-; GFX1064-DPP-NEXT:    s_mov_b32 s13, s42
+; GFX1064-DPP-NEXT:    s_mov_b32 s13, s50
 ; GFX1064-DPP-NEXT:    s_mov_b32 s14, s33
-; GFX1064-DPP-NEXT:    s_mov_b64 s[2:3], s[50:51]
+; GFX1064-DPP-NEXT:    s_mov_b64 s[2:3], s[66:67]
 ; GFX1064-DPP-NEXT:    s_movk_i32 s32, 0x800
 ; GFX1064-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1064-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
@@ -7909,10 +7909,10 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
 ; GFX1064-DPP-NEXT:    s_and_saveexec_b64 s[0:1], vcc
 ; GFX1064-DPP-NEXT:    s_cbranch_execz .LBB10_3
 ; GFX1064-DPP-NEXT:  ; %bb.1:
-; GFX1064-DPP-NEXT:    s_load_dwordx2 s[44:45], s[34:35], 0x24
-; GFX1064-DPP-NEXT:    s_mov_b64 s[46:47], 0
+; GFX1064-DPP-NEXT:    s_load_dwordx2 s[52:53], s[34:35], 0x24
+; GFX1064-DPP-NEXT:    s_mov_b64 s[54:55], 0
 ; GFX1064-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1064-DPP-NEXT:    global_load_dwordx2 v[1:2], v0, s[44:45]
+; GFX1064-DPP-NEXT:    global_load_dwordx2 v[1:2], v0, s[52:53]
 ; GFX1064-DPP-NEXT:  .LBB10_2: ; %atomicrmw.start
 ; GFX1064-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1064-DPP-NEXT:    s_waitcnt vmcnt(0)
@@ -7922,55 +7922,55 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
 ; GFX1064-DPP-NEXT:    s_getpc_b64 s[0:1]
 ; GFX1064-DPP-NEXT:    s_add_u32 s0, s0, __atomic_compare_exchange at gotpcrel32@lo+4
 ; GFX1064-DPP-NEXT:    s_addc_u32 s1, s1, __atomic_compare_exchange at gotpcrel32@hi+12
-; GFX1064-DPP-NEXT:    buffer_store_dword v2, off, s[48:51], 0 offset:4
-; GFX1064-DPP-NEXT:    buffer_store_dword v1, off, s[48:51], 0
+; GFX1064-DPP-NEXT:    buffer_store_dword v2, off, s[64:67], 0 offset:4
+; GFX1064-DPP-NEXT:    buffer_store_dword v1, off, s[64:67], 0
 ; GFX1064-DPP-NEXT:    s_load_dwordx2 s[16:17], s[0:1], 0x0
 ; GFX1064-DPP-NEXT:    v_mov_b32_e32 v31, v40
 ; GFX1064-DPP-NEXT:    v_mov_b32_e32 v0, 8
 ; GFX1064-DPP-NEXT:    v_mov_b32_e32 v1, 0
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v2, s44
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v2, s52
 ; GFX1064-DPP-NEXT:    v_mov_b32_e32 v5, 8
 ; GFX1064-DPP-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX1064-DPP-NEXT:    v_mov_b32_e32 v7, 0
-; GFX1064-DPP-NEXT:    s_mov_b64 s[0:1], s[48:49]
-; GFX1064-DPP-NEXT:    s_mov_b64 s[4:5], s[40:41]
+; GFX1064-DPP-NEXT:    s_mov_b64 s[0:1], s[64:65]
+; GFX1064-DPP-NEXT:    s_mov_b64 s[4:5], s[48:49]
 ; GFX1064-DPP-NEXT:    s_mov_b64 s[6:7], s[38:39]
 ; GFX1064-DPP-NEXT:    s_mov_b64 s[10:11], s[36:37]
-; GFX1064-DPP-NEXT:    s_mov_b32 s12, s43
-; GFX1064-DPP-NEXT:    s_mov_b32 s13, s42
+; GFX1064-DPP-NEXT:    s_mov_b32 s12, s51
+; GFX1064-DPP-NEXT:    s_mov_b32 s13, s50
 ; GFX1064-DPP-NEXT:    s_mov_b32 s14, s33
-; GFX1064-DPP-NEXT:    s_mov_b64 s[2:3], s[50:51]
-; GFX1064-DPP-NEXT:    buffer_store_dword v4, off, s[48:51], 0 offset:12
-; GFX1064-DPP-NEXT:    buffer_store_dword v3, off, s[48:51], 0 offset:8
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v3, s45
+; GFX1064-DPP-NEXT:    s_mov_b64 s[2:3], s[66:67]
+; GFX1064-DPP-NEXT:    buffer_store_dword v4, off, s[64:67], 0 offset:12
+; GFX1064-DPP-NEXT:    buffer_store_dword v3, off, s[64:67], 0 offset:8
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v3, s53
 ; GFX1064-DPP-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX1064-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1064-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GFX1064-DPP-NEXT:    s_clause 0x1
-; GFX1064-DPP-NEXT:    buffer_load_dword v1, off, s[48:51], 0
-; GFX1064-DPP-NEXT:    buffer_load_dword v2, off, s[48:51], 0 offset:4
+; GFX1064-DPP-NEXT:    buffer_load_dword v1, off, s[64:67], 0
+; GFX1064-DPP-NEXT:    buffer_load_dword v2, off, s[64:67], 0 offset:4
 ; GFX1064-DPP-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX1064-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX1064-DPP-NEXT:    s_or_b64 s[46:47], vcc, s[46:47]
-; GFX1064-DPP-NEXT:    s_andn2_b64 exec, exec, s[46:47]
+; GFX1064-DPP-NEXT:    s_or_b64 s[54:55], vcc, s[54:55]
+; GFX1064-DPP-NEXT:    s_andn2_b64 exec, exec, s[54:55]
 ; GFX1064-DPP-NEXT:    s_cbranch_execnz .LBB10_2
 ; GFX1064-DPP-NEXT:  .LBB10_3:
 ; GFX1064-DPP-NEXT:    s_endpgm
 ;
 ; GFX1032-DPP-LABEL: global_atomic_fsub_double_uni_address_div_value_agent_scope_align4_unsafe:
 ; GFX1032-DPP:       ; %bb.0:
-; GFX1032-DPP-NEXT:    s_mov_b32 s48, SCRATCH_RSRC_DWORD0
-; GFX1032-DPP-NEXT:    s_mov_b32 s49, SCRATCH_RSRC_DWORD1
-; GFX1032-DPP-NEXT:    s_mov_b32 s50, -1
-; GFX1032-DPP-NEXT:    s_mov_b32 s51, 0x31c16000
-; GFX1032-DPP-NEXT:    s_add_u32 s48, s48, s11
+; GFX1032-DPP-NEXT:    s_mov_b32 s64, SCRATCH_RSRC_DWORD0
+; GFX1032-DPP-NEXT:    s_mov_b32 s65, SCRATCH_RSRC_DWORD1
+; GFX1032-DPP-NEXT:    s_mov_b32 s66, -1
+; GFX1032-DPP-NEXT:    s_mov_b32 s67, 0x31c16000
+; GFX1032-DPP-NEXT:    s_add_u32 s64, s64, s11
 ; GFX1032-DPP-NEXT:    s_mov_b64 s[34:35], s[4:5]
-; GFX1032-DPP-NEXT:    s_addc_u32 s49, s49, 0
-; GFX1032-DPP-NEXT:    s_mov_b32 s43, s8
+; GFX1032-DPP-NEXT:    s_addc_u32 s65, s65, 0
+; GFX1032-DPP-NEXT:    s_mov_b32 s51, s8
 ; GFX1032-DPP-NEXT:    s_add_u32 s8, s34, 44
-; GFX1032-DPP-NEXT:    s_mov_b32 s42, s9
+; GFX1032-DPP-NEXT:    s_mov_b32 s50, s9
 ; GFX1032-DPP-NEXT:    s_addc_u32 s9, s35, 0
-; GFX1032-DPP-NEXT:    s_mov_b64 s[40:41], s[0:1]
+; GFX1032-DPP-NEXT:    s_mov_b64 s[48:49], s[0:1]
 ; GFX1032-DPP-NEXT:    s_getpc_b64 s[0:1]
 ; GFX1032-DPP-NEXT:    s_add_u32 s0, s0, div.float.value at gotpcrel32@lo+4
 ; GFX1032-DPP-NEXT:    s_addc_u32 s1, s1, div.float.value at gotpcrel32@hi+12
@@ -7982,14 +7982,14 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
 ; GFX1032-DPP-NEXT:    s_mov_b64 s[38:39], s[2:3]
 ; GFX1032-DPP-NEXT:    s_mov_b64 s[6:7], s[2:3]
 ; GFX1032-DPP-NEXT:    v_or3_b32 v40, v0, v1, v2
-; GFX1032-DPP-NEXT:    s_mov_b64 s[0:1], s[48:49]
-; GFX1032-DPP-NEXT:    s_mov_b64 s[4:5], s[40:41]
+; GFX1032-DPP-NEXT:    s_mov_b64 s[0:1], s[64:65]
+; GFX1032-DPP-NEXT:    s_mov_b64 s[4:5], s[48:49]
 ; GFX1032-DPP-NEXT:    s_mov_b64 s[10:11], s[36:37]
-; GFX1032-DPP-NEXT:    s_mov_b32 s12, s43
+; GFX1032-DPP-NEXT:    s_mov_b32 s12, s51
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v31, v40
-; GFX1032-DPP-NEXT:    s_mov_b32 s13, s42
+; GFX1032-DPP-NEXT:    s_mov_b32 s13, s50
 ; GFX1032-DPP-NEXT:    s_mov_b32 s14, s33
-; GFX1032-DPP-NEXT:    s_mov_b64 s[2:3], s[50:51]
+; GFX1032-DPP-NEXT:    s_mov_b64 s[2:3], s[66:67]
 ; GFX1032-DPP-NEXT:    s_movk_i32 s32, 0x400
 ; GFX1032-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1032-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
@@ -8024,14 +8024,14 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v41, v8
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v42, v9
-; GFX1032-DPP-NEXT:    s_mov_b32 s46, 0
+; GFX1032-DPP-NEXT:    s_mov_b32 s54, 0
 ; GFX1032-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
 ; GFX1032-DPP-NEXT:    s_and_saveexec_b32 s0, vcc_lo
 ; GFX1032-DPP-NEXT:    s_cbranch_execz .LBB10_3
 ; GFX1032-DPP-NEXT:  ; %bb.1:
-; GFX1032-DPP-NEXT:    s_load_dwordx2 s[44:45], s[34:35], 0x24
+; GFX1032-DPP-NEXT:    s_load_dwordx2 s[52:53], s[34:35], 0x24
 ; GFX1032-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT:    global_load_dwordx2 v[1:2], v0, s[44:45]
+; GFX1032-DPP-NEXT:    global_load_dwordx2 v[1:2], v0, s[52:53]
 ; GFX1032-DPP-NEXT:  .LBB10_2: ; %atomicrmw.start
 ; GFX1032-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1032-DPP-NEXT:    s_waitcnt vmcnt(0)
@@ -8041,37 +8041,37 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
 ; GFX1032-DPP-NEXT:    s_getpc_b64 s[0:1]
 ; GFX1032-DPP-NEXT:    s_add_u32 s0, s0, __atomic_compare_exchange at gotpcrel32@lo+4
 ; GFX1032-DPP-NEXT:    s_addc_u32 s1, s1, __atomic_compare_exchange at gotpcrel32@hi+12
-; GFX1032-DPP-NEXT:    buffer_store_dword v2, off, s[48:51], 0 offset:4
-; GFX1032-DPP-NEXT:    buffer_store_dword v1, off, s[48:51], 0
+; GFX1032-DPP-NEXT:    buffer_store_dword v2, off, s[64:67], 0 offset:4
+; GFX1032-DPP-NEXT:    buffer_store_dword v1, off, s[64:67], 0
 ; GFX1032-DPP-NEXT:    s_load_dwordx2 s[16:17], s[0:1], 0x0
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v31, v40
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v0, 8
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v1, 0
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v2, s44
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v2, s52
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v5, 8
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v7, 0
-; GFX1032-DPP-NEXT:    s_mov_b64 s[0:1], s[48:49]
-; GFX1032-DPP-NEXT:    s_mov_b64 s[4:5], s[40:41]
+; GFX1032-DPP-NEXT:    s_mov_b64 s[0:1], s[64:65]
+; GFX1032-DPP-NEXT:    s_mov_b64 s[4:5], s[48:49]
 ; GFX1032-DPP-NEXT:    s_mov_b64 s[6:7], s[38:39]
 ; GFX1032-DPP-NEXT:    s_mov_b64 s[10:11], s[36:37]
-; GFX1032-DPP-NEXT:    s_mov_b32 s12, s43
-; GFX1032-DPP-NEXT:    s_mov_b32 s13, s42
+; GFX1032-DPP-NEXT:    s_mov_b32 s12, s51
+; GFX1032-DPP-NEXT:    s_mov_b32 s13, s50
 ; GFX1032-DPP-NEXT:    s_mov_b32 s14, s33
-; GFX1032-DPP-NEXT:    s_mov_b64 s[2:3], s[50:51]
-; GFX1032-DPP-NEXT:    buffer_store_dword v4, off, s[48:51], 0 offset:12
-; GFX1032-DPP-NEXT:    buffer_store_dword v3, off, s[48:51], 0 offset:8
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v3, s45
+; GFX1032-DPP-NEXT:    s_mov_b64 s[2:3], s[66:67]
+; GFX1032-DPP-NEXT:    buffer_store_dword v4, off, s[64:67], 0 offset:12
+; GFX1032-DPP-NEXT:    buffer_store_dword v3, off, s[64:67], 0 offset:8
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v3, s53
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX1032-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1032-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GFX1032-DPP-NEXT:    s_clause 0x1
-; GFX1032-DPP-NEXT:    buffer_load_dword v1, off, s[48:51], 0
-; GFX1032-DPP-NEXT:    buffer_load_dword v2, off, s[48:51], 0 offset:4
+; GFX1032-DPP-NEXT:    buffer_load_dword v1, off, s[64:67], 0
+; GFX1032-DPP-NEXT:    buffer_load_dword v2, off, s[64:67], 0 offset:4
 ; GFX1032-DPP-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX1032-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX1032-DPP-NEXT:    s_or_b32 s46, vcc_lo, s46
-; GFX1032-DPP-NEXT:    s_andn2_b32 exec_lo, exec_lo, s46
+; GFX1032-DPP-NEXT:    s_or_b32 s54, vcc_lo, s54
+; GFX1032-DPP-NEXT:    s_andn2_b32 exec_lo, exec_lo, s54
 ; GFX1032-DPP-NEXT:    s_cbranch_execnz .LBB10_2
 ; GFX1032-DPP-NEXT:  .LBB10_3:
 ; GFX1032-DPP-NEXT:    s_endpgm
@@ -8079,11 +8079,11 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
 ; GFX1164-DPP-LABEL: global_atomic_fsub_double_uni_address_div_value_agent_scope_align4_unsafe:
 ; GFX1164-DPP:       ; %bb.0:
 ; GFX1164-DPP-NEXT:    s_mov_b64 s[34:35], s[4:5]
-; GFX1164-DPP-NEXT:    s_mov_b32 s43, s8
+; GFX1164-DPP-NEXT:    s_mov_b32 s51, s8
 ; GFX1164-DPP-NEXT:    s_add_u32 s8, s34, 44
-; GFX1164-DPP-NEXT:    s_mov_b32 s42, s9
+; GFX1164-DPP-NEXT:    s_mov_b32 s50, s9
 ; GFX1164-DPP-NEXT:    s_addc_u32 s9, s35, 0
-; GFX1164-DPP-NEXT:    s_mov_b64 s[40:41], s[0:1]
+; GFX1164-DPP-NEXT:    s_mov_b64 s[48:49], s[0:1]
 ; GFX1164-DPP-NEXT:    s_getpc_b64 s[0:1]
 ; GFX1164-DPP-NEXT:    s_add_u32 s0, s0, div.float.value at gotpcrel32@lo+4
 ; GFX1164-DPP-NEXT:    s_addc_u32 s1, s1, div.float.value at gotpcrel32@hi+12
@@ -8091,11 +8091,11 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
 ; GFX1164-DPP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
 ; GFX1164-DPP-NEXT:    s_mov_b32 s33, s10
 ; GFX1164-DPP-NEXT:    s_mov_b64 s[36:37], s[6:7]
-; GFX1164-DPP-NEXT:    s_mov_b64 s[4:5], s[40:41]
+; GFX1164-DPP-NEXT:    s_mov_b64 s[4:5], s[48:49]
 ; GFX1164-DPP-NEXT:    s_mov_b64 s[6:7], s[2:3]
 ; GFX1164-DPP-NEXT:    s_mov_b64 s[10:11], s[36:37]
-; GFX1164-DPP-NEXT:    s_mov_b32 s12, s43
-; GFX1164-DPP-NEXT:    s_mov_b32 s13, s42
+; GFX1164-DPP-NEXT:    s_mov_b32 s12, s51
+; GFX1164-DPP-NEXT:    s_mov_b32 s13, s50
 ; GFX1164-DPP-NEXT:    s_mov_b32 s14, s33
 ; GFX1164-DPP-NEXT:    s_mov_b32 s32, 32
 ; GFX1164-DPP-NEXT:    v_mov_b32_e32 v40, v0
@@ -8153,10 +8153,10 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
 ; GFX1164-DPP-NEXT:    v_cmpx_eq_u32_e32 0, v1
 ; GFX1164-DPP-NEXT:    s_cbranch_execz .LBB10_3
 ; GFX1164-DPP-NEXT:  ; %bb.1:
-; GFX1164-DPP-NEXT:    s_load_b64 s[44:45], s[34:35], 0x24
-; GFX1164-DPP-NEXT:    s_mov_b64 s[46:47], 0
+; GFX1164-DPP-NEXT:    s_load_b64 s[52:53], s[34:35], 0x24
+; GFX1164-DPP-NEXT:    s_mov_b64 s[54:55], 0
 ; GFX1164-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT:    global_load_b64 v[1:2], v0, s[44:45]
+; GFX1164-DPP-NEXT:    global_load_b64 v[1:2], v0, s[52:53]
 ; GFX1164-DPP-NEXT:    s_set_inst_prefetch_distance 0x1
 ; GFX1164-DPP-NEXT:    .p2align 6
 ; GFX1164-DPP-NEXT:  .LBB10_2: ; %atomicrmw.start
@@ -8173,18 +8173,18 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
 ; GFX1164-DPP-NEXT:    v_mov_b32_e32 v31, v40
 ; GFX1164-DPP-NEXT:    v_mov_b32_e32 v0, 8
 ; GFX1164-DPP-NEXT:    v_mov_b32_e32 v1, 0
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v2, s44
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v2, s52
 ; GFX1164-DPP-NEXT:    v_mov_b32_e32 v5, 8
 ; GFX1164-DPP-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX1164-DPP-NEXT:    v_mov_b32_e32 v7, 0
-; GFX1164-DPP-NEXT:    s_mov_b64 s[4:5], s[40:41]
+; GFX1164-DPP-NEXT:    s_mov_b64 s[4:5], s[48:49]
 ; GFX1164-DPP-NEXT:    s_mov_b64 s[6:7], s[38:39]
 ; GFX1164-DPP-NEXT:    s_mov_b64 s[10:11], s[36:37]
-; GFX1164-DPP-NEXT:    s_mov_b32 s12, s43
-; GFX1164-DPP-NEXT:    s_mov_b32 s13, s42
+; GFX1164-DPP-NEXT:    s_mov_b32 s12, s51
+; GFX1164-DPP-NEXT:    s_mov_b32 s13, s50
 ; GFX1164-DPP-NEXT:    s_mov_b32 s14, s33
 ; GFX1164-DPP-NEXT:    scratch_store_b64 off, v[3:4], off offset:8
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v3, s45
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v3, s53
 ; GFX1164-DPP-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX1164-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1164-DPP-NEXT:    s_swappc_b64 s[30:31], s[0:1]
@@ -8192,8 +8192,8 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
 ; GFX1164-DPP-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
 ; GFX1164-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX1164-DPP-NEXT:    s_or_b64 s[46:47], vcc, s[46:47]
-; GFX1164-DPP-NEXT:    s_and_not1_b64 exec, exec, s[46:47]
+; GFX1164-DPP-NEXT:    s_or_b64 s[54:55], vcc, s[54:55]
+; GFX1164-DPP-NEXT:    s_and_not1_b64 exec, exec, s[54:55]
 ; GFX1164-DPP-NEXT:    s_cbranch_execnz .LBB10_2
 ; GFX1164-DPP-NEXT:  .LBB10_3:
 ; GFX1164-DPP-NEXT:    s_set_inst_prefetch_distance 0x2
@@ -8202,7 +8202,7 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
 ; GFX1132-DPP-LABEL: global_atomic_fsub_double_uni_address_div_value_agent_scope_align4_unsafe:
 ; GFX1132-DPP:       ; %bb.0:
 ; GFX1132-DPP-NEXT:    s_mov_b64 s[34:35], s[4:5]
-; GFX1132-DPP-NEXT:    s_mov_b64 s[40:41], s[0:1]
+; GFX1132-DPP-NEXT:    s_mov_b64 s[48:49], s[0:1]
 ; GFX1132-DPP-NEXT:    s_add_u32 s8, s34, 44
 ; GFX1132-DPP-NEXT:    s_addc_u32 s9, s35, 0
 ; GFX1132-DPP-NEXT:    s_getpc_b64 s[0:1]
@@ -8211,9 +8211,9 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
 ; GFX1132-DPP-NEXT:    v_mov_b32_e32 v31, v0
 ; GFX1132-DPP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
 ; GFX1132-DPP-NEXT:    s_mov_b64 s[36:37], s[6:7]
-; GFX1132-DPP-NEXT:    s_mov_b32 s42, s14
-; GFX1132-DPP-NEXT:    s_mov_b32 s43, s13
-; GFX1132-DPP-NEXT:    s_mov_b64 s[4:5], s[40:41]
+; GFX1132-DPP-NEXT:    s_mov_b32 s50, s14
+; GFX1132-DPP-NEXT:    s_mov_b32 s51, s13
+; GFX1132-DPP-NEXT:    s_mov_b64 s[4:5], s[48:49]
 ; GFX1132-DPP-NEXT:    s_mov_b64 s[6:7], s[2:3]
 ; GFX1132-DPP-NEXT:    s_mov_b64 s[10:11], s[36:37]
 ; GFX1132-DPP-NEXT:    s_mov_b32 s12, s13
@@ -8264,14 +8264,14 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
 ; GFX1132-DPP-NEXT:    v_mbcnt_lo_u32_b32 v1, exec_lo, 0
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2)
 ; GFX1132-DPP-NEXT:    v_mov_b32_e32 v42, v9
-; GFX1132-DPP-NEXT:    s_mov_b32 s46, 0
+; GFX1132-DPP-NEXT:    s_mov_b32 s54, 0
 ; GFX1132-DPP-NEXT:    s_mov_b32 s0, exec_lo
 ; GFX1132-DPP-NEXT:    v_cmpx_eq_u32_e32 0, v1
 ; GFX1132-DPP-NEXT:    s_cbranch_execz .LBB10_3
 ; GFX1132-DPP-NEXT:  ; %bb.1:
-; GFX1132-DPP-NEXT:    s_load_b64 s[44:45], s[34:35], 0x24
+; GFX1132-DPP-NEXT:    s_load_b64 s[52:53], s[34:35], 0x24
 ; GFX1132-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT:    global_load_b64 v[1:2], v0, s[44:45]
+; GFX1132-DPP-NEXT:    global_load_b64 v[1:2], v0, s[52:53]
 ; GFX1132-DPP-NEXT:    s_set_inst_prefetch_distance 0x1
 ; GFX1132-DPP-NEXT:    .p2align 6
 ; GFX1132-DPP-NEXT:  .LBB10_2: ; %atomicrmw.start
@@ -8286,25 +8286,25 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
 ; GFX1132-DPP-NEXT:    scratch_store_b64 off, v[1:2], off
 ; GFX1132-DPP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
 ; GFX1132-DPP-NEXT:    v_dual_mov_b32 v31, v40 :: v_dual_mov_b32 v0, 8
-; GFX1132-DPP-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s44
+; GFX1132-DPP-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s52
 ; GFX1132-DPP-NEXT:    v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0
 ; GFX1132-DPP-NEXT:    v_mov_b32_e32 v7, 0
-; GFX1132-DPP-NEXT:    s_mov_b64 s[4:5], s[40:41]
+; GFX1132-DPP-NEXT:    s_mov_b64 s[4:5], s[48:49]
 ; GFX1132-DPP-NEXT:    s_mov_b64 s[6:7], s[38:39]
 ; GFX1132-DPP-NEXT:    s_mov_b64 s[10:11], s[36:37]
-; GFX1132-DPP-NEXT:    s_mov_b32 s12, s43
-; GFX1132-DPP-NEXT:    s_mov_b32 s13, s42
+; GFX1132-DPP-NEXT:    s_mov_b32 s12, s51
+; GFX1132-DPP-NEXT:    s_mov_b32 s13, s50
 ; GFX1132-DPP-NEXT:    s_mov_b32 s14, s33
 ; GFX1132-DPP-NEXT:    scratch_store_b64 off, v[3:4], off offset:8
-; GFX1132-DPP-NEXT:    v_dual_mov_b32 v3, s45 :: v_dual_mov_b32 v4, 0
+; GFX1132-DPP-NEXT:    v_dual_mov_b32 v3, s53 :: v_dual_mov_b32 v4, 0
 ; GFX1132-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132-DPP-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX1132-DPP-NEXT:    scratch_load_b64 v[1:2], off, off
 ; GFX1132-DPP-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
 ; GFX1132-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX1132-DPP-NEXT:    s_or_b32 s46, vcc_lo, s46
-; GFX1132-DPP-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s46
+; GFX1132-DPP-NEXT:    s_or_b32 s54, vcc_lo, s54
+; GFX1132-DPP-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s54
 ; GFX1132-DPP-NEXT:    s_cbranch_execnz .LBB10_2
 ; GFX1132-DPP-NEXT:  .LBB10_3:
 ; GFX1132-DPP-NEXT:    s_set_inst_prefetch_distance 0x2
@@ -9249,12 +9249,12 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_one_a
 ; GFX7LESS-DPP-LABEL: global_atomic_fsub_double_uni_address_div_value_one_as_scope_unsafe_structfp:
 ; GFX7LESS-DPP:       ; %bb.0:
 ; GFX7LESS-DPP-NEXT:    s_mov_b32 s32, 0
-; GFX7LESS-DPP-NEXT:    s_mov_b32 s40, SCRATCH_RSRC_DWORD0
-; GFX7LESS-DPP-NEXT:    s_mov_b32 s41, SCRATCH_RSRC_DWORD1
-; GFX7LESS-DPP-NEXT:    s_mov_b32 s42, -1
-; GFX7LESS-DPP-NEXT:    s_mov_b32 s43, 0xe8f000
-; GFX7LESS-DPP-NEXT:    s_add_u32 s40, s40, s11
-; GFX7LESS-DPP-NEXT:    s_addc_u32 s41, s41, 0
+; GFX7LESS-DPP-NEXT:    s_mov_b32 s48, SCRATCH_RSRC_DWORD0
+; GFX7LESS-DPP-NEXT:    s_mov_b32 s49, SCRATCH_RSRC_DWORD1
+; GFX7LESS-DPP-NEXT:    s_mov_b32 s50, -1
+; GFX7LESS-DPP-NEXT:    s_mov_b32 s51, 0xe8f000
+; GFX7LESS-DPP-NEXT:    s_add_u32 s48, s48, s11
+; GFX7LESS-DPP-NEXT:    s_addc_u32 s49, s49, 0
 ; GFX7LESS-DPP-NEXT:    s_mov_b32 s14, s10
 ; GFX7LESS-DPP-NEXT:    s_mov_b32 s13, s9
 ; GFX7LESS-DPP-NEXT:    s_mov_b32 s12, s8
@@ -9274,8 +9274,8 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_one_a
 ; GFX7LESS-DPP-NEXT:    v_or_b32_e32 v31, v0, v2
 ; GFX7LESS-DPP-NEXT:    s_mov_b64 s[4:5], s[0:1]
 ; GFX7LESS-DPP-NEXT:    s_mov_b64 s[6:7], s[2:3]
-; GFX7LESS-DPP-NEXT:    s_mov_b64 s[0:1], s[40:41]
-; GFX7LESS-DPP-NEXT:    s_mov_b64 s[2:3], s[42:43]
+; GFX7LESS-DPP-NEXT:    s_mov_b64 s[0:1], s[48:49]
+; GFX7LESS-DPP-NEXT:    s_mov_b64 s[2:3], s[50:51]
 ; GFX7LESS-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7LESS-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GFX7LESS-DPP-NEXT:    buffer_load_dwordx2 v[4:5], off, s[36:39], 0
@@ -10682,12 +10682,12 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
 ; GFX7LESS-DPP-LABEL: global_atomic_fsub_double_uni_address_div_value_agent_scope_unsafe:
 ; GFX7LESS-DPP:       ; %bb.0:
 ; GFX7LESS-DPP-NEXT:    s_mov_b32 s32, 0
-; GFX7LESS-DPP-NEXT:    s_mov_b32 s40, SCRATCH_RSRC_DWORD0
-; GFX7LESS-DPP-NEXT:    s_mov_b32 s41, SCRATCH_RSRC_DWORD1
-; GFX7LESS-DPP-NEXT:    s_mov_b32 s42, -1
-; GFX7LESS-DPP-NEXT:    s_mov_b32 s43, 0xe8f000
-; GFX7LESS-DPP-NEXT:    s_add_u32 s40, s40, s11
-; GFX7LESS-DPP-NEXT:    s_addc_u32 s41, s41, 0
+; GFX7LESS-DPP-NEXT:    s_mov_b32 s48, SCRATCH_RSRC_DWORD0
+; GFX7LESS-DPP-NEXT:    s_mov_b32 s49, SCRATCH_RSRC_DWORD1
+; GFX7LESS-DPP-NEXT:    s_mov_b32 s50, -1
+; GFX7LESS-DPP-NEXT:    s_mov_b32 s51, 0xe8f000
+; GFX7LESS-DPP-NEXT:    s_add_u32 s48, s48, s11
+; GFX7LESS-DPP-NEXT:    s_addc_u32 s49, s49, 0
 ; GFX7LESS-DPP-NEXT:    s_mov_b32 s14, s10
 ; GFX7LESS-DPP-NEXT:    s_mov_b32 s13, s9
 ; GFX7LESS-DPP-NEXT:    s_mov_b32 s12, s8
@@ -10707,8 +10707,8 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
 ; GFX7LESS-DPP-NEXT:    v_or_b32_e32 v31, v0, v2
 ; GFX7LESS-DPP-NEXT:    s_mov_b64 s[4:5], s[0:1]
 ; GFX7LESS-DPP-NEXT:    s_mov_b64 s[6:7], s[2:3]
-; GFX7LESS-DPP-NEXT:    s_mov_b64 s[0:1], s[40:41]
-; GFX7LESS-DPP-NEXT:    s_mov_b64 s[2:3], s[42:43]
+; GFX7LESS-DPP-NEXT:    s_mov_b64 s[0:1], s[48:49]
+; GFX7LESS-DPP-NEXT:    s_mov_b64 s[2:3], s[50:51]
 ; GFX7LESS-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7LESS-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GFX7LESS-DPP-NEXT:    buffer_load_dwordx2 v[4:5], off, s[36:39], 0
@@ -11597,12 +11597,12 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
 ; GFX7LESS-DPP-LABEL: global_atomic_fsub_double_uni_address_div_value_agent_scope_unsafe_structfp:
 ; GFX7LESS-DPP:       ; %bb.0:
 ; GFX7LESS-DPP-NEXT:    s_mov_b32 s32, 0
-; GFX7LESS-DPP-NEXT:    s_mov_b32 s40, SCRATCH_RSRC_DWORD0
-; GFX7LESS-DPP-NEXT:    s_mov_b32 s41, SCRATCH_RSRC_DWORD1
-; GFX7LESS-DPP-NEXT:    s_mov_b32 s42, -1
-; GFX7LESS-DPP-NEXT:    s_mov_b32 s43, 0xe8f000
-; GFX7LESS-DPP-NEXT:    s_add_u32 s40, s40, s11
-; GFX7LESS-DPP-NEXT:    s_addc_u32 s41, s41, 0
+; GFX7LESS-DPP-NEXT:    s_mov_b32 s48, SCRATCH_RSRC_DWORD0
+; GFX7LESS-DPP-NEXT:    s_mov_b32 s49, SCRATCH_RSRC_DWORD1
+; GFX7LESS-DPP-NEXT:    s_mov_b32 s50, -1
+; GFX7LESS-DPP-NEXT:    s_mov_b32 s51, 0xe8f000
+; GFX7LESS-DPP-NEXT:    s_add_u32 s48, s48, s11
+; GFX7LESS-DPP-NEXT:    s_addc_u32 s49, s49, 0
 ; GFX7LESS-DPP-NEXT:    s_mov_b32 s14, s10
 ; GFX7LESS-DPP-NEXT:    s_mov_b32 s13, s9
 ; GFX7LESS-DPP-NEXT:    s_mov_b32 s12, s8
@@ -11622,8 +11622,8 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
 ; GFX7LESS-DPP-NEXT:    v_or_b32_e32 v31, v0, v2
 ; GFX7LESS-DPP-NEXT:    s_mov_b64 s[4:5], s[0:1]
 ; GFX7LESS-DPP-NEXT:    s_mov_b64 s[6:7], s[2:3]
-; GFX7LESS-DPP-NEXT:    s_mov_b64 s[0:1], s[40:41]
-; GFX7LESS-DPP-NEXT:    s_mov_b64 s[2:3], s[42:43]
+; GFX7LESS-DPP-NEXT:    s_mov_b64 s[0:1], s[48:49]
+; GFX7LESS-DPP-NEXT:    s_mov_b64 s[2:3], s[50:51]
 ; GFX7LESS-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7LESS-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GFX7LESS-DPP-NEXT:    buffer_load_dwordx2 v[4:5], off, s[36:39], 0
@@ -12097,13 +12097,13 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_defau
 ; GFX7LESS-LABEL: global_atomic_fsub_double_uni_address_uni_value_default_scope_strictfp:
 ; GFX7LESS:       ; %bb.0:
 ; GFX7LESS-NEXT:    s_movk_i32 s32, 0x800
-; GFX7LESS-NEXT:    s_mov_b32 s48, SCRATCH_RSRC_DWORD0
-; GFX7LESS-NEXT:    s_mov_b32 s49, SCRATCH_RSRC_DWORD1
-; GFX7LESS-NEXT:    s_mov_b32 s50, -1
-; GFX7LESS-NEXT:    s_mov_b32 s51, 0xe8f000
-; GFX7LESS-NEXT:    s_add_u32 s48, s48, s11
-; GFX7LESS-NEXT:    s_addc_u32 s49, s49, 0
-; GFX7LESS-NEXT:    s_mov_b64 s[40:41], s[0:1]
+; GFX7LESS-NEXT:    s_mov_b32 s64, SCRATCH_RSRC_DWORD0
+; GFX7LESS-NEXT:    s_mov_b32 s65, SCRATCH_RSRC_DWORD1
+; GFX7LESS-NEXT:    s_mov_b32 s66, -1
+; GFX7LESS-NEXT:    s_mov_b32 s67, 0xe8f000
+; GFX7LESS-NEXT:    s_add_u32 s64, s64, s11
+; GFX7LESS-NEXT:    s_addc_u32 s65, s65, 0
+; GFX7LESS-NEXT:    s_mov_b64 s[48:49], s[0:1]
 ; GFX7LESS-NEXT:    v_mbcnt_lo_u32_b32_e64 v5, exec_lo, 0
 ; GFX7LESS-NEXT:    s_bcnt1_i32_b64 s0, exec
 ; GFX7LESS-NEXT:    s_mov_b32 s1, 0x43300000
@@ -12117,15 +12117,15 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_defau
 ; GFX7LESS-NEXT:    s_cbranch_execz .LBB16_3
 ; GFX7LESS-NEXT:  ; %bb.1:
 ; GFX7LESS-NEXT:    s_mov_b32 s33, s10
-; GFX7LESS-NEXT:    s_mov_b32 s42, s9
-; GFX7LESS-NEXT:    s_mov_b32 s43, s8
+; GFX7LESS-NEXT:    s_mov_b32 s50, s9
+; GFX7LESS-NEXT:    s_mov_b32 s51, s8
 ; GFX7LESS-NEXT:    s_mov_b64 s[34:35], s[6:7]
 ; GFX7LESS-NEXT:    s_mov_b64 s[36:37], s[4:5]
 ; GFX7LESS-NEXT:    s_mov_b64 s[38:39], s[2:3]
-; GFX7LESS-NEXT:    s_load_dwordx2 s[44:45], s[4:5], 0x9
+; GFX7LESS-NEXT:    s_load_dwordx2 s[52:53], s[4:5], 0x9
 ; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7LESS-NEXT:    s_load_dwordx2 s[0:1], s[44:45], 0x0
-; GFX7LESS-NEXT:    s_mov_b64 s[46:47], 0
+; GFX7LESS-NEXT:    s_load_dwordx2 s[0:1], s[52:53], 0x0
+; GFX7LESS-NEXT:    s_mov_b64 s[54:55], 0
 ; GFX7LESS-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
 ; GFX7LESS-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
 ; GFX7LESS-NEXT:    v_or_b32_e32 v3, v0, v1
@@ -12137,11 +12137,11 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_defau
 ; GFX7LESS-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX7LESS-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7LESS-NEXT:    v_add_f64 v[2:3], v[0:1], -v[41:42]
-; GFX7LESS-NEXT:    buffer_store_dword v1, off, s[48:51], 0 offset:4
-; GFX7LESS-NEXT:    buffer_store_dword v0, off, s[48:51], 0
+; GFX7LESS-NEXT:    buffer_store_dword v1, off, s[64:67], 0 offset:4
+; GFX7LESS-NEXT:    buffer_store_dword v0, off, s[64:67], 0
 ; GFX7LESS-NEXT:    s_add_u32 s8, s36, 44
-; GFX7LESS-NEXT:    buffer_store_dword v3, off, s[48:51], 0 offset:12
-; GFX7LESS-NEXT:    buffer_store_dword v2, off, s[48:51], 0 offset:8
+; GFX7LESS-NEXT:    buffer_store_dword v3, off, s[64:67], 0 offset:12
+; GFX7LESS-NEXT:    buffer_store_dword v2, off, s[64:67], 0 offset:8
 ; GFX7LESS-NEXT:    s_addc_u32 s9, s37, 0
 ; GFX7LESS-NEXT:    s_getpc_b64 s[0:1]
 ; GFX7LESS-NEXT:    s_add_u32 s0, s0, __atomic_compare_exchange at gotpcrel32@lo+4
@@ -12154,40 +12154,40 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_defau
 ; GFX7LESS-NEXT:    v_mov_b32_e32 v5, 8
 ; GFX7LESS-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX7LESS-NEXT:    v_mov_b32_e32 v7, 0
-; GFX7LESS-NEXT:    s_mov_b64 s[4:5], s[40:41]
+; GFX7LESS-NEXT:    s_mov_b64 s[4:5], s[48:49]
 ; GFX7LESS-NEXT:    s_mov_b64 s[6:7], s[38:39]
 ; GFX7LESS-NEXT:    s_mov_b64 s[10:11], s[34:35]
-; GFX7LESS-NEXT:    s_mov_b32 s12, s43
-; GFX7LESS-NEXT:    s_mov_b32 s13, s42
+; GFX7LESS-NEXT:    s_mov_b32 s12, s51
+; GFX7LESS-NEXT:    s_mov_b32 s13, s50
 ; GFX7LESS-NEXT:    s_mov_b32 s14, s33
 ; GFX7LESS-NEXT:    v_mov_b32_e32 v31, v40
-; GFX7LESS-NEXT:    s_mov_b64 s[0:1], s[48:49]
-; GFX7LESS-NEXT:    s_mov_b64 s[2:3], s[50:51]
+; GFX7LESS-NEXT:    s_mov_b64 s[0:1], s[64:65]
+; GFX7LESS-NEXT:    s_mov_b64 s[2:3], s[66:67]
 ; GFX7LESS-NEXT:    s_waitcnt expcnt(0)
-; GFX7LESS-NEXT:    v_mov_b32_e32 v2, s44
-; GFX7LESS-NEXT:    v_mov_b32_e32 v3, s45
+; GFX7LESS-NEXT:    v_mov_b32_e32 v2, s52
+; GFX7LESS-NEXT:    v_mov_b32_e32 v3, s53
 ; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7LESS-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GFX7LESS-NEXT:    v_and_b32_e32 v2, 1, v0
-; GFX7LESS-NEXT:    buffer_load_dword v0, off, s[48:51], 0
-; GFX7LESS-NEXT:    buffer_load_dword v1, off, s[48:51], 0 offset:4
+; GFX7LESS-NEXT:    buffer_load_dword v0, off, s[64:67], 0
+; GFX7LESS-NEXT:    buffer_load_dword v1, off, s[64:67], 0 offset:4
 ; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v2
-; GFX7LESS-NEXT:    s_or_b64 s[46:47], vcc, s[46:47]
-; GFX7LESS-NEXT:    s_andn2_b64 exec, exec, s[46:47]
+; GFX7LESS-NEXT:    s_or_b64 s[54:55], vcc, s[54:55]
+; GFX7LESS-NEXT:    s_andn2_b64 exec, exec, s[54:55]
 ; GFX7LESS-NEXT:    s_cbranch_execnz .LBB16_2
 ; GFX7LESS-NEXT:  .LBB16_3:
 ; GFX7LESS-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: global_atomic_fsub_double_uni_address_uni_value_default_scope_strictfp:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_mov_b32 s48, SCRATCH_RSRC_DWORD0
-; GFX9-NEXT:    s_mov_b32 s49, SCRATCH_RSRC_DWORD1
-; GFX9-NEXT:    s_mov_b32 s50, -1
-; GFX9-NEXT:    s_mov_b32 s51, 0xe00000
-; GFX9-NEXT:    s_add_u32 s48, s48, s11
+; GFX9-NEXT:    s_mov_b32 s64, SCRATCH_RSRC_DWORD0
+; GFX9-NEXT:    s_mov_b32 s65, SCRATCH_RSRC_DWORD1
+; GFX9-NEXT:    s_mov_b32 s66, -1
+; GFX9-NEXT:    s_mov_b32 s67, 0xe00000
+; GFX9-NEXT:    s_add_u32 s64, s64, s11
 ; GFX9-NEXT:    v_mov_b32_e32 v3, 0
-; GFX9-NEXT:    s_addc_u32 s49, s49, 0
-; GFX9-NEXT:    s_mov_b64 s[40:41], s[0:1]
+; GFX9-NEXT:    s_addc_u32 s65, s65, 0
+; GFX9-NEXT:    s_mov_b64 s[48:49], s[0:1]
 ; GFX9-NEXT:    s_bcnt1_i32_b64 s0, exec
 ; GFX9-NEXT:    v_mov_b32_e32 v4, 0xc3300000
 ; GFX9-NEXT:    s_mov_b32 s1, 0x43300000
@@ -12200,20 +12200,20 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_defau
 ; GFX9-NEXT:    s_and_saveexec_b64 s[0:1], vcc
 ; GFX9-NEXT:    s_cbranch_execz .LBB16_3
 ; GFX9-NEXT:  ; %bb.1:
-; GFX9-NEXT:    s_load_dwordx2 s[44:45], s[4:5], 0x24
+; GFX9-NEXT:    s_load_dwordx2 s[52:53], s[4:5], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v3, 20, v2
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 10, v1
 ; GFX9-NEXT:    s_mov_b32 s33, s10
-; GFX9-NEXT:    s_mov_b32 s42, s9
+; GFX9-NEXT:    s_mov_b32 s50, s9
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[44:45], 0x0
-; GFX9-NEXT:    s_mov_b32 s43, s8
+; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[52:53], 0x0
+; GFX9-NEXT:    s_mov_b32 s51, s8
 ; GFX9-NEXT:    s_mov_b64 s[34:35], s[6:7]
 ; GFX9-NEXT:    s_mov_b64 s[36:37], s[4:5]
 ; GFX9-NEXT:    s_mov_b64 s[38:39], s[2:3]
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v2, s1
-; GFX9-NEXT:    s_mov_b64 s[46:47], 0
+; GFX9-NEXT:    s_mov_b64 s[54:55], 0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX9-NEXT:    v_or3_b32 v40, v0, v4, v3
 ; GFX9-NEXT:  .LBB16_2: ; %atomicrmw.start
@@ -12226,48 +12226,48 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_defau
 ; GFX9-NEXT:    s_add_u32 s0, s0, __atomic_compare_exchange at gotpcrel32@lo+4
 ; GFX9-NEXT:    s_addc_u32 s1, s1, __atomic_compare_exchange at gotpcrel32@hi+12
 ; GFX9-NEXT:    s_load_dwordx2 s[16:17], s[0:1], 0x0
-; GFX9-NEXT:    s_mov_b64 s[0:1], s[48:49]
-; GFX9-NEXT:    buffer_store_dword v2, off, s[48:51], 0 offset:4
-; GFX9-NEXT:    buffer_store_dword v1, off, s[48:51], 0
-; GFX9-NEXT:    s_mov_b64 s[4:5], s[40:41]
-; GFX9-NEXT:    buffer_store_dword v4, off, s[48:51], 0 offset:12
-; GFX9-NEXT:    buffer_store_dword v3, off, s[48:51], 0 offset:8
+; GFX9-NEXT:    s_mov_b64 s[0:1], s[64:65]
+; GFX9-NEXT:    buffer_store_dword v2, off, s[64:67], 0 offset:4
+; GFX9-NEXT:    buffer_store_dword v1, off, s[64:67], 0
+; GFX9-NEXT:    s_mov_b64 s[4:5], s[48:49]
+; GFX9-NEXT:    buffer_store_dword v4, off, s[64:67], 0 offset:12
+; GFX9-NEXT:    buffer_store_dword v3, off, s[64:67], 0 offset:8
 ; GFX9-NEXT:    s_mov_b64 s[6:7], s[38:39]
 ; GFX9-NEXT:    s_mov_b64 s[10:11], s[34:35]
-; GFX9-NEXT:    s_mov_b32 s12, s43
-; GFX9-NEXT:    s_mov_b32 s13, s42
+; GFX9-NEXT:    s_mov_b32 s12, s51
+; GFX9-NEXT:    s_mov_b32 s13, s50
 ; GFX9-NEXT:    s_mov_b32 s14, s33
 ; GFX9-NEXT:    v_mov_b32_e32 v31, v40
-; GFX9-NEXT:    s_mov_b64 s[2:3], s[50:51]
+; GFX9-NEXT:    s_mov_b64 s[2:3], s[66:67]
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 8
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0
-; GFX9-NEXT:    v_mov_b32_e32 v2, s44
-; GFX9-NEXT:    v_mov_b32_e32 v3, s45
+; GFX9-NEXT:    v_mov_b32_e32 v2, s52
+; GFX9-NEXT:    v_mov_b32_e32 v3, s53
 ; GFX9-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v5, 8
 ; GFX9-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v7, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[16:17]
-; GFX9-NEXT:    buffer_load_dword v1, off, s[48:51], 0
-; GFX9-NEXT:    buffer_load_dword v2, off, s[48:51], 0 offset:4
+; GFX9-NEXT:    buffer_load_dword v1, off, s[64:67], 0
+; GFX9-NEXT:    buffer_load_dword v2, off, s[64:67], 0 offset:4
 ; GFX9-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX9-NEXT:    s_or_b64 s[46:47], vcc, s[46:47]
-; GFX9-NEXT:    s_andn2_b64 exec, exec, s[46:47]
+; GFX9-NEXT:    s_or_b64 s[54:55], vcc, s[54:55]
+; GFX9-NEXT:    s_andn2_b64 exec, exec, s[54:55]
 ; GFX9-NEXT:    s_cbranch_execnz .LBB16_2
 ; GFX9-NEXT:  .LBB16_3:
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX1064-LABEL: global_atomic_fsub_double_uni_address_uni_value_default_scope_strictfp:
 ; GFX1064:       ; %bb.0:
-; GFX1064-NEXT:    s_mov_b32 s48, SCRATCH_RSRC_DWORD0
-; GFX1064-NEXT:    s_mov_b32 s49, SCRATCH_RSRC_DWORD1
-; GFX1064-NEXT:    s_mov_b32 s50, -1
-; GFX1064-NEXT:    s_mov_b32 s51, 0x31e16000
-; GFX1064-NEXT:    s_add_u32 s48, s48, s11
-; GFX1064-NEXT:    s_addc_u32 s49, s49, 0
-; GFX1064-NEXT:    s_mov_b64 s[40:41], s[0:1]
+; GFX1064-NEXT:    s_mov_b32 s64, SCRATCH_RSRC_DWORD0
+; GFX1064-NEXT:    s_mov_b32 s65, SCRATCH_RSRC_DWORD1
+; GFX1064-NEXT:    s_mov_b32 s66, -1
+; GFX1064-NEXT:    s_mov_b32 s67, 0x31e16000
+; GFX1064-NEXT:    s_add_u32 s64, s64, s11
+; GFX1064-NEXT:    s_addc_u32 s65, s65, 0
+; GFX1064-NEXT:    s_mov_b64 s[48:49], s[0:1]
 ; GFX1064-NEXT:    s_bcnt1_i32_b64 s0, exec
 ; GFX1064-NEXT:    s_mov_b32 s1, 0x43300000
 ; GFX1064-NEXT:    s_movk_i32 s32, 0x800
@@ -12279,19 +12279,19 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_defau
 ; GFX1064-NEXT:    s_and_saveexec_b64 s[0:1], vcc
 ; GFX1064-NEXT:    s_cbranch_execz .LBB16_3
 ; GFX1064-NEXT:  ; %bb.1:
-; GFX1064-NEXT:    s_load_dwordx2 s[44:45], s[4:5], 0x24
+; GFX1064-NEXT:    s_load_dwordx2 s[52:53], s[4:5], 0x24
 ; GFX1064-NEXT:    v_lshlrev_b32_e32 v3, 20, v2
 ; GFX1064-NEXT:    v_lshlrev_b32_e32 v4, 10, v1
 ; GFX1064-NEXT:    s_mov_b32 s33, s10
-; GFX1064-NEXT:    s_mov_b32 s42, s9
-; GFX1064-NEXT:    s_mov_b32 s43, s8
+; GFX1064-NEXT:    s_mov_b32 s50, s9
+; GFX1064-NEXT:    s_mov_b32 s51, s8
 ; GFX1064-NEXT:    s_mov_b64 s[34:35], s[6:7]
 ; GFX1064-NEXT:    v_or3_b32 v40, v0, v4, v3
 ; GFX1064-NEXT:    s_mov_b64 s[36:37], s[4:5]
 ; GFX1064-NEXT:    s_mov_b64 s[38:39], s[2:3]
-; GFX1064-NEXT:    s_mov_b64 s[46:47], 0
+; GFX1064-NEXT:    s_mov_b64 s[54:55], 0
 ; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[44:45], 0x0
+; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[52:53], 0x0
 ; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1064-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX1064-NEXT:    v_mov_b32_e32 v1, s0
@@ -12304,53 +12304,53 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_defau
 ; GFX1064-NEXT:    s_getpc_b64 s[0:1]
 ; GFX1064-NEXT:    s_add_u32 s0, s0, __atomic_compare_exchange at gotpcrel32@lo+4
 ; GFX1064-NEXT:    s_addc_u32 s1, s1, __atomic_compare_exchange at gotpcrel32@hi+12
-; GFX1064-NEXT:    buffer_store_dword v2, off, s[48:51], 0 offset:4
-; GFX1064-NEXT:    buffer_store_dword v1, off, s[48:51], 0
+; GFX1064-NEXT:    buffer_store_dword v2, off, s[64:67], 0 offset:4
+; GFX1064-NEXT:    buffer_store_dword v1, off, s[64:67], 0
 ; GFX1064-NEXT:    s_load_dwordx2 s[16:17], s[0:1], 0x0
 ; GFX1064-NEXT:    v_mov_b32_e32 v31, v40
 ; GFX1064-NEXT:    v_mov_b32_e32 v0, 8
 ; GFX1064-NEXT:    v_mov_b32_e32 v1, 0
-; GFX1064-NEXT:    v_mov_b32_e32 v2, s44
+; GFX1064-NEXT:    v_mov_b32_e32 v2, s52
 ; GFX1064-NEXT:    v_mov_b32_e32 v5, 8
 ; GFX1064-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX1064-NEXT:    v_mov_b32_e32 v7, 0
-; GFX1064-NEXT:    s_mov_b64 s[0:1], s[48:49]
-; GFX1064-NEXT:    s_mov_b64 s[4:5], s[40:41]
+; GFX1064-NEXT:    s_mov_b64 s[0:1], s[64:65]
+; GFX1064-NEXT:    s_mov_b64 s[4:5], s[48:49]
 ; GFX1064-NEXT:    s_mov_b64 s[6:7], s[38:39]
 ; GFX1064-NEXT:    s_mov_b64 s[10:11], s[34:35]
-; GFX1064-NEXT:    s_mov_b32 s12, s43
-; GFX1064-NEXT:    s_mov_b32 s13, s42
+; GFX1064-NEXT:    s_mov_b32 s12, s51
+; GFX1064-NEXT:    s_mov_b32 s13, s50
 ; GFX1064-NEXT:    s_mov_b32 s14, s33
-; GFX1064-NEXT:    s_mov_b64 s[2:3], s[50:51]
-; GFX1064-NEXT:    buffer_store_dword v4, off, s[48:51], 0 offset:12
-; GFX1064-NEXT:    buffer_store_dword v3, off, s[48:51], 0 offset:8
-; GFX1064-NEXT:    v_mov_b32_e32 v3, s45
+; GFX1064-NEXT:    s_mov_b64 s[2:3], s[66:67]
+; GFX1064-NEXT:    buffer_store_dword v4, off, s[64:67], 0 offset:12
+; GFX1064-NEXT:    buffer_store_dword v3, off, s[64:67], 0 offset:8
+; GFX1064-NEXT:    v_mov_b32_e32 v3, s53
 ; GFX1064-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1064-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GFX1064-NEXT:    s_clause 0x1
-; GFX1064-NEXT:    buffer_load_dword v1, off, s[48:51], 0
-; GFX1064-NEXT:    buffer_load_dword v2, off, s[48:51], 0 offset:4
+; GFX1064-NEXT:    buffer_load_dword v1, off, s[64:67], 0
+; GFX1064-NEXT:    buffer_load_dword v2, off, s[64:67], 0 offset:4
 ; GFX1064-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX1064-NEXT:    s_or_b64 s[46:47], vcc, s[46:47]
-; GFX1064-NEXT:    s_andn2_b64 exec, exec, s[46:47]
+; GFX1064-NEXT:    s_or_b64 s[54:55], vcc, s[54:55]
+; GFX1064-NEXT:    s_andn2_b64 exec, exec, s[54:55]
 ; GFX1064-NEXT:    s_cbranch_execnz .LBB16_2
 ; GFX1064-NEXT:  .LBB16_3:
 ; GFX1064-NEXT:    s_endpgm
 ;
 ; GFX1032-LABEL: global_atomic_fsub_double_uni_address_uni_value_default_scope_strictfp:
 ; GFX1032:       ; %bb.0:
-; GFX1032-NEXT:    s_mov_b32 s48, SCRATCH_RSRC_DWORD0
-; GFX1032-NEXT:    s_mov_b32 s49, SCRATCH_RSRC_DWORD1
-; GFX1032-NEXT:    s_mov_b32 s50, -1
-; GFX1032-NEXT:    s_mov_b32 s51, 0x31c16000
-; GFX1032-NEXT:    s_add_u32 s48, s48, s11
-; GFX1032-NEXT:    s_addc_u32 s49, s49, 0
-; GFX1032-NEXT:    s_mov_b64 s[40:41], s[0:1]
+; GFX1032-NEXT:    s_mov_b32 s64, SCRATCH_RSRC_DWORD0
+; GFX1032-NEXT:    s_mov_b32 s65, SCRATCH_RSRC_DWORD1
+; GFX1032-NEXT:    s_mov_b32 s66, -1
+; GFX1032-NEXT:    s_mov_b32 s67, 0x31c16000
+; GFX1032-NEXT:    s_add_u32 s64, s64, s11
+; GFX1032-NEXT:    s_addc_u32 s65, s65, 0
+; GFX1032-NEXT:    s_mov_b64 s[48:49], s[0:1]
 ; GFX1032-NEXT:    s_bcnt1_i32_b32 s0, exec_lo
 ; GFX1032-NEXT:    s_mov_b32 s1, 0x43300000
-; GFX1032-NEXT:    s_mov_b32 s46, 0
+; GFX1032-NEXT:    s_mov_b32 s54, 0
 ; GFX1032-NEXT:    v_add_f64 v[3:4], 0xc3300000, s[0:1]
 ; GFX1032-NEXT:    s_movk_i32 s32, 0x400
 ; GFX1032-NEXT:    v_mul_f64 v[41:42], 4.0, v[3:4]
@@ -12359,18 +12359,18 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_defau
 ; GFX1032-NEXT:    s_and_saveexec_b32 s0, vcc_lo
 ; GFX1032-NEXT:    s_cbranch_execz .LBB16_3
 ; GFX1032-NEXT:  ; %bb.1:
-; GFX1032-NEXT:    s_load_dwordx2 s[44:45], s[4:5], 0x24
+; GFX1032-NEXT:    s_load_dwordx2 s[52:53], s[4:5], 0x24
 ; GFX1032-NEXT:    v_lshlrev_b32_e32 v3, 20, v2
 ; GFX1032-NEXT:    v_lshlrev_b32_e32 v4, 10, v1
 ; GFX1032-NEXT:    s_mov_b32 s33, s10
-; GFX1032-NEXT:    s_mov_b32 s42, s9
-; GFX1032-NEXT:    s_mov_b32 s43, s8
+; GFX1032-NEXT:    s_mov_b32 s50, s9
+; GFX1032-NEXT:    s_mov_b32 s51, s8
 ; GFX1032-NEXT:    s_mov_b64 s[34:35], s[6:7]
 ; GFX1032-NEXT:    v_or3_b32 v40, v0, v4, v3
 ; GFX1032-NEXT:    s_mov_b64 s[36:37], s[4:5]
 ; GFX1032-NEXT:    s_mov_b64 s[38:39], s[2:3]
 ; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[44:45], 0x0
+; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[52:53], 0x0
 ; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1032-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX1032-NEXT:    v_mov_b32_e32 v1, s0
@@ -12383,44 +12383,44 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_defau
 ; GFX1032-NEXT:    s_getpc_b64 s[0:1]
 ; GFX1032-NEXT:    s_add_u32 s0, s0, __atomic_compare_exchange at gotpcrel32@lo+4
 ; GFX1032-NEXT:    s_addc_u32 s1, s1, __atomic_compare_exchange at gotpcrel32@hi+12
-; GFX1032-NEXT:    buffer_store_dword v2, off, s[48:51], 0 offset:4
-; GFX1032-NEXT:    buffer_store_dword v1, off, s[48:51], 0
+; GFX1032-NEXT:    buffer_store_dword v2, off, s[64:67], 0 offset:4
+; GFX1032-NEXT:    buffer_store_dword v1, off, s[64:67], 0
 ; GFX1032-NEXT:    s_load_dwordx2 s[16:17], s[0:1], 0x0
 ; GFX1032-NEXT:    v_mov_b32_e32 v31, v40
 ; GFX1032-NEXT:    v_mov_b32_e32 v0, 8
 ; GFX1032-NEXT:    v_mov_b32_e32 v1, 0
-; GFX1032-NEXT:    v_mov_b32_e32 v2, s44
+; GFX1032-NEXT:    v_mov_b32_e32 v2, s52
 ; GFX1032-NEXT:    v_mov_b32_e32 v5, 8
 ; GFX1032-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX1032-NEXT:    v_mov_b32_e32 v7, 0
-; GFX1032-NEXT:    s_mov_b64 s[0:1], s[48:49]
-; GFX1032-NEXT:    s_mov_b64 s[4:5], s[40:41]
+; GFX1032-NEXT:    s_mov_b64 s[0:1], s[64:65]
+; GFX1032-NEXT:    s_mov_b64 s[4:5], s[48:49]
 ; GFX1032-NEXT:    s_mov_b64 s[6:7], s[38:39]
 ; GFX1032-NEXT:    s_mov_b64 s[10:11], s[34:35]
-; GFX1032-NEXT:    s_mov_b32 s12, s43
-; GFX1032-NEXT:    s_mov_b32 s13, s42
+; GFX1032-NEXT:    s_mov_b32 s12, s51
+; GFX1032-NEXT:    s_mov_b32 s13, s50
 ; GFX1032-NEXT:    s_mov_b32 s14, s33
-; GFX1032-NEXT:    s_mov_b64 s[2:3], s[50:51]
-; GFX1032-NEXT:    buffer_store_dword v4, off, s[48:51], 0 offset:12
-; GFX1032-NEXT:    buffer_store_dword v3, off, s[48:51], 0 offset:8
-; GFX1032-NEXT:    v_mov_b32_e32 v3, s45
+; GFX1032-NEXT:    s_mov_b64 s[2:3], s[66:67]
+; GFX1032-NEXT:    buffer_store_dword v4, off, s[64:67], 0 offset:12
+; GFX1032-NEXT:    buffer_store_dword v3, off, s[64:67], 0 offset:8
+; GFX1032-NEXT:    v_mov_b32_e32 v3, s53
 ; GFX1032-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1032-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GFX1032-NEXT:    s_clause 0x1
-; GFX1032-NEXT:    buffer_load_dword v1, off, s[48:51], 0
-; GFX1032-NEXT:    buffer_load_dword v2, off, s[48:51], 0 offset:4
+; GFX1032-NEXT:    buffer_load_dword v1, off, s[64:67], 0
+; GFX1032-NEXT:    buffer_load_dword v2, off, s[64:67], 0 offset:4
 ; GFX1032-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX1032-NEXT:    s_or_b32 s46, vcc_lo, s46
-; GFX1032-NEXT:    s_andn2_b32 exec_lo, exec_lo, s46
+; GFX1032-NEXT:    s_or_b32 s54, vcc_lo, s54
+; GFX1032-NEXT:    s_andn2_b32 exec_lo, exec_lo, s54
 ; GFX1032-NEXT:    s_cbranch_execnz .LBB16_2
 ; GFX1032-NEXT:  .LBB16_3:
 ; GFX1032-NEXT:    s_endpgm
 ;
 ; GFX1164-LABEL: global_atomic_fsub_double_uni_address_uni_value_default_scope_strictfp:
 ; GFX1164:       ; %bb.0:
-; GFX1164-NEXT:    s_mov_b64 s[40:41], s[0:1]
+; GFX1164-NEXT:    s_mov_b64 s[48:49], s[0:1]
 ; GFX1164-NEXT:    s_bcnt1_i32_b64 s0, exec
 ; GFX1164-NEXT:    v_mov_b32_e32 v40, v0
 ; GFX1164-NEXT:    v_mov_b32_e32 v0, 0x43300000
@@ -12441,16 +12441,16 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_defau
 ; GFX1164-NEXT:    v_cmpx_eq_u32_e32 0, v0
 ; GFX1164-NEXT:    s_cbranch_execz .LBB16_3
 ; GFX1164-NEXT:  ; %bb.1:
-; GFX1164-NEXT:    s_load_b64 s[44:45], s[4:5], 0x24
+; GFX1164-NEXT:    s_load_b64 s[52:53], s[4:5], 0x24
 ; GFX1164-NEXT:    s_mov_b32 s33, s10
-; GFX1164-NEXT:    s_mov_b32 s42, s9
-; GFX1164-NEXT:    s_mov_b32 s43, s8
+; GFX1164-NEXT:    s_mov_b32 s50, s9
+; GFX1164-NEXT:    s_mov_b32 s51, s8
 ; GFX1164-NEXT:    s_mov_b64 s[34:35], s[6:7]
 ; GFX1164-NEXT:    s_mov_b64 s[36:37], s[4:5]
 ; GFX1164-NEXT:    s_mov_b64 s[38:39], s[2:3]
-; GFX1164-NEXT:    s_mov_b64 s[46:47], 0
+; GFX1164-NEXT:    s_mov_b64 s[54:55], 0
 ; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT:    s_load_b64 s[0:1], s[44:45], 0x0
+; GFX1164-NEXT:    s_load_b64 s[0:1], s[52:53], 0x0
 ; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1164-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX1164-NEXT:    v_mov_b32_e32 v1, s0
@@ -12471,18 +12471,18 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_defau
 ; GFX1164-NEXT:    v_mov_b32_e32 v31, v40
 ; GFX1164-NEXT:    v_mov_b32_e32 v0, 8
 ; GFX1164-NEXT:    v_mov_b32_e32 v1, 0
-; GFX1164-NEXT:    v_mov_b32_e32 v2, s44
+; GFX1164-NEXT:    v_mov_b32_e32 v2, s52
 ; GFX1164-NEXT:    v_mov_b32_e32 v5, 8
 ; GFX1164-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX1164-NEXT:    v_mov_b32_e32 v7, 0
-; GFX1164-NEXT:    s_mov_b64 s[4:5], s[40:41]
+; GFX1164-NEXT:    s_mov_b64 s[4:5], s[48:49]
 ; GFX1164-NEXT:    s_mov_b64 s[6:7], s[38:39]
 ; GFX1164-NEXT:    s_mov_b64 s[10:11], s[34:35]
-; GFX1164-NEXT:    s_mov_b32 s12, s43
-; GFX1164-NEXT:    s_mov_b32 s13, s42
+; GFX1164-NEXT:    s_mov_b32 s12, s51
+; GFX1164-NEXT:    s_mov_b32 s13, s50
 ; GFX1164-NEXT:    s_mov_b32 s14, s33
 ; GFX1164-NEXT:    scratch_store_b64 off, v[3:4], off offset:8
-; GFX1164-NEXT:    v_mov_b32_e32 v3, s45
+; GFX1164-NEXT:    v_mov_b32_e32 v3, s53
 ; GFX1164-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1164-NEXT:    s_swappc_b64 s[30:31], s[0:1]
@@ -12490,8 +12490,8 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_defau
 ; GFX1164-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
 ; GFX1164-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX1164-NEXT:    s_or_b64 s[46:47], vcc, s[46:47]
-; GFX1164-NEXT:    s_and_not1_b64 exec, exec, s[46:47]
+; GFX1164-NEXT:    s_or_b64 s[54:55], vcc, s[54:55]
+; GFX1164-NEXT:    s_and_not1_b64 exec, exec, s[54:55]
 ; GFX1164-NEXT:    s_cbranch_execnz .LBB16_2
 ; GFX1164-NEXT:  .LBB16_3:
 ; GFX1164-NEXT:    s_set_inst_prefetch_distance 0x2
@@ -12499,12 +12499,12 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_defau
 ;
 ; GFX1132-LABEL: global_atomic_fsub_double_uni_address_uni_value_default_scope_strictfp:
 ; GFX1132:       ; %bb.0:
-; GFX1132-NEXT:    s_mov_b64 s[40:41], s[0:1]
+; GFX1132-NEXT:    s_mov_b64 s[48:49], s[0:1]
 ; GFX1132-NEXT:    s_bcnt1_i32_b32 s0, exec_lo
 ; GFX1132-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX1132-NEXT:    v_dual_mov_b32 v40, v0 :: v_dual_mov_b32 v1, s0
 ; GFX1132-NEXT:    v_mov_b32_e32 v0, 0x43300000
-; GFX1132-NEXT:    s_mov_b32 s46, 0
+; GFX1132-NEXT:    s_mov_b32 s54, 0
 ; GFX1132-NEXT:    s_clause 0x1
 ; GFX1132-NEXT:    scratch_store_b32 off, v0, off offset:20
 ; GFX1132-NEXT:    scratch_store_b32 off, v1, off offset:16
@@ -12519,15 +12519,15 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_defau
 ; GFX1132-NEXT:    v_cmpx_eq_u32_e32 0, v0
 ; GFX1132-NEXT:    s_cbranch_execz .LBB16_3
 ; GFX1132-NEXT:  ; %bb.1:
-; GFX1132-NEXT:    s_load_b64 s[44:45], s[4:5], 0x24
+; GFX1132-NEXT:    s_load_b64 s[52:53], s[4:5], 0x24
 ; GFX1132-NEXT:    s_mov_b32 s33, s15
-; GFX1132-NEXT:    s_mov_b32 s42, s14
-; GFX1132-NEXT:    s_mov_b32 s43, s13
+; GFX1132-NEXT:    s_mov_b32 s50, s14
+; GFX1132-NEXT:    s_mov_b32 s51, s13
 ; GFX1132-NEXT:    s_mov_b64 s[34:35], s[6:7]
 ; GFX1132-NEXT:    s_mov_b64 s[36:37], s[4:5]
 ; GFX1132-NEXT:    s_mov_b64 s[38:39], s[2:3]
 ; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT:    s_load_b64 s[0:1], s[44:45], 0x0
+; GFX1132-NEXT:    s_load_b64 s[0:1], s[52:53], 0x0
 ; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132-NEXT:    v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v1, s0
 ; GFX1132-NEXT:    s_set_inst_prefetch_distance 0x1
@@ -12545,25 +12545,25 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_defau
 ; GFX1132-NEXT:    scratch_store_b64 off, v[1:2], off
 ; GFX1132-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
 ; GFX1132-NEXT:    v_dual_mov_b32 v31, v40 :: v_dual_mov_b32 v0, 8
-; GFX1132-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s44
+; GFX1132-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s52
 ; GFX1132-NEXT:    v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0
 ; GFX1132-NEXT:    v_mov_b32_e32 v7, 0
-; GFX1132-NEXT:    s_mov_b64 s[4:5], s[40:41]
+; GFX1132-NEXT:    s_mov_b64 s[4:5], s[48:49]
 ; GFX1132-NEXT:    s_mov_b64 s[6:7], s[38:39]
 ; GFX1132-NEXT:    s_mov_b64 s[10:11], s[34:35]
-; GFX1132-NEXT:    s_mov_b32 s12, s43
-; GFX1132-NEXT:    s_mov_b32 s13, s42
+; GFX1132-NEXT:    s_mov_b32 s12, s51
+; GFX1132-NEXT:    s_mov_b32 s13, s50
 ; GFX1132-NEXT:    s_mov_b32 s14, s33
 ; GFX1132-NEXT:    scratch_store_b64 off, v[3:4], off offset:8
-; GFX1132-NEXT:    v_dual_mov_b32 v3, s45 :: v_dual_mov_b32 v4, 0
+; GFX1132-NEXT:    v_dual_mov_b32 v3, s53 :: v_dual_mov_b32 v4, 0
 ; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX1132-NEXT:    scratch_load_b64 v[1:2], off, off
 ; GFX1132-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
 ; GFX1132-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX1132-NEXT:    s_or_b32 s46, vcc_lo, s46
-; GFX1132-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s46
+; GFX1132-NEXT:    s_or_b32 s54, vcc_lo, s54
+; GFX1132-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s54
 ; GFX1132-NEXT:    s_cbranch_execnz .LBB16_2
 ; GFX1132-NEXT:  .LBB16_3:
 ; GFX1132-NEXT:    s_set_inst_prefetch_distance 0x2
@@ -12572,13 +12572,13 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_defau
 ; GFX7LESS-DPP-LABEL: global_atomic_fsub_double_uni_address_uni_value_default_scope_strictfp:
 ; GFX7LESS-DPP:       ; %bb.0:
 ; GFX7LESS-DPP-NEXT:    s_movk_i32 s32, 0x800
-; GFX7LESS-DPP-NEXT:    s_mov_b32 s48, SCRATCH_RSRC_DWORD0
-; GFX7LESS-DPP-NEXT:    s_mov_b32 s49, SCRATCH_RSRC_DWORD1
-; GFX7LESS-DPP-NEXT:    s_mov_b32 s50, -1
-; GFX7LESS-DPP-NEXT:    s_mov_b32 s51, 0xe8f000
-; GFX7LESS-DPP-NEXT:    s_add_u32 s48, s48, s11
-; GFX7LESS-DPP-NEXT:    s_addc_u32 s49, s49, 0
-; GFX7LESS-DPP-NEXT:    s_mov_b64 s[40:41], s[0:1]
+; GFX7LESS-DPP-NEXT:    s_mov_b32 s64, SCRATCH_RSRC_DWORD0
+; GFX7LESS-DPP-NEXT:    s_mov_b32 s65, SCRATCH_RSRC_DWORD1
+; GFX7LESS-DPP-NEXT:    s_mov_b32 s66, -1
+; GFX7LESS-DPP-NEXT:    s_mov_b32 s67, 0xe8f000
+; GFX7LESS-DPP-NEXT:    s_add_u32 s64, s64, s11
+; GFX7LESS-DPP-NEXT:    s_addc_u32 s65, s65, 0
+; GFX7LESS-DPP-NEXT:    s_mov_b64 s[48:49], s[0:1]
 ; GFX7LESS-DPP-NEXT:    v_mbcnt_lo_u32_b32_e64 v5, exec_lo, 0
 ; GFX7LESS-DPP-NEXT:    s_bcnt1_i32_b64 s0, exec
 ; GFX7LESS-DPP-NEXT:    s_mov_b32 s1, 0x43300000
@@ -12592,15 +12592,15 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_defau
 ; GFX7LESS-DPP-NEXT:    s_cbranch_execz .LBB16_3
 ; GFX7LESS-DPP-NEXT:  ; %bb.1:
 ; GFX7LESS-DPP-NEXT:    s_mov_b32 s33, s10
-; GFX7LESS-DPP-NEXT:    s_mov_b32 s42, s9
-; GFX7LESS-DPP-NEXT:    s_mov_b32 s43, s8
+; GFX7LESS-DPP-NEXT:    s_mov_b32 s50, s9
+; GFX7LESS-DPP-NEXT:    s_mov_b32 s51, s8
 ; GFX7LESS-DPP-NEXT:    s_mov_b64 s[34:35], s[6:7]
 ; GFX7LESS-DPP-NEXT:    s_mov_b64 s[36:37], s[4:5]
 ; GFX7LESS-DPP-NEXT:    s_mov_b64 s[38:39], s[2:3]
-; GFX7LESS-DPP-NEXT:    s_load_dwordx2 s[44:45], s[4:5], 0x9
+; GFX7LESS-DPP-NEXT:    s_load_dwordx2 s[52:53], s[4:5], 0x9
 ; GFX7LESS-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7LESS-DPP-NEXT:    s_load_dwordx2 s[0:1], s[44:45], 0x0
-; GFX7LESS-DPP-NEXT:    s_mov_b64 s[46:47], 0
+; GFX7LESS-DPP-NEXT:    s_load_dwordx2 s[0:1], s[52:53], 0x0
+; GFX7LESS-DPP-NEXT:    s_mov_b64 s[54:55], 0
 ; GFX7LESS-DPP-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
 ; GFX7LESS-DPP-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
 ; GFX7LESS-DPP-NEXT:    v_or_b32_e32 v3, v0, v1
@@ -12612,11 +12612,11 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_defau
 ; GFX7LESS-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX7LESS-DPP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7LESS-DPP-NEXT:    v_add_f64 v[2:3], v[0:1], -v[41:42]
-; GFX7LESS-DPP-NEXT:    buffer_store_dword v1, off, s[48:51], 0 offset:4
-; GFX7LESS-DPP-NEXT:    buffer_store_dword v0, off, s[48:51], 0
+; GFX7LESS-DPP-NEXT:    buffer_store_dword v1, off, s[64:67], 0 offset:4
+; GFX7LESS-DPP-NEXT:    buffer_store_dword v0, off, s[64:67], 0
 ; GFX7LESS-DPP-NEXT:    s_add_u32 s8, s36, 44
-; GFX7LESS-DPP-NEXT:    buffer_store_dword v3, off, s[48:51], 0 offset:12
-; GFX7LESS-DPP-NEXT:    buffer_store_dword v2, off, s[48:51], 0 offset:8
+; GFX7LESS-DPP-NEXT:    buffer_store_dword v3, off, s[64:67], 0 offset:12
+; GFX7LESS-DPP-NEXT:    buffer_store_dword v2, off, s[64:67], 0 offset:8
 ; GFX7LESS-DPP-NEXT:    s_addc_u32 s9, s37, 0
 ; GFX7LESS-DPP-NEXT:    s_getpc_b64 s[0:1]
 ; GFX7LESS-DPP-NEXT:    s_add_u32 s0, s0, __atomic_compare_exchange at gotpcrel32@lo+4
@@ -12629,40 +12629,40 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_defau
 ; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v5, 8
 ; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v7, 0
-; GFX7LESS-DPP-NEXT:    s_mov_b64 s[4:5], s[40:41]
+; GFX7LESS-DPP-NEXT:    s_mov_b64 s[4:5], s[48:49]
 ; GFX7LESS-DPP-NEXT:    s_mov_b64 s[6:7], s[38:39]
 ; GFX7LESS-DPP-NEXT:    s_mov_b64 s[10:11], s[34:35]
-; GFX7LESS-DPP-NEXT:    s_mov_b32 s12, s43
-; GFX7LESS-DPP-NEXT:    s_mov_b32 s13, s42
+; GFX7LESS-DPP-NEXT:    s_mov_b32 s12, s51
+; GFX7LESS-DPP-NEXT:    s_mov_b32 s13, s50
 ; GFX7LESS-DPP-NEXT:    s_mov_b32 s14, s33
 ; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v31, v40
-; GFX7LESS-DPP-NEXT:    s_mov_b64 s[0:1], s[48:49]
-; GFX7LESS-DPP-NEXT:    s_mov_b64 s[2:3], s[50:51]
+; GFX7LESS-DPP-NEXT:    s_mov_b64 s[0:1], s[64:65]
+; GFX7LESS-DPP-NEXT:    s_mov_b64 s[2:3], s[66:67]
 ; GFX7LESS-DPP-NEXT:    s_waitcnt expcnt(0)
-; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v2, s44
-; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v3, s45
+; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v2, s52
+; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v3, s53
 ; GFX7LESS-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7LESS-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GFX7LESS-DPP-NEXT:    v_and_b32_e32 v2, 1, v0
-; GFX7LESS-DPP-NEXT:    buffer_load_dword v0, off, s[48:51], 0
-; GFX7LESS-DPP-NEXT:    buffer_load_dword v1, off, s[48:51], 0 offset:4
+; GFX7LESS-DPP-NEXT:    buffer_load_dword v0, off, s[64:67], 0
+; GFX7LESS-DPP-NEXT:    buffer_load_dword v1, off, s[64:67], 0 offset:4
 ; GFX7LESS-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v2
-; GFX7LESS-DPP-NEXT:    s_or_b64 s[46:47], vcc, s[46:47]
-; GFX7LESS-DPP-NEXT:    s_andn2_b64 exec, exec, s[46:47]
+; GFX7LESS-DPP-NEXT:    s_or_b64 s[54:55], vcc, s[54:55]
+; GFX7LESS-DPP-NEXT:    s_andn2_b64 exec, exec, s[54:55]
 ; GFX7LESS-DPP-NEXT:    s_cbranch_execnz .LBB16_2
 ; GFX7LESS-DPP-NEXT:  .LBB16_3:
 ; GFX7LESS-DPP-NEXT:    s_endpgm
 ;
 ; GFX9-DPP-LABEL: global_atomic_fsub_double_uni_address_uni_value_default_scope_strictfp:
 ; GFX9-DPP:       ; %bb.0:
-; GFX9-DPP-NEXT:    s_mov_b32 s48, SCRATCH_RSRC_DWORD0
-; GFX9-DPP-NEXT:    s_mov_b32 s49, SCRATCH_RSRC_DWORD1
-; GFX9-DPP-NEXT:    s_mov_b32 s50, -1
-; GFX9-DPP-NEXT:    s_mov_b32 s51, 0xe00000
-; GFX9-DPP-NEXT:    s_add_u32 s48, s48, s11
+; GFX9-DPP-NEXT:    s_mov_b32 s64, SCRATCH_RSRC_DWORD0
+; GFX9-DPP-NEXT:    s_mov_b32 s65, SCRATCH_RSRC_DWORD1
+; GFX9-DPP-NEXT:    s_mov_b32 s66, -1
+; GFX9-DPP-NEXT:    s_mov_b32 s67, 0xe00000
+; GFX9-DPP-NEXT:    s_add_u32 s64, s64, s11
 ; GFX9-DPP-NEXT:    v_mov_b32_e32 v3, 0
-; GFX9-DPP-NEXT:    s_addc_u32 s49, s49, 0
-; GFX9-DPP-NEXT:    s_mov_b64 s[40:41], s[0:1]
+; GFX9-DPP-NEXT:    s_addc_u32 s65, s65, 0
+; GFX9-DPP-NEXT:    s_mov_b64 s[48:49], s[0:1]
 ; GFX9-DPP-NEXT:    s_bcnt1_i32_b64 s0, exec
 ; GFX9-DPP-NEXT:    v_mov_b32_e32 v4, 0xc3300000
 ; GFX9-DPP-NEXT:    s_mov_b32 s1, 0x43300000
@@ -12675,20 +12675,20 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_defau
 ; GFX9-DPP-NEXT:    s_and_saveexec_b64 s[0:1], vcc
 ; GFX9-DPP-NEXT:    s_cbranch_execz .LBB16_3
 ; GFX9-DPP-NEXT:  ; %bb.1:
-; GFX9-DPP-NEXT:    s_load_dwordx2 s[44:45], s[4:5], 0x24
+; GFX9-DPP-NEXT:    s_load_dwordx2 s[52:53], s[4:5], 0x24
 ; GFX9-DPP-NEXT:    v_lshlrev_b32_e32 v3, 20, v2
 ; GFX9-DPP-NEXT:    v_lshlrev_b32_e32 v4, 10, v1
 ; GFX9-DPP-NEXT:    s_mov_b32 s33, s10
-; GFX9-DPP-NEXT:    s_mov_b32 s42, s9
+; GFX9-DPP-NEXT:    s_mov_b32 s50, s9
 ; GFX9-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-DPP-NEXT:    s_load_dwordx2 s[0:1], s[44:45], 0x0
-; GFX9-DPP-NEXT:    s_mov_b32 s43, s8
+; GFX9-DPP-NEXT:    s_load_dwordx2 s[0:1], s[52:53], 0x0
+; GFX9-DPP-NEXT:    s_mov_b32 s51, s8
 ; GFX9-DPP-NEXT:    s_mov_b64 s[34:35], s[6:7]
 ; GFX9-DPP-NEXT:    s_mov_b64 s[36:37], s[4:5]
 ; GFX9-DPP-NEXT:    s_mov_b64 s[38:39], s[2:3]
 ; GFX9-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-DPP-NEXT:    v_mov_b32_e32 v2, s1
-; GFX9-DPP-NEXT:    s_mov_b64 s[46:47], 0
+; GFX9-DPP-NEXT:    s_mov_b64 s[54:55], 0
 ; GFX9-DPP-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX9-DPP-NEXT:    v_or3_b32 v40, v0, v4, v3
 ; GFX9-DPP-NEXT:  .LBB16_2: ; %atomicrmw.start
@@ -12701,48 +12701,48 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_defau
 ; GFX9-DPP-NEXT:    s_add_u32 s0, s0, __atomic_compare_exchange at gotpcrel32@lo+4
 ; GFX9-DPP-NEXT:    s_addc_u32 s1, s1, __atomic_compare_exchange at gotpcrel32@hi+12
 ; GFX9-DPP-NEXT:    s_load_dwordx2 s[16:17], s[0:1], 0x0
-; GFX9-DPP-NEXT:    s_mov_b64 s[0:1], s[48:49]
-; GFX9-DPP-NEXT:    buffer_store_dword v2, off, s[48:51], 0 offset:4
-; GFX9-DPP-NEXT:    buffer_store_dword v1, off, s[48:51], 0
-; GFX9-DPP-NEXT:    s_mov_b64 s[4:5], s[40:41]
-; GFX9-DPP-NEXT:    buffer_store_dword v4, off, s[48:51], 0 offset:12
-; GFX9-DPP-NEXT:    buffer_store_dword v3, off, s[48:51], 0 offset:8
+; GFX9-DPP-NEXT:    s_mov_b64 s[0:1], s[64:65]
+; GFX9-DPP-NEXT:    buffer_store_dword v2, off, s[64:67], 0 offset:4
+; GFX9-DPP-NEXT:    buffer_store_dword v1, off, s[64:67], 0
+; GFX9-DPP-NEXT:    s_mov_b64 s[4:5], s[48:49]
+; GFX9-DPP-NEXT:    buffer_store_dword v4, off, s[64:67], 0 offset:12
+; GFX9-DPP-NEXT:    buffer_store_dword v3, off, s[64:67], 0 offset:8
 ; GFX9-DPP-NEXT:    s_mov_b64 s[6:7], s[38:39]
 ; GFX9-DPP-NEXT:    s_mov_b64 s[10:11], s[34:35]
-; GFX9-DPP-NEXT:    s_mov_b32 s12, s43
-; GFX9-DPP-NEXT:    s_mov_b32 s13, s42
+; GFX9-DPP-NEXT:    s_mov_b32 s12, s51
+; GFX9-DPP-NEXT:    s_mov_b32 s13, s50
 ; GFX9-DPP-NEXT:    s_mov_b32 s14, s33
 ; GFX9-DPP-NEXT:    v_mov_b32_e32 v31, v40
-; GFX9-DPP-NEXT:    s_mov_b64 s[2:3], s[50:51]
+; GFX9-DPP-NEXT:    s_mov_b64 s[2:3], s[66:67]
 ; GFX9-DPP-NEXT:    v_mov_b32_e32 v0, 8
 ; GFX9-DPP-NEXT:    v_mov_b32_e32 v1, 0
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v2, s44
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v3, s45
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v2, s52
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v3, s53
 ; GFX9-DPP-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX9-DPP-NEXT:    v_mov_b32_e32 v5, 8
 ; GFX9-DPP-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX9-DPP-NEXT:    v_mov_b32_e32 v7, 0
 ; GFX9-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
-; GFX9-DPP-NEXT:    buffer_load_dword v1, off, s[48:51], 0
-; GFX9-DPP-NEXT:    buffer_load_dword v2, off, s[48:51], 0 offset:4
+; GFX9-DPP-NEXT:    buffer_load_dword v1, off, s[64:67], 0
+; GFX9-DPP-NEXT:    buffer_load_dword v2, off, s[64:67], 0 offset:4
 ; GFX9-DPP-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX9-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX9-DPP-NEXT:    s_or_b64 s[46:47], vcc, s[46:47]
-; GFX9-DPP-NEXT:    s_andn2_b64 exec, exec, s[46:47]
+; GFX9-DPP-NEXT:    s_or_b64 s[54:55], vcc, s[54:55]
+; GFX9-DPP-NEXT:    s_andn2_b64 exec, exec, s[54:55]
 ; GFX9-DPP-NEXT:    s_cbranch_execnz .LBB16_2
 ; GFX9-DPP-NEXT:  .LBB16_3:
 ; GFX9-DPP-NEXT:    s_endpgm
 ;
 ; GFX1064-DPP-LABEL: global_atomic_fsub_double_uni_address_uni_value_default_scope_strictfp:
 ; GFX1064-DPP:       ; %bb.0:
-; GFX1064-DPP-NEXT:    s_mov_b32 s48, SCRATCH_RSRC_DWORD0
-; GFX1064-DPP-NEXT:    s_mov_b32 s49, SCRATCH_RSRC_DWORD1
-; GFX1064-DPP-NEXT:    s_mov_b32 s50, -1
-; GFX1064-DPP-NEXT:    s_mov_b32 s51, 0x31e16000
-; GFX1064-DPP-NEXT:    s_add_u32 s48, s48, s11
-; GFX1064-DPP-NEXT:    s_addc_u32 s49, s49, 0
-; GFX1064-DPP-NEXT:    s_mov_b64 s[40:41], s[0:1]
+; GFX1064-DPP-NEXT:    s_mov_b32 s64, SCRATCH_RSRC_DWORD0
+; GFX1064-DPP-NEXT:    s_mov_b32 s65, SCRATCH_RSRC_DWORD1
+; GFX1064-DPP-NEXT:    s_mov_b32 s66, -1
+; GFX1064-DPP-NEXT:    s_mov_b32 s67, 0x31e16000
+; GFX1064-DPP-NEXT:    s_add_u32 s64, s64, s11
+; GFX1064-DPP-NEXT:    s_addc_u32 s65, s65, 0
+; GFX1064-DPP-NEXT:    s_mov_b64 s[48:49], s[0:1]
 ; GFX1064-DPP-NEXT:    s_bcnt1_i32_b64 s0, exec
 ; GFX1064-DPP-NEXT:    s_mov_b32 s1, 0x43300000
 ; GFX1064-DPP-NEXT:    s_movk_i32 s32, 0x800
@@ -12754,19 +12754,19 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_defau
 ; GFX1064-DPP-NEXT:    s_and_saveexec_b64 s[0:1], vcc
 ; GFX1064-DPP-NEXT:    s_cbranch_execz .LBB16_3
 ; GFX1064-DPP-NEXT:  ; %bb.1:
-; GFX1064-DPP-NEXT:    s_load_dwordx2 s[44:45], s[4:5], 0x24
+; GFX1064-DPP-NEXT:    s_load_dwordx2 s[52:53], s[4:5], 0x24
 ; GFX1064-DPP-NEXT:    v_lshlrev_b32_e32 v3, 20, v2
 ; GFX1064-DPP-NEXT:    v_lshlrev_b32_e32 v4, 10, v1
 ; GFX1064-DPP-NEXT:    s_mov_b32 s33, s10
-; GFX1064-DPP-NEXT:    s_mov_b32 s42, s9
-; GFX1064-DPP-NEXT:    s_mov_b32 s43, s8
+; GFX1064-DPP-NEXT:    s_mov_b32 s50, s9
+; GFX1064-DPP-NEXT:    s_mov_b32 s51, s8
 ; GFX1064-DPP-NEXT:    s_mov_b64 s[34:35], s[6:7]
 ; GFX1064-DPP-NEXT:    v_or3_b32 v40, v0, v4, v3
 ; GFX1064-DPP-NEXT:    s_mov_b64 s[36:37], s[4:5]
 ; GFX1064-DPP-NEXT:    s_mov_b64 s[38:39], s[2:3]
-; GFX1064-DPP-NEXT:    s_mov_b64 s[46:47], 0
+; GFX1064-DPP-NEXT:    s_mov_b64 s[54:55], 0
 ; GFX1064-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1064-DPP-NEXT:    s_load_dwordx2 s[0:1], s[44:45], 0x0
+; GFX1064-DPP-NEXT:    s_load_dwordx2 s[0:1], s[52:53], 0x0
 ; GFX1064-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1064-DPP-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX1064-DPP-NEXT:    v_mov_b32_e32 v1, s0
@@ -12779,53 +12779,53 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_defau
 ; GFX1064-DPP-NEXT:    s_getpc_b64 s[0:1]
 ; GFX1064-DPP-NEXT:    s_add_u32 s0, s0, __atomic_compare_exchange at gotpcrel32@lo+4
 ; GFX1064-DPP-NEXT:    s_addc_u32 s1, s1, __atomic_compare_exchange at gotpcrel32@hi+12
-; GFX1064-DPP-NEXT:    buffer_store_dword v2, off, s[48:51], 0 offset:4
-; GFX1064-DPP-NEXT:    buffer_store_dword v1, off, s[48:51], 0
+; GFX1064-DPP-NEXT:    buffer_store_dword v2, off, s[64:67], 0 offset:4
+; GFX1064-DPP-NEXT:    buffer_store_dword v1, off, s[64:67], 0
 ; GFX1064-DPP-NEXT:    s_load_dwordx2 s[16:17], s[0:1], 0x0
 ; GFX1064-DPP-NEXT:    v_mov_b32_e32 v31, v40
 ; GFX1064-DPP-NEXT:    v_mov_b32_e32 v0, 8
 ; GFX1064-DPP-NEXT:    v_mov_b32_e32 v1, 0
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v2, s44
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v2, s52
 ; GFX1064-DPP-NEXT:    v_mov_b32_e32 v5, 8
 ; GFX1064-DPP-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX1064-DPP-NEXT:    v_mov_b32_e32 v7, 0
-; GFX1064-DPP-NEXT:    s_mov_b64 s[0:1], s[48:49]
-; GFX1064-DPP-NEXT:    s_mov_b64 s[4:5], s[40:41]
+; GFX1064-DPP-NEXT:    s_mov_b64 s[0:1], s[64:65]
+; GFX1064-DPP-NEXT:    s_mov_b64 s[4:5], s[48:49]
 ; GFX1064-DPP-NEXT:    s_mov_b64 s[6:7], s[38:39]
 ; GFX1064-DPP-NEXT:    s_mov_b64 s[10:11], s[34:35]
-; GFX1064-DPP-NEXT:    s_mov_b32 s12, s43
-; GFX1064-DPP-NEXT:    s_mov_b32 s13, s42
+; GFX1064-DPP-NEXT:    s_mov_b32 s12, s51
+; GFX1064-DPP-NEXT:    s_mov_b32 s13, s50
 ; GFX1064-DPP-NEXT:    s_mov_b32 s14, s33
-; GFX1064-DPP-NEXT:    s_mov_b64 s[2:3], s[50:51]
-; GFX1064-DPP-NEXT:    buffer_store_dword v4, off, s[48:51], 0 offset:12
-; GFX1064-DPP-NEXT:    buffer_store_dword v3, off, s[48:51], 0 offset:8
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v3, s45
+; GFX1064-DPP-NEXT:    s_mov_b64 s[2:3], s[66:67]
+; GFX1064-DPP-NEXT:    buffer_store_dword v4, off, s[64:67], 0 offset:12
+; GFX1064-DPP-NEXT:    buffer_store_dword v3, off, s[64:67], 0 offset:8
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v3, s53
 ; GFX1064-DPP-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX1064-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1064-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GFX1064-DPP-NEXT:    s_clause 0x1
-; GFX1064-DPP-NEXT:    buffer_load_dword v1, off, s[48:51], 0
-; GFX1064-DPP-NEXT:    buffer_load_dword v2, off, s[48:51], 0 offset:4
+; GFX1064-DPP-NEXT:    buffer_load_dword v1, off, s[64:67], 0
+; GFX1064-DPP-NEXT:    buffer_load_dword v2, off, s[64:67], 0 offset:4
 ; GFX1064-DPP-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX1064-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX1064-DPP-NEXT:    s_or_b64 s[46:47], vcc, s[46:47]
-; GFX1064-DPP-NEXT:    s_andn2_b64 exec, exec, s[46:47]
+; GFX1064-DPP-NEXT:    s_or_b64 s[54:55], vcc, s[54:55]
+; GFX1064-DPP-NEXT:    s_andn2_b64 exec, exec, s[54:55]
 ; GFX1064-DPP-NEXT:    s_cbranch_execnz .LBB16_2
 ; GFX1064-DPP-NEXT:  .LBB16_3:
 ; GFX1064-DPP-NEXT:    s_endpgm
 ;
 ; GFX1032-DPP-LABEL: global_atomic_fsub_double_uni_address_uni_value_default_scope_strictfp:
 ; GFX1032-DPP:       ; %bb.0:
-; GFX1032-DPP-NEXT:    s_mov_b32 s48, SCRATCH_RSRC_DWORD0
-; GFX1032-DPP-NEXT:    s_mov_b32 s49, SCRATCH_RSRC_DWORD1
-; GFX1032-DPP-NEXT:    s_mov_b32 s50, -1
-; GFX1032-DPP-NEXT:    s_mov_b32 s51, 0x31c16000
-; GFX1032-DPP-NEXT:    s_add_u32 s48, s48, s11
-; GFX1032-DPP-NEXT:    s_addc_u32 s49, s49, 0
-; GFX1032-DPP-NEXT:    s_mov_b64 s[40:41], s[0:1]
+; GFX1032-DPP-NEXT:    s_mov_b32 s64, SCRATCH_RSRC_DWORD0
+; GFX1032-DPP-NEXT:    s_mov_b32 s65, SCRATCH_RSRC_DWORD1
+; GFX1032-DPP-NEXT:    s_mov_b32 s66, -1
+; GFX1032-DPP-NEXT:    s_mov_b32 s67, 0x31c16000
+; GFX1032-DPP-NEXT:    s_add_u32 s64, s64, s11
+; GFX1032-DPP-NEXT:    s_addc_u32 s65, s65, 0
+; GFX1032-DPP-NEXT:    s_mov_b64 s[48:49], s[0:1]
 ; GFX1032-DPP-NEXT:    s_bcnt1_i32_b32 s0, exec_lo
 ; GFX1032-DPP-NEXT:    s_mov_b32 s1, 0x43300000
-; GFX1032-DPP-NEXT:    s_mov_b32 s46, 0
+; GFX1032-DPP-NEXT:    s_mov_b32 s54, 0
 ; GFX1032-DPP-NEXT:    v_add_f64 v[3:4], 0xc3300000, s[0:1]
 ; GFX1032-DPP-NEXT:    s_movk_i32 s32, 0x400
 ; GFX1032-DPP-NEXT:    v_mul_f64 v[41:42], 4.0, v[3:4]
@@ -12834,18 +12834,18 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_defau
 ; GFX1032-DPP-NEXT:    s_and_saveexec_b32 s0, vcc_lo
 ; GFX1032-DPP-NEXT:    s_cbranch_execz .LBB16_3
 ; GFX1032-DPP-NEXT:  ; %bb.1:
-; GFX1032-DPP-NEXT:    s_load_dwordx2 s[44:45], s[4:5], 0x24
+; GFX1032-DPP-NEXT:    s_load_dwordx2 s[52:53], s[4:5], 0x24
 ; GFX1032-DPP-NEXT:    v_lshlrev_b32_e32 v3, 20, v2
 ; GFX1032-DPP-NEXT:    v_lshlrev_b32_e32 v4, 10, v1
 ; GFX1032-DPP-NEXT:    s_mov_b32 s33, s10
-; GFX1032-DPP-NEXT:    s_mov_b32 s42, s9
-; GFX1032-DPP-NEXT:    s_mov_b32 s43, s8
+; GFX1032-DPP-NEXT:    s_mov_b32 s50, s9
+; GFX1032-DPP-NEXT:    s_mov_b32 s51, s8
 ; GFX1032-DPP-NEXT:    s_mov_b64 s[34:35], s[6:7]
 ; GFX1032-DPP-NEXT:    v_or3_b32 v40, v0, v4, v3
 ; GFX1032-DPP-NEXT:    s_mov_b64 s[36:37], s[4:5]
 ; GFX1032-DPP-NEXT:    s_mov_b64 s[38:39], s[2:3]
 ; GFX1032-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT:    s_load_dwordx2 s[0:1], s[44:45], 0x0
+; GFX1032-DPP-NEXT:    s_load_dwordx2 s[0:1], s[52:53], 0x0
 ; GFX1032-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v1, s0
@@ -12858,44 +12858,44 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_defau
 ; GFX1032-DPP-NEXT:    s_getpc_b64 s[0:1]
 ; GFX1032-DPP-NEXT:    s_add_u32 s0, s0, __atomic_compare_exchange at gotpcrel32@lo+4
 ; GFX1032-DPP-NEXT:    s_addc_u32 s1, s1, __atomic_compare_exchange at gotpcrel32@hi+12
-; GFX1032-DPP-NEXT:    buffer_store_dword v2, off, s[48:51], 0 offset:4
-; GFX1032-DPP-NEXT:    buffer_store_dword v1, off, s[48:51], 0
+; GFX1032-DPP-NEXT:    buffer_store_dword v2, off, s[64:67], 0 offset:4
+; GFX1032-DPP-NEXT:    buffer_store_dword v1, off, s[64:67], 0
 ; GFX1032-DPP-NEXT:    s_load_dwordx2 s[16:17], s[0:1], 0x0
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v31, v40
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v0, 8
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v1, 0
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v2, s44
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v2, s52
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v5, 8
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v7, 0
-; GFX1032-DPP-NEXT:    s_mov_b64 s[0:1], s[48:49]
-; GFX1032-DPP-NEXT:    s_mov_b64 s[4:5], s[40:41]
+; GFX1032-DPP-NEXT:    s_mov_b64 s[0:1], s[64:65]
+; GFX1032-DPP-NEXT:    s_mov_b64 s[4:5], s[48:49]
 ; GFX1032-DPP-NEXT:    s_mov_b64 s[6:7], s[38:39]
 ; GFX1032-DPP-NEXT:    s_mov_b64 s[10:11], s[34:35]
-; GFX1032-DPP-NEXT:    s_mov_b32 s12, s43
-; GFX1032-DPP-NEXT:    s_mov_b32 s13, s42
+; GFX1032-DPP-NEXT:    s_mov_b32 s12, s51
+; GFX1032-DPP-NEXT:    s_mov_b32 s13, s50
 ; GFX1032-DPP-NEXT:    s_mov_b32 s14, s33
-; GFX1032-DPP-NEXT:    s_mov_b64 s[2:3], s[50:51]
-; GFX1032-DPP-NEXT:    buffer_store_dword v4, off, s[48:51], 0 offset:12
-; GFX1032-DPP-NEXT:    buffer_store_dword v3, off, s[48:51], 0 offset:8
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v3, s45
+; GFX1032-DPP-NEXT:    s_mov_b64 s[2:3], s[66:67]
+; GFX1032-DPP-NEXT:    buffer_store_dword v4, off, s[64:67], 0 offset:12
+; GFX1032-DPP-NEXT:    buffer_store_dword v3, off, s[64:67], 0 offset:8
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v3, s53
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX1032-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1032-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GFX1032-DPP-NEXT:    s_clause 0x1
-; GFX1032-DPP-NEXT:    buffer_load_dword v1, off, s[48:51], 0
-; GFX1032-DPP-NEXT:    buffer_load_dword v2, off, s[48:51], 0 offset:4
+; GFX1032-DPP-NEXT:    buffer_load_dword v1, off, s[64:67], 0
+; GFX1032-DPP-NEXT:    buffer_load_dword v2, off, s[64:67], 0 offset:4
 ; GFX1032-DPP-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX1032-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX1032-DPP-NEXT:    s_or_b32 s46, vcc_lo, s46
-; GFX1032-DPP-NEXT:    s_andn2_b32 exec_lo, exec_lo, s46
+; GFX1032-DPP-NEXT:    s_or_b32 s54, vcc_lo, s54
+; GFX1032-DPP-NEXT:    s_andn2_b32 exec_lo, exec_lo, s54
 ; GFX1032-DPP-NEXT:    s_cbranch_execnz .LBB16_2
 ; GFX1032-DPP-NEXT:  .LBB16_3:
 ; GFX1032-DPP-NEXT:    s_endpgm
 ;
 ; GFX1164-DPP-LABEL: global_atomic_fsub_double_uni_address_uni_value_default_scope_strictfp:
 ; GFX1164-DPP:       ; %bb.0:
-; GFX1164-DPP-NEXT:    s_mov_b64 s[40:41], s[0:1]
+; GFX1164-DPP-NEXT:    s_mov_b64 s[48:49], s[0:1]
 ; GFX1164-DPP-NEXT:    s_bcnt1_i32_b64 s0, exec
 ; GFX1164-DPP-NEXT:    v_mov_b32_e32 v40, v0
 ; GFX1164-DPP-NEXT:    v_mov_b32_e32 v0, 0x43300000
@@ -12916,16 +12916,16 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_defau
 ; GFX1164-DPP-NEXT:    v_cmpx_eq_u32_e32 0, v0
 ; GFX1164-DPP-NEXT:    s_cbranch_execz .LBB16_3
 ; GFX1164-DPP-NEXT:  ; %bb.1:
-; GFX1164-DPP-NEXT:    s_load_b64 s[44:45], s[4:5], 0x24
+; GFX1164-DPP-NEXT:    s_load_b64 s[52:53], s[4:5], 0x24
 ; GFX1164-DPP-NEXT:    s_mov_b32 s33, s10
-; GFX1164-DPP-NEXT:    s_mov_b32 s42, s9
-; GFX1164-DPP-NEXT:    s_mov_b32 s43, s8
+; GFX1164-DPP-NEXT:    s_mov_b32 s50, s9
+; GFX1164-DPP-NEXT:    s_mov_b32 s51, s8
 ; GFX1164-DPP-NEXT:    s_mov_b64 s[34:35], s[6:7]
 ; GFX1164-DPP-NEXT:    s_mov_b64 s[36:37], s[4:5]
 ; GFX1164-DPP-NEXT:    s_mov_b64 s[38:39], s[2:3]
-; GFX1164-DPP-NEXT:    s_mov_b64 s[46:47], 0
+; GFX1164-DPP-NEXT:    s_mov_b64 s[54:55], 0
 ; GFX1164-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT:    s_load_b64 s[0:1], s[44:45], 0x0
+; GFX1164-DPP-NEXT:    s_load_b64 s[0:1], s[52:53], 0x0
 ; GFX1164-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1164-DPP-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX1164-DPP-NEXT:    v_mov_b32_e32 v1, s0
@@ -12946,18 +12946,18 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_defau
 ; GFX1164-DPP-NEXT:    v_mov_b32_e32 v31, v40
 ; GFX1164-DPP-NEXT:    v_mov_b32_e32 v0, 8
 ; GFX1164-DPP-NEXT:    v_mov_b32_e32 v1, 0
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v2, s44
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v2, s52
 ; GFX1164-DPP-NEXT:    v_mov_b32_e32 v5, 8
 ; GFX1164-DPP-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX1164-DPP-NEXT:    v_mov_b32_e32 v7, 0
-; GFX1164-DPP-NEXT:    s_mov_b64 s[4:5], s[40:41]
+; GFX1164-DPP-NEXT:    s_mov_b64 s[4:5], s[48:49]
 ; GFX1164-DPP-NEXT:    s_mov_b64 s[6:7], s[38:39]
 ; GFX1164-DPP-NEXT:    s_mov_b64 s[10:11], s[34:35]
-; GFX1164-DPP-NEXT:    s_mov_b32 s12, s43
-; GFX1164-DPP-NEXT:    s_mov_b32 s13, s42
+; GFX1164-DPP-NEXT:    s_mov_b32 s12, s51
+; GFX1164-DPP-NEXT:    s_mov_b32 s13, s50
 ; GFX1164-DPP-NEXT:    s_mov_b32 s14, s33
 ; GFX1164-DPP-NEXT:    scratch_store_b64 off, v[3:4], off offset:8
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v3, s45
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v3, s53
 ; GFX1164-DPP-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX1164-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1164-DPP-NEXT:    s_swappc_b64 s[30:31], s[0:1]
@@ -12965,8 +12965,8 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_defau
 ; GFX1164-DPP-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
 ; GFX1164-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX1164-DPP-NEXT:    s_or_b64 s[46:47], vcc, s[46:47]
-; GFX1164-DPP-NEXT:    s_and_not1_b64 exec, exec, s[46:47]
+; GFX1164-DPP-NEXT:    s_or_b64 s[54:55], vcc, s[54:55]
+; GFX1164-DPP-NEXT:    s_and_not1_b64 exec, exec, s[54:55]
 ; GFX1164-DPP-NEXT:    s_cbranch_execnz .LBB16_2
 ; GFX1164-DPP-NEXT:  .LBB16_3:
 ; GFX1164-DPP-NEXT:    s_set_inst_prefetch_distance 0x2
@@ -12974,12 +12974,12 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_defau
 ;
 ; GFX1132-DPP-LABEL: global_atomic_fsub_double_uni_address_uni_value_default_scope_strictfp:
 ; GFX1132-DPP:       ; %bb.0:
-; GFX1132-DPP-NEXT:    s_mov_b64 s[40:41], s[0:1]
+; GFX1132-DPP-NEXT:    s_mov_b64 s[48:49], s[0:1]
 ; GFX1132-DPP-NEXT:    s_bcnt1_i32_b32 s0, exec_lo
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX1132-DPP-NEXT:    v_dual_mov_b32 v40, v0 :: v_dual_mov_b32 v1, s0
 ; GFX1132-DPP-NEXT:    v_mov_b32_e32 v0, 0x43300000
-; GFX1132-DPP-NEXT:    s_mov_b32 s46, 0
+; GFX1132-DPP-NEXT:    s_mov_b32 s54, 0
 ; GFX1132-DPP-NEXT:    s_clause 0x1
 ; GFX1132-DPP-NEXT:    scratch_store_b32 off, v0, off offset:20
 ; GFX1132-DPP-NEXT:    scratch_store_b32 off, v1, off offset:16
@@ -12994,15 +12994,15 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_defau
 ; GFX1132-DPP-NEXT:    v_cmpx_eq_u32_e32 0, v0
 ; GFX1132-DPP-NEXT:    s_cbranch_execz .LBB16_3
 ; GFX1132-DPP-NEXT:  ; %bb.1:
-; GFX1132-DPP-NEXT:    s_load_b64 s[44:45], s[4:5], 0x24
+; GFX1132-DPP-NEXT:    s_load_b64 s[52:53], s[4:5], 0x24
 ; GFX1132-DPP-NEXT:    s_mov_b32 s33, s15
-; GFX1132-DPP-NEXT:    s_mov_b32 s42, s14
-; GFX1132-DPP-NEXT:    s_mov_b32 s43, s13
+; GFX1132-DPP-NEXT:    s_mov_b32 s50, s14
+; GFX1132-DPP-NEXT:    s_mov_b32 s51, s13
 ; GFX1132-DPP-NEXT:    s_mov_b64 s[34:35], s[6:7]
 ; GFX1132-DPP-NEXT:    s_mov_b64 s[36:37], s[4:5]
 ; GFX1132-DPP-NEXT:    s_mov_b64 s[38:39], s[2:3]
 ; GFX1132-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT:    s_load_b64 s[0:1], s[44:45], 0x0
+; GFX1132-DPP-NEXT:    s_load_b64 s[0:1], s[52:53], 0x0
 ; GFX1132-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132-DPP-NEXT:    v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v1, s0
 ; GFX1132-DPP-NEXT:    s_set_inst_prefetch_distance 0x1
@@ -13020,25 +13020,25 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_defau
 ; GFX1132-DPP-NEXT:    scratch_store_b64 off, v[1:2], off
 ; GFX1132-DPP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
 ; GFX1132-DPP-NEXT:    v_dual_mov_b32 v31, v40 :: v_dual_mov_b32 v0, 8
-; GFX1132-DPP-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s44
+; GFX1132-DPP-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s52
 ; GFX1132-DPP-NEXT:    v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0
 ; GFX1132-DPP-NEXT:    v_mov_b32_e32 v7, 0
-; GFX1132-DPP-NEXT:    s_mov_b64 s[4:5], s[40:41]
+; GFX1132-DPP-NEXT:    s_mov_b64 s[4:5], s[48:49]
 ; GFX1132-DPP-NEXT:    s_mov_b64 s[6:7], s[38:39]
 ; GFX1132-DPP-NEXT:    s_mov_b64 s[10:11], s[34:35]
-; GFX1132-DPP-NEXT:    s_mov_b32 s12, s43
-; GFX1132-DPP-NEXT:    s_mov_b32 s13, s42
+; GFX1132-DPP-NEXT:    s_mov_b32 s12, s51
+; GFX1132-DPP-NEXT:    s_mov_b32 s13, s50
 ; GFX1132-DPP-NEXT:    s_mov_b32 s14, s33
 ; GFX1132-DPP-NEXT:    scratch_store_b64 off, v[3:4], off offset:8
-; GFX1132-DPP-NEXT:    v_dual_mov_b32 v3, s45 :: v_dual_mov_b32 v4, 0
+; GFX1132-DPP-NEXT:    v_dual_mov_b32 v3, s53 :: v_dual_mov_b32 v4, 0
 ; GFX1132-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132-DPP-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX1132-DPP-NEXT:    scratch_load_b64 v[1:2], off, off
 ; GFX1132-DPP-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
 ; GFX1132-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX1132-DPP-NEXT:    s_or_b32 s46, vcc_lo, s46
-; GFX1132-DPP-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s46
+; GFX1132-DPP-NEXT:    s_or_b32 s54, vcc_lo, s54
+; GFX1132-DPP-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s54
 ; GFX1132-DPP-NEXT:    s_cbranch_execnz .LBB16_2
 ; GFX1132-DPP-NEXT:  .LBB16_3:
 ; GFX1132-DPP-NEXT:    s_set_inst_prefetch_distance 0x2
@@ -13051,19 +13051,19 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau
 ; GFX7LESS-LABEL: global_atomic_fsub_double_uni_address_div_value_default_scope_strictfp:
 ; GFX7LESS:       ; %bb.0:
 ; GFX7LESS-NEXT:    s_movk_i32 s32, 0x800
-; GFX7LESS-NEXT:    s_mov_b32 s48, SCRATCH_RSRC_DWORD0
-; GFX7LESS-NEXT:    s_mov_b32 s49, SCRATCH_RSRC_DWORD1
-; GFX7LESS-NEXT:    s_mov_b32 s50, -1
-; GFX7LESS-NEXT:    s_mov_b32 s51, 0xe8f000
-; GFX7LESS-NEXT:    s_add_u32 s48, s48, s11
-; GFX7LESS-NEXT:    s_addc_u32 s49, s49, 0
+; GFX7LESS-NEXT:    s_mov_b32 s64, SCRATCH_RSRC_DWORD0
+; GFX7LESS-NEXT:    s_mov_b32 s65, SCRATCH_RSRC_DWORD1
+; GFX7LESS-NEXT:    s_mov_b32 s66, -1
+; GFX7LESS-NEXT:    s_mov_b32 s67, 0xe8f000
+; GFX7LESS-NEXT:    s_add_u32 s64, s64, s11
+; GFX7LESS-NEXT:    s_addc_u32 s65, s65, 0
 ; GFX7LESS-NEXT:    s_mov_b32 s33, s10
-; GFX7LESS-NEXT:    s_mov_b32 s42, s9
-; GFX7LESS-NEXT:    s_mov_b32 s43, s8
+; GFX7LESS-NEXT:    s_mov_b32 s50, s9
+; GFX7LESS-NEXT:    s_mov_b32 s51, s8
 ; GFX7LESS-NEXT:    s_mov_b64 s[34:35], s[6:7]
 ; GFX7LESS-NEXT:    s_mov_b64 s[36:37], s[4:5]
 ; GFX7LESS-NEXT:    s_mov_b64 s[38:39], s[2:3]
-; GFX7LESS-NEXT:    s_mov_b64 s[40:41], s[0:1]
+; GFX7LESS-NEXT:    s_mov_b64 s[48:49], s[0:1]
 ; GFX7LESS-NEXT:    s_add_u32 s8, s36, 44
 ; GFX7LESS-NEXT:    s_addc_u32 s9, s37, 0
 ; GFX7LESS-NEXT:    s_getpc_b64 s[0:1]
@@ -13074,15 +13074,15 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau
 ; GFX7LESS-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
 ; GFX7LESS-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GFX7LESS-NEXT:    v_or_b32_e32 v40, v0, v2
-; GFX7LESS-NEXT:    s_mov_b64 s[4:5], s[40:41]
+; GFX7LESS-NEXT:    s_mov_b64 s[4:5], s[48:49]
 ; GFX7LESS-NEXT:    s_mov_b64 s[6:7], s[2:3]
 ; GFX7LESS-NEXT:    s_mov_b64 s[10:11], s[34:35]
-; GFX7LESS-NEXT:    s_mov_b32 s12, s43
-; GFX7LESS-NEXT:    s_mov_b32 s13, s42
+; GFX7LESS-NEXT:    s_mov_b32 s12, s51
+; GFX7LESS-NEXT:    s_mov_b32 s13, s50
 ; GFX7LESS-NEXT:    s_mov_b32 s14, s33
 ; GFX7LESS-NEXT:    v_mov_b32_e32 v31, v40
-; GFX7LESS-NEXT:    s_mov_b64 s[0:1], s[48:49]
-; GFX7LESS-NEXT:    s_mov_b64 s[2:3], s[50:51]
+; GFX7LESS-NEXT:    s_mov_b64 s[0:1], s[64:65]
+; GFX7LESS-NEXT:    s_mov_b64 s[2:3], s[66:67]
 ; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7LESS-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GFX7LESS-NEXT:    s_mov_b64 s[0:1], exec
@@ -13107,21 +13107,21 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau
 ; GFX7LESS-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
 ; GFX7LESS-NEXT:    s_cbranch_execz .LBB17_5
 ; GFX7LESS-NEXT:  ; %bb.3:
-; GFX7LESS-NEXT:    s_load_dwordx2 s[44:45], s[36:37], 0x9
-; GFX7LESS-NEXT:    s_mov_b32 s47, 0xf000
-; GFX7LESS-NEXT:    s_mov_b32 s46, -1
+; GFX7LESS-NEXT:    s_load_dwordx2 s[52:53], s[36:37], 0x9
+; GFX7LESS-NEXT:    s_mov_b32 s55, 0xf000
+; GFX7LESS-NEXT:    s_mov_b32 s54, -1
 ; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7LESS-NEXT:    buffer_load_dwordx2 v[0:1], off, s[44:47], 0
-; GFX7LESS-NEXT:    s_mov_b64 s[46:47], 0
+; GFX7LESS-NEXT:    buffer_load_dwordx2 v[0:1], off, s[52:55], 0
+; GFX7LESS-NEXT:    s_mov_b64 s[54:55], 0
 ; GFX7LESS-NEXT:  .LBB17_4: ; %atomicrmw.start
 ; GFX7LESS-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX7LESS-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7LESS-NEXT:    v_add_f64 v[2:3], v[0:1], -v[41:42]
-; GFX7LESS-NEXT:    buffer_store_dword v1, off, s[48:51], 0 offset:4
-; GFX7LESS-NEXT:    buffer_store_dword v0, off, s[48:51], 0
+; GFX7LESS-NEXT:    buffer_store_dword v1, off, s[64:67], 0 offset:4
+; GFX7LESS-NEXT:    buffer_store_dword v0, off, s[64:67], 0
 ; GFX7LESS-NEXT:    s_add_u32 s8, s36, 44
-; GFX7LESS-NEXT:    buffer_store_dword v3, off, s[48:51], 0 offset:12
-; GFX7LESS-NEXT:    buffer_store_dword v2, off, s[48:51], 0 offset:8
+; GFX7LESS-NEXT:    buffer_store_dword v3, off, s[64:67], 0 offset:12
+; GFX7LESS-NEXT:    buffer_store_dword v2, off, s[64:67], 0 offset:8
 ; GFX7LESS-NEXT:    s_addc_u32 s9, s37, 0
 ; GFX7LESS-NEXT:    s_getpc_b64 s[0:1]
 ; GFX7LESS-NEXT:    s_add_u32 s0, s0, __atomic_compare_exchange at gotpcrel32@lo+4
@@ -13134,44 +13134,44 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau
 ; GFX7LESS-NEXT:    v_mov_b32_e32 v5, 8
 ; GFX7LESS-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX7LESS-NEXT:    v_mov_b32_e32 v7, 0
-; GFX7LESS-NEXT:    s_mov_b64 s[4:5], s[40:41]
+; GFX7LESS-NEXT:    s_mov_b64 s[4:5], s[48:49]
 ; GFX7LESS-NEXT:    s_mov_b64 s[6:7], s[38:39]
 ; GFX7LESS-NEXT:    s_mov_b64 s[10:11], s[34:35]
-; GFX7LESS-NEXT:    s_mov_b32 s12, s43
-; GFX7LESS-NEXT:    s_mov_b32 s13, s42
+; GFX7LESS-NEXT:    s_mov_b32 s12, s51
+; GFX7LESS-NEXT:    s_mov_b32 s13, s50
 ; GFX7LESS-NEXT:    s_mov_b32 s14, s33
 ; GFX7LESS-NEXT:    v_mov_b32_e32 v31, v40
-; GFX7LESS-NEXT:    s_mov_b64 s[0:1], s[48:49]
-; GFX7LESS-NEXT:    s_mov_b64 s[2:3], s[50:51]
+; GFX7LESS-NEXT:    s_mov_b64 s[0:1], s[64:65]
+; GFX7LESS-NEXT:    s_mov_b64 s[2:3], s[66:67]
 ; GFX7LESS-NEXT:    s_waitcnt expcnt(0)
-; GFX7LESS-NEXT:    v_mov_b32_e32 v2, s44
-; GFX7LESS-NEXT:    v_mov_b32_e32 v3, s45
+; GFX7LESS-NEXT:    v_mov_b32_e32 v2, s52
+; GFX7LESS-NEXT:    v_mov_b32_e32 v3, s53
 ; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7LESS-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GFX7LESS-NEXT:    v_and_b32_e32 v2, 1, v0
-; GFX7LESS-NEXT:    buffer_load_dword v0, off, s[48:51], 0
-; GFX7LESS-NEXT:    buffer_load_dword v1, off, s[48:51], 0 offset:4
+; GFX7LESS-NEXT:    buffer_load_dword v0, off, s[64:67], 0
+; GFX7LESS-NEXT:    buffer_load_dword v1, off, s[64:67], 0 offset:4
 ; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v2
-; GFX7LESS-NEXT:    s_or_b64 s[46:47], vcc, s[46:47]
-; GFX7LESS-NEXT:    s_andn2_b64 exec, exec, s[46:47]
+; GFX7LESS-NEXT:    s_or_b64 s[54:55], vcc, s[54:55]
+; GFX7LESS-NEXT:    s_andn2_b64 exec, exec, s[54:55]
 ; GFX7LESS-NEXT:    s_cbranch_execnz .LBB17_4
 ; GFX7LESS-NEXT:  .LBB17_5:
 ; GFX7LESS-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: global_atomic_fsub_double_uni_address_div_value_default_scope_strictfp:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_mov_b32 s48, SCRATCH_RSRC_DWORD0
-; GFX9-NEXT:    s_mov_b32 s49, SCRATCH_RSRC_DWORD1
-; GFX9-NEXT:    s_mov_b32 s50, -1
-; GFX9-NEXT:    s_mov_b32 s51, 0xe00000
-; GFX9-NEXT:    s_add_u32 s48, s48, s11
-; GFX9-NEXT:    s_addc_u32 s49, s49, 0
+; GFX9-NEXT:    s_mov_b32 s64, SCRATCH_RSRC_DWORD0
+; GFX9-NEXT:    s_mov_b32 s65, SCRATCH_RSRC_DWORD1
+; GFX9-NEXT:    s_mov_b32 s66, -1
+; GFX9-NEXT:    s_mov_b32 s67, 0xe00000
+; GFX9-NEXT:    s_add_u32 s64, s64, s11
+; GFX9-NEXT:    s_addc_u32 s65, s65, 0
 ; GFX9-NEXT:    s_mov_b64 s[36:37], s[4:5]
-; GFX9-NEXT:    s_mov_b32 s43, s8
+; GFX9-NEXT:    s_mov_b32 s51, s8
 ; GFX9-NEXT:    s_add_u32 s8, s36, 44
-; GFX9-NEXT:    s_mov_b32 s42, s9
+; GFX9-NEXT:    s_mov_b32 s50, s9
 ; GFX9-NEXT:    s_addc_u32 s9, s37, 0
-; GFX9-NEXT:    s_mov_b64 s[40:41], s[0:1]
+; GFX9-NEXT:    s_mov_b64 s[48:49], s[0:1]
 ; GFX9-NEXT:    s_getpc_b64 s[0:1]
 ; GFX9-NEXT:    s_add_u32 s0, s0, div.float.value at gotpcrel32@lo+4
 ; GFX9-NEXT:    s_addc_u32 s1, s1, div.float.value at gotpcrel32@hi+12
@@ -13183,14 +13183,14 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau
 ; GFX9-NEXT:    s_mov_b64 s[38:39], s[2:3]
 ; GFX9-NEXT:    v_or3_b32 v40, v0, v1, v2
 ; GFX9-NEXT:    s_mov_b64 s[6:7], s[2:3]
-; GFX9-NEXT:    s_mov_b64 s[0:1], s[48:49]
-; GFX9-NEXT:    s_mov_b64 s[4:5], s[40:41]
+; GFX9-NEXT:    s_mov_b64 s[0:1], s[64:65]
+; GFX9-NEXT:    s_mov_b64 s[4:5], s[48:49]
 ; GFX9-NEXT:    s_mov_b64 s[10:11], s[34:35]
-; GFX9-NEXT:    s_mov_b32 s12, s43
-; GFX9-NEXT:    s_mov_b32 s13, s42
+; GFX9-NEXT:    s_mov_b32 s12, s51
+; GFX9-NEXT:    s_mov_b32 s13, s50
 ; GFX9-NEXT:    s_mov_b32 s14, s33
 ; GFX9-NEXT:    v_mov_b32_e32 v31, v40
-; GFX9-NEXT:    s_mov_b64 s[2:3], s[50:51]
+; GFX9-NEXT:    s_mov_b64 s[2:3], s[66:67]
 ; GFX9-NEXT:    s_movk_i32 s32, 0x800
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[16:17]
@@ -13215,11 +13215,11 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau
 ; GFX9-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
 ; GFX9-NEXT:    s_cbranch_execz .LBB17_5
 ; GFX9-NEXT:  ; %bb.3:
-; GFX9-NEXT:    s_load_dwordx2 s[44:45], s[36:37], 0x24
+; GFX9-NEXT:    s_load_dwordx2 s[52:53], s[36:37], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
-; GFX9-NEXT:    s_mov_b64 s[46:47], 0
+; GFX9-NEXT:    s_mov_b64 s[54:55], 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dwordx2 v[1:2], v0, s[44:45]
+; GFX9-NEXT:    global_load_dwordx2 v[1:2], v0, s[52:53]
 ; GFX9-NEXT:  .LBB17_4: ; %atomicrmw.start
 ; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
@@ -13230,53 +13230,53 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau
 ; GFX9-NEXT:    s_add_u32 s0, s0, __atomic_compare_exchange at gotpcrel32@lo+4
 ; GFX9-NEXT:    s_addc_u32 s1, s1, __atomic_compare_exchange at gotpcrel32@hi+12
 ; GFX9-NEXT:    s_load_dwordx2 s[16:17], s[0:1], 0x0
-; GFX9-NEXT:    s_mov_b64 s[0:1], s[48:49]
-; GFX9-NEXT:    buffer_store_dword v2, off, s[48:51], 0 offset:4
-; GFX9-NEXT:    buffer_store_dword v1, off, s[48:51], 0
-; GFX9-NEXT:    s_mov_b64 s[4:5], s[40:41]
-; GFX9-NEXT:    buffer_store_dword v4, off, s[48:51], 0 offset:12
-; GFX9-NEXT:    buffer_store_dword v3, off, s[48:51], 0 offset:8
+; GFX9-NEXT:    s_mov_b64 s[0:1], s[64:65]
+; GFX9-NEXT:    buffer_store_dword v2, off, s[64:67], 0 offset:4
+; GFX9-NEXT:    buffer_store_dword v1, off, s[64:67], 0
+; GFX9-NEXT:    s_mov_b64 s[4:5], s[48:49]
+; GFX9-NEXT:    buffer_store_dword v4, off, s[64:67], 0 offset:12
+; GFX9-NEXT:    buffer_store_dword v3, off, s[64:67], 0 offset:8
 ; GFX9-NEXT:    s_mov_b64 s[6:7], s[38:39]
 ; GFX9-NEXT:    s_mov_b64 s[10:11], s[34:35]
-; GFX9-NEXT:    s_mov_b32 s12, s43
-; GFX9-NEXT:    s_mov_b32 s13, s42
+; GFX9-NEXT:    s_mov_b32 s12, s51
+; GFX9-NEXT:    s_mov_b32 s13, s50
 ; GFX9-NEXT:    s_mov_b32 s14, s33
 ; GFX9-NEXT:    v_mov_b32_e32 v31, v40
-; GFX9-NEXT:    s_mov_b64 s[2:3], s[50:51]
+; GFX9-NEXT:    s_mov_b64 s[2:3], s[66:67]
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 8
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0
-; GFX9-NEXT:    v_mov_b32_e32 v2, s44
-; GFX9-NEXT:    v_mov_b32_e32 v3, s45
+; GFX9-NEXT:    v_mov_b32_e32 v2, s52
+; GFX9-NEXT:    v_mov_b32_e32 v3, s53
 ; GFX9-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v5, 8
 ; GFX9-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v7, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[16:17]
-; GFX9-NEXT:    buffer_load_dword v1, off, s[48:51], 0
-; GFX9-NEXT:    buffer_load_dword v2, off, s[48:51], 0 offset:4
+; GFX9-NEXT:    buffer_load_dword v1, off, s[64:67], 0
+; GFX9-NEXT:    buffer_load_dword v2, off, s[64:67], 0 offset:4
 ; GFX9-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX9-NEXT:    s_or_b64 s[46:47], vcc, s[46:47]
-; GFX9-NEXT:    s_andn2_b64 exec, exec, s[46:47]
+; GFX9-NEXT:    s_or_b64 s[54:55], vcc, s[54:55]
+; GFX9-NEXT:    s_andn2_b64 exec, exec, s[54:55]
 ; GFX9-NEXT:    s_cbranch_execnz .LBB17_4
 ; GFX9-NEXT:  .LBB17_5:
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX1064-LABEL: global_atomic_fsub_double_uni_address_div_value_default_scope_strictfp:
 ; GFX1064:       ; %bb.0:
-; GFX1064-NEXT:    s_mov_b32 s48, SCRATCH_RSRC_DWORD0
-; GFX1064-NEXT:    s_mov_b32 s49, SCRATCH_RSRC_DWORD1
-; GFX1064-NEXT:    s_mov_b32 s50, -1
-; GFX1064-NEXT:    s_mov_b32 s51, 0x31e16000
-; GFX1064-NEXT:    s_add_u32 s48, s48, s11
+; GFX1064-NEXT:    s_mov_b32 s64, SCRATCH_RSRC_DWORD0
+; GFX1064-NEXT:    s_mov_b32 s65, SCRATCH_RSRC_DWORD1
+; GFX1064-NEXT:    s_mov_b32 s66, -1
+; GFX1064-NEXT:    s_mov_b32 s67, 0x31e16000
+; GFX1064-NEXT:    s_add_u32 s64, s64, s11
 ; GFX1064-NEXT:    s_mov_b64 s[34:35], s[4:5]
-; GFX1064-NEXT:    s_addc_u32 s49, s49, 0
-; GFX1064-NEXT:    s_mov_b32 s43, s8
+; GFX1064-NEXT:    s_addc_u32 s65, s65, 0
+; GFX1064-NEXT:    s_mov_b32 s51, s8
 ; GFX1064-NEXT:    s_add_u32 s8, s34, 44
-; GFX1064-NEXT:    s_mov_b32 s42, s9
+; GFX1064-NEXT:    s_mov_b32 s50, s9
 ; GFX1064-NEXT:    s_addc_u32 s9, s35, 0
-; GFX1064-NEXT:    s_mov_b64 s[40:41], s[0:1]
+; GFX1064-NEXT:    s_mov_b64 s[48:49], s[0:1]
 ; GFX1064-NEXT:    s_getpc_b64 s[0:1]
 ; GFX1064-NEXT:    s_add_u32 s0, s0, div.float.value at gotpcrel32@lo+4
 ; GFX1064-NEXT:    s_addc_u32 s1, s1, div.float.value at gotpcrel32@hi+12
@@ -13288,14 +13288,14 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau
 ; GFX1064-NEXT:    s_mov_b64 s[38:39], s[2:3]
 ; GFX1064-NEXT:    s_mov_b64 s[6:7], s[2:3]
 ; GFX1064-NEXT:    v_or3_b32 v40, v0, v1, v2
-; GFX1064-NEXT:    s_mov_b64 s[0:1], s[48:49]
-; GFX1064-NEXT:    s_mov_b64 s[4:5], s[40:41]
+; GFX1064-NEXT:    s_mov_b64 s[0:1], s[64:65]
+; GFX1064-NEXT:    s_mov_b64 s[4:5], s[48:49]
 ; GFX1064-NEXT:    s_mov_b64 s[10:11], s[36:37]
-; GFX1064-NEXT:    s_mov_b32 s12, s43
+; GFX1064-NEXT:    s_mov_b32 s12, s51
 ; GFX1064-NEXT:    v_mov_b32_e32 v31, v40
-; GFX1064-NEXT:    s_mov_b32 s13, s42
+; GFX1064-NEXT:    s_mov_b32 s13, s50
 ; GFX1064-NEXT:    s_mov_b32 s14, s33
-; GFX1064-NEXT:    s_mov_b64 s[2:3], s[50:51]
+; GFX1064-NEXT:    s_mov_b64 s[2:3], s[66:67]
 ; GFX1064-NEXT:    s_movk_i32 s32, 0x800
 ; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1064-NEXT:    s_swappc_b64 s[30:31], s[16:17]
@@ -13320,11 +13320,11 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau
 ; GFX1064-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
 ; GFX1064-NEXT:    s_cbranch_execz .LBB17_5
 ; GFX1064-NEXT:  ; %bb.3:
-; GFX1064-NEXT:    s_load_dwordx2 s[44:45], s[34:35], 0x24
+; GFX1064-NEXT:    s_load_dwordx2 s[52:53], s[34:35], 0x24
 ; GFX1064-NEXT:    v_mov_b32_e32 v0, 0
-; GFX1064-NEXT:    s_mov_b64 s[46:47], 0
+; GFX1064-NEXT:    s_mov_b64 s[54:55], 0
 ; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT:    global_load_dwordx2 v[1:2], v0, s[44:45]
+; GFX1064-NEXT:    global_load_dwordx2 v[1:2], v0, s[52:53]
 ; GFX1064-NEXT:  .LBB17_4: ; %atomicrmw.start
 ; GFX1064-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1064-NEXT:    s_waitcnt vmcnt(0)
@@ -13334,55 +13334,55 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau
 ; GFX1064-NEXT:    s_getpc_b64 s[0:1]
 ; GFX1064-NEXT:    s_add_u32 s0, s0, __atomic_compare_exchange at gotpcrel32@lo+4
 ; GFX1064-NEXT:    s_addc_u32 s1, s1, __atomic_compare_exchange at gotpcrel32@hi+12
-; GFX1064-NEXT:    buffer_store_dword v2, off, s[48:51], 0 offset:4
-; GFX1064-NEXT:    buffer_store_dword v1, off, s[48:51], 0
+; GFX1064-NEXT:    buffer_store_dword v2, off, s[64:67], 0 offset:4
+; GFX1064-NEXT:    buffer_store_dword v1, off, s[64:67], 0
 ; GFX1064-NEXT:    s_load_dwordx2 s[16:17], s[0:1], 0x0
 ; GFX1064-NEXT:    v_mov_b32_e32 v31, v40
 ; GFX1064-NEXT:    v_mov_b32_e32 v0, 8
 ; GFX1064-NEXT:    v_mov_b32_e32 v1, 0
-; GFX1064-NEXT:    v_mov_b32_e32 v2, s44
+; GFX1064-NEXT:    v_mov_b32_e32 v2, s52
 ; GFX1064-NEXT:    v_mov_b32_e32 v5, 8
 ; GFX1064-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX1064-NEXT:    v_mov_b32_e32 v7, 0
-; GFX1064-NEXT:    s_mov_b64 s[0:1], s[48:49]
-; GFX1064-NEXT:    s_mov_b64 s[4:5], s[40:41]
+; GFX1064-NEXT:    s_mov_b64 s[0:1], s[64:65]
+; GFX1064-NEXT:    s_mov_b64 s[4:5], s[48:49]
 ; GFX1064-NEXT:    s_mov_b64 s[6:7], s[38:39]
 ; GFX1064-NEXT:    s_mov_b64 s[10:11], s[36:37]
-; GFX1064-NEXT:    s_mov_b32 s12, s43
-; GFX1064-NEXT:    s_mov_b32 s13, s42
+; GFX1064-NEXT:    s_mov_b32 s12, s51
+; GFX1064-NEXT:    s_mov_b32 s13, s50
 ; GFX1064-NEXT:    s_mov_b32 s14, s33
-; GFX1064-NEXT:    s_mov_b64 s[2:3], s[50:51]
-; GFX1064-NEXT:    buffer_store_dword v4, off, s[48:51], 0 offset:12
-; GFX1064-NEXT:    buffer_store_dword v3, off, s[48:51], 0 offset:8
-; GFX1064-NEXT:    v_mov_b32_e32 v3, s45
+; GFX1064-NEXT:    s_mov_b64 s[2:3], s[66:67]
+; GFX1064-NEXT:    buffer_store_dword v4, off, s[64:67], 0 offset:12
+; GFX1064-NEXT:    buffer_store_dword v3, off, s[64:67], 0 offset:8
+; GFX1064-NEXT:    v_mov_b32_e32 v3, s53
 ; GFX1064-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1064-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GFX1064-NEXT:    s_clause 0x1
-; GFX1064-NEXT:    buffer_load_dword v1, off, s[48:51], 0
-; GFX1064-NEXT:    buffer_load_dword v2, off, s[48:51], 0 offset:4
+; GFX1064-NEXT:    buffer_load_dword v1, off, s[64:67], 0
+; GFX1064-NEXT:    buffer_load_dword v2, off, s[64:67], 0 offset:4
 ; GFX1064-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX1064-NEXT:    s_or_b64 s[46:47], vcc, s[46:47]
-; GFX1064-NEXT:    s_andn2_b64 exec, exec, s[46:47]
+; GFX1064-NEXT:    s_or_b64 s[54:55], vcc, s[54:55]
+; GFX1064-NEXT:    s_andn2_b64 exec, exec, s[54:55]
 ; GFX1064-NEXT:    s_cbranch_execnz .LBB17_4
 ; GFX1064-NEXT:  .LBB17_5:
 ; GFX1064-NEXT:    s_endpgm
 ;
 ; GFX1032-LABEL: global_atomic_fsub_double_uni_address_div_value_default_scope_strictfp:
 ; GFX1032:       ; %bb.0:
-; GFX1032-NEXT:    s_mov_b32 s48, SCRATCH_RSRC_DWORD0
-; GFX1032-NEXT:    s_mov_b32 s49, SCRATCH_RSRC_DWORD1
-; GFX1032-NEXT:    s_mov_b32 s50, -1
-; GFX1032-NEXT:    s_mov_b32 s51, 0x31c16000
-; GFX1032-NEXT:    s_add_u32 s48, s48, s11
+; GFX1032-NEXT:    s_mov_b32 s64, SCRATCH_RSRC_DWORD0
+; GFX1032-NEXT:    s_mov_b32 s65, SCRATCH_RSRC_DWORD1
+; GFX1032-NEXT:    s_mov_b32 s66, -1
+; GFX1032-NEXT:    s_mov_b32 s67, 0x31c16000
+; GFX1032-NEXT:    s_add_u32 s64, s64, s11
 ; GFX1032-NEXT:    s_mov_b64 s[34:35], s[4:5]
-; GFX1032-NEXT:    s_addc_u32 s49, s49, 0
-; GFX1032-NEXT:    s_mov_b32 s43, s8
+; GFX1032-NEXT:    s_addc_u32 s65, s65, 0
+; GFX1032-NEXT:    s_mov_b32 s51, s8
 ; GFX1032-NEXT:    s_add_u32 s8, s34, 44
-; GFX1032-NEXT:    s_mov_b32 s42, s9
+; GFX1032-NEXT:    s_mov_b32 s50, s9
 ; GFX1032-NEXT:    s_addc_u32 s9, s35, 0
-; GFX1032-NEXT:    s_mov_b64 s[40:41], s[0:1]
+; GFX1032-NEXT:    s_mov_b64 s[48:49], s[0:1]
 ; GFX1032-NEXT:    s_getpc_b64 s[0:1]
 ; GFX1032-NEXT:    s_add_u32 s0, s0, div.float.value at gotpcrel32@lo+4
 ; GFX1032-NEXT:    s_addc_u32 s1, s1, div.float.value at gotpcrel32@hi+12
@@ -13394,14 +13394,14 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau
 ; GFX1032-NEXT:    s_mov_b64 s[38:39], s[2:3]
 ; GFX1032-NEXT:    s_mov_b64 s[6:7], s[2:3]
 ; GFX1032-NEXT:    v_or3_b32 v40, v0, v1, v2
-; GFX1032-NEXT:    s_mov_b64 s[0:1], s[48:49]
-; GFX1032-NEXT:    s_mov_b64 s[4:5], s[40:41]
+; GFX1032-NEXT:    s_mov_b64 s[0:1], s[64:65]
+; GFX1032-NEXT:    s_mov_b64 s[4:5], s[48:49]
 ; GFX1032-NEXT:    s_mov_b64 s[10:11], s[36:37]
-; GFX1032-NEXT:    s_mov_b32 s12, s43
+; GFX1032-NEXT:    s_mov_b32 s12, s51
 ; GFX1032-NEXT:    v_mov_b32_e32 v31, v40
-; GFX1032-NEXT:    s_mov_b32 s13, s42
+; GFX1032-NEXT:    s_mov_b32 s13, s50
 ; GFX1032-NEXT:    s_mov_b32 s14, s33
-; GFX1032-NEXT:    s_mov_b64 s[2:3], s[50:51]
+; GFX1032-NEXT:    s_mov_b64 s[2:3], s[66:67]
 ; GFX1032-NEXT:    s_movk_i32 s32, 0x400
 ; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1032-NEXT:    s_swappc_b64 s[30:31], s[16:17]
@@ -13420,16 +13420,16 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau
 ; GFX1032-NEXT:    s_cbranch_scc1 .LBB17_1
 ; GFX1032-NEXT:  ; %bb.2: ; %ComputeEnd
 ; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1032-NEXT:    s_mov_b32 s46, 0
+; GFX1032-NEXT:    s_mov_b32 s54, 0
 ; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX1032-NEXT:    s_and_saveexec_b32 s0, vcc_lo
 ; GFX1032-NEXT:    s_xor_b32 s0, exec_lo, s0
 ; GFX1032-NEXT:    s_cbranch_execz .LBB17_5
 ; GFX1032-NEXT:  ; %bb.3:
-; GFX1032-NEXT:    s_load_dwordx2 s[44:45], s[34:35], 0x24
+; GFX1032-NEXT:    s_load_dwordx2 s[52:53], s[34:35], 0x24
 ; GFX1032-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT:    global_load_dwordx2 v[1:2], v0, s[44:45]
+; GFX1032-NEXT:    global_load_dwordx2 v[1:2], v0, s[52:53]
 ; GFX1032-NEXT:  .LBB17_4: ; %atomicrmw.start
 ; GFX1032-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1032-NEXT:    s_waitcnt vmcnt(0)
@@ -13439,37 +13439,37 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau
 ; GFX1032-NEXT:    s_getpc_b64 s[0:1]
 ; GFX1032-NEXT:    s_add_u32 s0, s0, __atomic_compare_exchange at gotpcrel32@lo+4
 ; GFX1032-NEXT:    s_addc_u32 s1, s1, __atomic_compare_exchange at gotpcrel32@hi+12
-; GFX1032-NEXT:    buffer_store_dword v2, off, s[48:51], 0 offset:4
-; GFX1032-NEXT:    buffer_store_dword v1, off, s[48:51], 0
+; GFX1032-NEXT:    buffer_store_dword v2, off, s[64:67], 0 offset:4
+; GFX1032-NEXT:    buffer_store_dword v1, off, s[64:67], 0
 ; GFX1032-NEXT:    s_load_dwordx2 s[16:17], s[0:1], 0x0
 ; GFX1032-NEXT:    v_mov_b32_e32 v31, v40
 ; GFX1032-NEXT:    v_mov_b32_e32 v0, 8
 ; GFX1032-NEXT:    v_mov_b32_e32 v1, 0
-; GFX1032-NEXT:    v_mov_b32_e32 v2, s44
+; GFX1032-NEXT:    v_mov_b32_e32 v2, s52
 ; GFX1032-NEXT:    v_mov_b32_e32 v5, 8
 ; GFX1032-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX1032-NEXT:    v_mov_b32_e32 v7, 0
-; GFX1032-NEXT:    s_mov_b64 s[0:1], s[48:49]
-; GFX1032-NEXT:    s_mov_b64 s[4:5], s[40:41]
+; GFX1032-NEXT:    s_mov_b64 s[0:1], s[64:65]
+; GFX1032-NEXT:    s_mov_b64 s[4:5], s[48:49]
 ; GFX1032-NEXT:    s_mov_b64 s[6:7], s[38:39]
 ; GFX1032-NEXT:    s_mov_b64 s[10:11], s[36:37]
-; GFX1032-NEXT:    s_mov_b32 s12, s43
-; GFX1032-NEXT:    s_mov_b32 s13, s42
+; GFX1032-NEXT:    s_mov_b32 s12, s51
+; GFX1032-NEXT:    s_mov_b32 s13, s50
 ; GFX1032-NEXT:    s_mov_b32 s14, s33
-; GFX1032-NEXT:    s_mov_b64 s[2:3], s[50:51]
-; GFX1032-NEXT:    buffer_store_dword v4, off, s[48:51], 0 offset:12
-; GFX1032-NEXT:    buffer_store_dword v3, off, s[48:51], 0 offset:8
-; GFX1032-NEXT:    v_mov_b32_e32 v3, s45
+; GFX1032-NEXT:    s_mov_b64 s[2:3], s[66:67]
+; GFX1032-NEXT:    buffer_store_dword v4, off, s[64:67], 0 offset:12
+; GFX1032-NEXT:    buffer_store_dword v3, off, s[64:67], 0 offset:8
+; GFX1032-NEXT:    v_mov_b32_e32 v3, s53
 ; GFX1032-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1032-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GFX1032-NEXT:    s_clause 0x1
-; GFX1032-NEXT:    buffer_load_dword v1, off, s[48:51], 0
-; GFX1032-NEXT:    buffer_load_dword v2, off, s[48:51], 0 offset:4
+; GFX1032-NEXT:    buffer_load_dword v1, off, s[64:67], 0
+; GFX1032-NEXT:    buffer_load_dword v2, off, s[64:67], 0 offset:4
 ; GFX1032-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX1032-NEXT:    s_or_b32 s46, vcc_lo, s46
-; GFX1032-NEXT:    s_andn2_b32 exec_lo, exec_lo, s46
+; GFX1032-NEXT:    s_or_b32 s54, vcc_lo, s54
+; GFX1032-NEXT:    s_andn2_b32 exec_lo, exec_lo, s54
 ; GFX1032-NEXT:    s_cbranch_execnz .LBB17_4
 ; GFX1032-NEXT:  .LBB17_5:
 ; GFX1032-NEXT:    s_endpgm
@@ -13477,11 +13477,11 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau
 ; GFX1164-LABEL: global_atomic_fsub_double_uni_address_div_value_default_scope_strictfp:
 ; GFX1164:       ; %bb.0:
 ; GFX1164-NEXT:    s_mov_b64 s[34:35], s[4:5]
-; GFX1164-NEXT:    s_mov_b32 s43, s8
+; GFX1164-NEXT:    s_mov_b32 s51, s8
 ; GFX1164-NEXT:    s_add_u32 s8, s34, 44
-; GFX1164-NEXT:    s_mov_b32 s42, s9
+; GFX1164-NEXT:    s_mov_b32 s50, s9
 ; GFX1164-NEXT:    s_addc_u32 s9, s35, 0
-; GFX1164-NEXT:    s_mov_b64 s[40:41], s[0:1]
+; GFX1164-NEXT:    s_mov_b64 s[48:49], s[0:1]
 ; GFX1164-NEXT:    s_getpc_b64 s[0:1]
 ; GFX1164-NEXT:    s_add_u32 s0, s0, div.float.value at gotpcrel32@lo+4
 ; GFX1164-NEXT:    s_addc_u32 s1, s1, div.float.value at gotpcrel32@hi+12
@@ -13489,11 +13489,11 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau
 ; GFX1164-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
 ; GFX1164-NEXT:    s_mov_b32 s33, s10
 ; GFX1164-NEXT:    s_mov_b64 s[36:37], s[6:7]
-; GFX1164-NEXT:    s_mov_b64 s[4:5], s[40:41]
+; GFX1164-NEXT:    s_mov_b64 s[4:5], s[48:49]
 ; GFX1164-NEXT:    s_mov_b64 s[6:7], s[2:3]
 ; GFX1164-NEXT:    s_mov_b64 s[10:11], s[36:37]
-; GFX1164-NEXT:    s_mov_b32 s12, s43
-; GFX1164-NEXT:    s_mov_b32 s13, s42
+; GFX1164-NEXT:    s_mov_b32 s12, s51
+; GFX1164-NEXT:    s_mov_b32 s13, s50
 ; GFX1164-NEXT:    s_mov_b32 s14, s33
 ; GFX1164-NEXT:    s_mov_b32 s32, 32
 ; GFX1164-NEXT:    v_mov_b32_e32 v40, v0
@@ -13525,11 +13525,11 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau
 ; GFX1164-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
 ; GFX1164-NEXT:    s_cbranch_execz .LBB17_5
 ; GFX1164-NEXT:  ; %bb.3:
-; GFX1164-NEXT:    s_load_b64 s[44:45], s[34:35], 0x24
+; GFX1164-NEXT:    s_load_b64 s[52:53], s[34:35], 0x24
 ; GFX1164-NEXT:    v_mov_b32_e32 v0, 0
-; GFX1164-NEXT:    s_mov_b64 s[46:47], 0
+; GFX1164-NEXT:    s_mov_b64 s[54:55], 0
 ; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT:    global_load_b64 v[1:2], v0, s[44:45]
+; GFX1164-NEXT:    global_load_b64 v[1:2], v0, s[52:53]
 ; GFX1164-NEXT:    s_set_inst_prefetch_distance 0x1
 ; GFX1164-NEXT:    .p2align 6
 ; GFX1164-NEXT:  .LBB17_4: ; %atomicrmw.start
@@ -13546,18 +13546,18 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau
 ; GFX1164-NEXT:    v_mov_b32_e32 v31, v40
 ; GFX1164-NEXT:    v_mov_b32_e32 v0, 8
 ; GFX1164-NEXT:    v_mov_b32_e32 v1, 0
-; GFX1164-NEXT:    v_mov_b32_e32 v2, s44
+; GFX1164-NEXT:    v_mov_b32_e32 v2, s52
 ; GFX1164-NEXT:    v_mov_b32_e32 v5, 8
 ; GFX1164-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX1164-NEXT:    v_mov_b32_e32 v7, 0
-; GFX1164-NEXT:    s_mov_b64 s[4:5], s[40:41]
+; GFX1164-NEXT:    s_mov_b64 s[4:5], s[48:49]
 ; GFX1164-NEXT:    s_mov_b64 s[6:7], s[38:39]
 ; GFX1164-NEXT:    s_mov_b64 s[10:11], s[36:37]
-; GFX1164-NEXT:    s_mov_b32 s12, s43
-; GFX1164-NEXT:    s_mov_b32 s13, s42
+; GFX1164-NEXT:    s_mov_b32 s12, s51
+; GFX1164-NEXT:    s_mov_b32 s13, s50
 ; GFX1164-NEXT:    s_mov_b32 s14, s33
 ; GFX1164-NEXT:    scratch_store_b64 off, v[3:4], off offset:8
-; GFX1164-NEXT:    v_mov_b32_e32 v3, s45
+; GFX1164-NEXT:    v_mov_b32_e32 v3, s53
 ; GFX1164-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1164-NEXT:    s_swappc_b64 s[30:31], s[0:1]
@@ -13565,8 +13565,8 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau
 ; GFX1164-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
 ; GFX1164-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX1164-NEXT:    s_or_b64 s[46:47], vcc, s[46:47]
-; GFX1164-NEXT:    s_and_not1_b64 exec, exec, s[46:47]
+; GFX1164-NEXT:    s_or_b64 s[54:55], vcc, s[54:55]
+; GFX1164-NEXT:    s_and_not1_b64 exec, exec, s[54:55]
 ; GFX1164-NEXT:    s_cbranch_execnz .LBB17_4
 ; GFX1164-NEXT:  .LBB17_5:
 ; GFX1164-NEXT:    s_set_inst_prefetch_distance 0x2
@@ -13575,7 +13575,7 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau
 ; GFX1132-LABEL: global_atomic_fsub_double_uni_address_div_value_default_scope_strictfp:
 ; GFX1132:       ; %bb.0:
 ; GFX1132-NEXT:    s_mov_b64 s[34:35], s[4:5]
-; GFX1132-NEXT:    s_mov_b64 s[40:41], s[0:1]
+; GFX1132-NEXT:    s_mov_b64 s[48:49], s[0:1]
 ; GFX1132-NEXT:    s_add_u32 s8, s34, 44
 ; GFX1132-NEXT:    s_addc_u32 s9, s35, 0
 ; GFX1132-NEXT:    s_getpc_b64 s[0:1]
@@ -13584,9 +13584,9 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau
 ; GFX1132-NEXT:    v_mov_b32_e32 v31, v0
 ; GFX1132-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
 ; GFX1132-NEXT:    s_mov_b64 s[36:37], s[6:7]
-; GFX1132-NEXT:    s_mov_b32 s42, s14
-; GFX1132-NEXT:    s_mov_b32 s43, s13
-; GFX1132-NEXT:    s_mov_b64 s[4:5], s[40:41]
+; GFX1132-NEXT:    s_mov_b32 s50, s14
+; GFX1132-NEXT:    s_mov_b32 s51, s13
+; GFX1132-NEXT:    s_mov_b64 s[4:5], s[48:49]
 ; GFX1132-NEXT:    s_mov_b64 s[6:7], s[2:3]
 ; GFX1132-NEXT:    s_mov_b64 s[10:11], s[36:37]
 ; GFX1132-NEXT:    s_mov_b32 s12, s13
@@ -13615,17 +13615,17 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau
 ; GFX1132-NEXT:    s_cbranch_scc1 .LBB17_1
 ; GFX1132-NEXT:  ; %bb.2: ; %ComputeEnd
 ; GFX1132-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1132-NEXT:    s_mov_b32 s46, 0
+; GFX1132-NEXT:    s_mov_b32 s54, 0
 ; GFX1132-NEXT:    s_mov_b32 s0, exec_lo
 ; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1132-NEXT:    v_cmpx_eq_u32_e32 0, v0
 ; GFX1132-NEXT:    s_xor_b32 s0, exec_lo, s0
 ; GFX1132-NEXT:    s_cbranch_execz .LBB17_5
 ; GFX1132-NEXT:  ; %bb.3:
-; GFX1132-NEXT:    s_load_b64 s[44:45], s[34:35], 0x24
+; GFX1132-NEXT:    s_load_b64 s[52:53], s[34:35], 0x24
 ; GFX1132-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT:    global_load_b64 v[1:2], v0, s[44:45]
+; GFX1132-NEXT:    global_load_b64 v[1:2], v0, s[52:53]
 ; GFX1132-NEXT:    s_set_inst_prefetch_distance 0x1
 ; GFX1132-NEXT:    .p2align 6
 ; GFX1132-NEXT:  .LBB17_4: ; %atomicrmw.start
@@ -13640,25 +13640,25 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau
 ; GFX1132-NEXT:    scratch_store_b64 off, v[1:2], off
 ; GFX1132-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
 ; GFX1132-NEXT:    v_dual_mov_b32 v31, v40 :: v_dual_mov_b32 v0, 8
-; GFX1132-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s44
+; GFX1132-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s52
 ; GFX1132-NEXT:    v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0
 ; GFX1132-NEXT:    v_mov_b32_e32 v7, 0
-; GFX1132-NEXT:    s_mov_b64 s[4:5], s[40:41]
+; GFX1132-NEXT:    s_mov_b64 s[4:5], s[48:49]
 ; GFX1132-NEXT:    s_mov_b64 s[6:7], s[38:39]
 ; GFX1132-NEXT:    s_mov_b64 s[10:11], s[36:37]
-; GFX1132-NEXT:    s_mov_b32 s12, s43
-; GFX1132-NEXT:    s_mov_b32 s13, s42
+; GFX1132-NEXT:    s_mov_b32 s12, s51
+; GFX1132-NEXT:    s_mov_b32 s13, s50
 ; GFX1132-NEXT:    s_mov_b32 s14, s33
 ; GFX1132-NEXT:    scratch_store_b64 off, v[3:4], off offset:8
-; GFX1132-NEXT:    v_dual_mov_b32 v3, s45 :: v_dual_mov_b32 v4, 0
+; GFX1132-NEXT:    v_dual_mov_b32 v3, s53 :: v_dual_mov_b32 v4, 0
 ; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX1132-NEXT:    scratch_load_b64 v[1:2], off, off
 ; GFX1132-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
 ; GFX1132-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX1132-NEXT:    s_or_b32 s46, vcc_lo, s46
-; GFX1132-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s46
+; GFX1132-NEXT:    s_or_b32 s54, vcc_lo, s54
+; GFX1132-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s54
 ; GFX1132-NEXT:    s_cbranch_execnz .LBB17_4
 ; GFX1132-NEXT:  .LBB17_5:
 ; GFX1132-NEXT:    s_set_inst_prefetch_distance 0x2
@@ -13667,22 +13667,22 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau
 ; GFX7LESS-DPP-LABEL: global_atomic_fsub_double_uni_address_div_value_default_scope_strictfp:
 ; GFX7LESS-DPP:       ; %bb.0:
 ; GFX7LESS-DPP-NEXT:    s_movk_i32 s32, 0x800
-; GFX7LESS-DPP-NEXT:    s_mov_b32 s48, SCRATCH_RSRC_DWORD0
-; GFX7LESS-DPP-NEXT:    s_mov_b32 s49, SCRATCH_RSRC_DWORD1
-; GFX7LESS-DPP-NEXT:    s_mov_b32 s50, -1
-; GFX7LESS-DPP-NEXT:    s_mov_b32 s51, 0xe8f000
-; GFX7LESS-DPP-NEXT:    s_add_u32 s48, s48, s11
-; GFX7LESS-DPP-NEXT:    s_addc_u32 s49, s49, 0
+; GFX7LESS-DPP-NEXT:    s_mov_b32 s64, SCRATCH_RSRC_DWORD0
+; GFX7LESS-DPP-NEXT:    s_mov_b32 s65, SCRATCH_RSRC_DWORD1
+; GFX7LESS-DPP-NEXT:    s_mov_b32 s66, -1
+; GFX7LESS-DPP-NEXT:    s_mov_b32 s67, 0xe8f000
+; GFX7LESS-DPP-NEXT:    s_add_u32 s64, s64, s11
+; GFX7LESS-DPP-NEXT:    s_addc_u32 s65, s65, 0
 ; GFX7LESS-DPP-NEXT:    s_mov_b32 s33, s10
-; GFX7LESS-DPP-NEXT:    s_mov_b32 s42, s9
-; GFX7LESS-DPP-NEXT:    s_mov_b32 s43, s8
+; GFX7LESS-DPP-NEXT:    s_mov_b32 s50, s9
+; GFX7LESS-DPP-NEXT:    s_mov_b32 s51, s8
 ; GFX7LESS-DPP-NEXT:    s_mov_b64 s[34:35], s[6:7]
 ; GFX7LESS-DPP-NEXT:    s_mov_b64 s[36:37], s[4:5]
 ; GFX7LESS-DPP-NEXT:    s_mov_b64 s[38:39], s[2:3]
-; GFX7LESS-DPP-NEXT:    s_mov_b64 s[40:41], s[0:1]
-; GFX7LESS-DPP-NEXT:    s_load_dwordx2 s[44:45], s[4:5], 0x9
-; GFX7LESS-DPP-NEXT:    s_mov_b32 s47, 0xf000
-; GFX7LESS-DPP-NEXT:    s_mov_b32 s46, -1
+; GFX7LESS-DPP-NEXT:    s_mov_b64 s[48:49], s[0:1]
+; GFX7LESS-DPP-NEXT:    s_load_dwordx2 s[52:53], s[4:5], 0x9
+; GFX7LESS-DPP-NEXT:    s_mov_b32 s55, 0xf000
+; GFX7LESS-DPP-NEXT:    s_mov_b32 s54, -1
 ; GFX7LESS-DPP-NEXT:    s_add_u32 s8, s36, 44
 ; GFX7LESS-DPP-NEXT:    s_addc_u32 s9, s37, 0
 ; GFX7LESS-DPP-NEXT:    s_getpc_b64 s[0:1]
@@ -13693,30 +13693,30 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau
 ; GFX7LESS-DPP-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
 ; GFX7LESS-DPP-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GFX7LESS-DPP-NEXT:    v_or_b32_e32 v42, v0, v2
-; GFX7LESS-DPP-NEXT:    s_mov_b64 s[4:5], s[40:41]
+; GFX7LESS-DPP-NEXT:    s_mov_b64 s[4:5], s[48:49]
 ; GFX7LESS-DPP-NEXT:    s_mov_b64 s[6:7], s[2:3]
 ; GFX7LESS-DPP-NEXT:    s_mov_b64 s[10:11], s[34:35]
-; GFX7LESS-DPP-NEXT:    s_mov_b32 s12, s43
-; GFX7LESS-DPP-NEXT:    s_mov_b32 s13, s42
+; GFX7LESS-DPP-NEXT:    s_mov_b32 s12, s51
+; GFX7LESS-DPP-NEXT:    s_mov_b32 s13, s50
 ; GFX7LESS-DPP-NEXT:    s_mov_b32 s14, s33
 ; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v31, v42
-; GFX7LESS-DPP-NEXT:    s_mov_b64 s[0:1], s[48:49]
-; GFX7LESS-DPP-NEXT:    s_mov_b64 s[2:3], s[50:51]
+; GFX7LESS-DPP-NEXT:    s_mov_b64 s[0:1], s[64:65]
+; GFX7LESS-DPP-NEXT:    s_mov_b64 s[2:3], s[66:67]
 ; GFX7LESS-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7LESS-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v40, v0
 ; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v41, v1
-; GFX7LESS-DPP-NEXT:    buffer_load_dwordx2 v[0:1], off, s[44:47], 0
-; GFX7LESS-DPP-NEXT:    s_mov_b64 s[46:47], 0
+; GFX7LESS-DPP-NEXT:    buffer_load_dwordx2 v[0:1], off, s[52:55], 0
+; GFX7LESS-DPP-NEXT:    s_mov_b64 s[54:55], 0
 ; GFX7LESS-DPP-NEXT:  .LBB17_1: ; %atomicrmw.start
 ; GFX7LESS-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX7LESS-DPP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7LESS-DPP-NEXT:    v_add_f64 v[2:3], v[0:1], -v[40:41]
-; GFX7LESS-DPP-NEXT:    buffer_store_dword v1, off, s[48:51], 0 offset:4
-; GFX7LESS-DPP-NEXT:    buffer_store_dword v0, off, s[48:51], 0
+; GFX7LESS-DPP-NEXT:    buffer_store_dword v1, off, s[64:67], 0 offset:4
+; GFX7LESS-DPP-NEXT:    buffer_store_dword v0, off, s[64:67], 0
 ; GFX7LESS-DPP-NEXT:    s_add_u32 s8, s36, 44
-; GFX7LESS-DPP-NEXT:    buffer_store_dword v3, off, s[48:51], 0 offset:12
-; GFX7LESS-DPP-NEXT:    buffer_store_dword v2, off, s[48:51], 0 offset:8
+; GFX7LESS-DPP-NEXT:    buffer_store_dword v3, off, s[64:67], 0 offset:12
+; GFX7LESS-DPP-NEXT:    buffer_store_dword v2, off, s[64:67], 0 offset:8
 ; GFX7LESS-DPP-NEXT:    s_addc_u32 s9, s37, 0
 ; GFX7LESS-DPP-NEXT:    s_getpc_b64 s[0:1]
 ; GFX7LESS-DPP-NEXT:    s_add_u32 s0, s0, __atomic_compare_exchange at gotpcrel32@lo+4
@@ -13729,44 +13729,44 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau
 ; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v5, 8
 ; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v7, 0
-; GFX7LESS-DPP-NEXT:    s_mov_b64 s[4:5], s[40:41]
+; GFX7LESS-DPP-NEXT:    s_mov_b64 s[4:5], s[48:49]
 ; GFX7LESS-DPP-NEXT:    s_mov_b64 s[6:7], s[38:39]
 ; GFX7LESS-DPP-NEXT:    s_mov_b64 s[10:11], s[34:35]
-; GFX7LESS-DPP-NEXT:    s_mov_b32 s12, s43
-; GFX7LESS-DPP-NEXT:    s_mov_b32 s13, s42
+; GFX7LESS-DPP-NEXT:    s_mov_b32 s12, s51
+; GFX7LESS-DPP-NEXT:    s_mov_b32 s13, s50
 ; GFX7LESS-DPP-NEXT:    s_mov_b32 s14, s33
 ; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v31, v42
-; GFX7LESS-DPP-NEXT:    s_mov_b64 s[0:1], s[48:49]
-; GFX7LESS-DPP-NEXT:    s_mov_b64 s[2:3], s[50:51]
+; GFX7LESS-DPP-NEXT:    s_mov_b64 s[0:1], s[64:65]
+; GFX7LESS-DPP-NEXT:    s_mov_b64 s[2:3], s[66:67]
 ; GFX7LESS-DPP-NEXT:    s_waitcnt expcnt(0)
-; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v2, s44
-; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v3, s45
+; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v2, s52
+; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v3, s53
 ; GFX7LESS-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7LESS-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GFX7LESS-DPP-NEXT:    v_and_b32_e32 v2, 1, v0
-; GFX7LESS-DPP-NEXT:    buffer_load_dword v0, off, s[48:51], 0
-; GFX7LESS-DPP-NEXT:    buffer_load_dword v1, off, s[48:51], 0 offset:4
+; GFX7LESS-DPP-NEXT:    buffer_load_dword v0, off, s[64:67], 0
+; GFX7LESS-DPP-NEXT:    buffer_load_dword v1, off, s[64:67], 0 offset:4
 ; GFX7LESS-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v2
-; GFX7LESS-DPP-NEXT:    s_or_b64 s[46:47], vcc, s[46:47]
-; GFX7LESS-DPP-NEXT:    s_andn2_b64 exec, exec, s[46:47]
+; GFX7LESS-DPP-NEXT:    s_or_b64 s[54:55], vcc, s[54:55]
+; GFX7LESS-DPP-NEXT:    s_andn2_b64 exec, exec, s[54:55]
 ; GFX7LESS-DPP-NEXT:    s_cbranch_execnz .LBB17_1
 ; GFX7LESS-DPP-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX7LESS-DPP-NEXT:    s_endpgm
 ;
 ; GFX9-DPP-LABEL: global_atomic_fsub_double_uni_address_div_value_default_scope_strictfp:
 ; GFX9-DPP:       ; %bb.0:
-; GFX9-DPP-NEXT:    s_mov_b32 s52, SCRATCH_RSRC_DWORD0
-; GFX9-DPP-NEXT:    s_mov_b32 s53, SCRATCH_RSRC_DWORD1
-; GFX9-DPP-NEXT:    s_mov_b32 s54, -1
-; GFX9-DPP-NEXT:    s_mov_b32 s55, 0xe00000
-; GFX9-DPP-NEXT:    s_add_u32 s52, s52, s11
-; GFX9-DPP-NEXT:    s_addc_u32 s53, s53, 0
+; GFX9-DPP-NEXT:    s_mov_b32 s68, SCRATCH_RSRC_DWORD0
+; GFX9-DPP-NEXT:    s_mov_b32 s69, SCRATCH_RSRC_DWORD1
+; GFX9-DPP-NEXT:    s_mov_b32 s70, -1
+; GFX9-DPP-NEXT:    s_mov_b32 s71, 0xe00000
+; GFX9-DPP-NEXT:    s_add_u32 s68, s68, s11
+; GFX9-DPP-NEXT:    s_addc_u32 s69, s69, 0
 ; GFX9-DPP-NEXT:    s_mov_b64 s[36:37], s[4:5]
-; GFX9-DPP-NEXT:    s_mov_b32 s43, s8
+; GFX9-DPP-NEXT:    s_mov_b32 s51, s8
 ; GFX9-DPP-NEXT:    s_add_u32 s8, s36, 44
-; GFX9-DPP-NEXT:    s_mov_b32 s42, s9
+; GFX9-DPP-NEXT:    s_mov_b32 s50, s9
 ; GFX9-DPP-NEXT:    s_addc_u32 s9, s37, 0
-; GFX9-DPP-NEXT:    s_mov_b64 s[40:41], s[0:1]
+; GFX9-DPP-NEXT:    s_mov_b64 s[48:49], s[0:1]
 ; GFX9-DPP-NEXT:    s_getpc_b64 s[0:1]
 ; GFX9-DPP-NEXT:    s_add_u32 s0, s0, div.float.value at gotpcrel32@lo+4
 ; GFX9-DPP-NEXT:    s_addc_u32 s1, s1, div.float.value at gotpcrel32@hi+12
@@ -13778,14 +13778,14 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau
 ; GFX9-DPP-NEXT:    s_mov_b64 s[38:39], s[2:3]
 ; GFX9-DPP-NEXT:    v_or3_b32 v40, v0, v1, v2
 ; GFX9-DPP-NEXT:    s_mov_b64 s[6:7], s[2:3]
-; GFX9-DPP-NEXT:    s_mov_b64 s[0:1], s[52:53]
-; GFX9-DPP-NEXT:    s_mov_b64 s[4:5], s[40:41]
+; GFX9-DPP-NEXT:    s_mov_b64 s[0:1], s[68:69]
+; GFX9-DPP-NEXT:    s_mov_b64 s[4:5], s[48:49]
 ; GFX9-DPP-NEXT:    s_mov_b64 s[10:11], s[34:35]
-; GFX9-DPP-NEXT:    s_mov_b32 s12, s43
-; GFX9-DPP-NEXT:    s_mov_b32 s13, s42
+; GFX9-DPP-NEXT:    s_mov_b32 s12, s51
+; GFX9-DPP-NEXT:    s_mov_b32 s13, s50
 ; GFX9-DPP-NEXT:    s_mov_b32 s14, s33
 ; GFX9-DPP-NEXT:    v_mov_b32_e32 v31, v40
-; GFX9-DPP-NEXT:    s_mov_b64 s[2:3], s[54:55]
+; GFX9-DPP-NEXT:    s_mov_b64 s[2:3], s[70:71]
 ; GFX9-DPP-NEXT:    s_movk_i32 s32, 0x800
 ; GFX9-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
@@ -13832,74 +13832,74 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau
 ; GFX9-DPP-NEXT:    v_mbcnt_lo_u32_b32 v1, exec_lo, 0
 ; GFX9-DPP-NEXT:    v_mbcnt_hi_u32_b32 v1, exec_hi, v1
 ; GFX9-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
-; GFX9-DPP-NEXT:    v_readlane_b32 s45, v9, 63
-; GFX9-DPP-NEXT:    v_readlane_b32 s44, v8, 63
+; GFX9-DPP-NEXT:    v_readlane_b32 s53, v9, 63
+; GFX9-DPP-NEXT:    v_readlane_b32 s52, v8, 63
 ; GFX9-DPP-NEXT:    s_mov_b64 exec, s[0:1]
 ; GFX9-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
 ; GFX9-DPP-NEXT:    s_and_saveexec_b64 s[0:1], vcc
 ; GFX9-DPP-NEXT:    s_cbranch_execz .LBB17_3
 ; GFX9-DPP-NEXT:  ; %bb.1:
-; GFX9-DPP-NEXT:    s_load_dwordx2 s[46:47], s[36:37], 0x24
-; GFX9-DPP-NEXT:    s_mov_b64 s[48:49], 0
+; GFX9-DPP-NEXT:    s_load_dwordx2 s[54:55], s[36:37], 0x24
+; GFX9-DPP-NEXT:    s_mov_b64 s[64:65], 0
 ; GFX9-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-DPP-NEXT:    global_load_dwordx2 v[1:2], v0, s[46:47]
+; GFX9-DPP-NEXT:    global_load_dwordx2 v[1:2], v0, s[54:55]
 ; GFX9-DPP-NEXT:  .LBB17_2: ; %atomicrmw.start
 ; GFX9-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX9-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-DPP-NEXT:    v_add_f64 v[3:4], v[1:2], -s[44:45]
+; GFX9-DPP-NEXT:    v_add_f64 v[3:4], v[1:2], -s[52:53]
 ; GFX9-DPP-NEXT:    s_add_u32 s8, s36, 44
 ; GFX9-DPP-NEXT:    s_addc_u32 s9, s37, 0
 ; GFX9-DPP-NEXT:    s_getpc_b64 s[0:1]
 ; GFX9-DPP-NEXT:    s_add_u32 s0, s0, __atomic_compare_exchange at gotpcrel32@lo+4
 ; GFX9-DPP-NEXT:    s_addc_u32 s1, s1, __atomic_compare_exchange at gotpcrel32@hi+12
 ; GFX9-DPP-NEXT:    s_load_dwordx2 s[16:17], s[0:1], 0x0
-; GFX9-DPP-NEXT:    s_mov_b64 s[0:1], s[52:53]
-; GFX9-DPP-NEXT:    buffer_store_dword v2, off, s[52:55], 0 offset:4
-; GFX9-DPP-NEXT:    buffer_store_dword v1, off, s[52:55], 0
-; GFX9-DPP-NEXT:    s_mov_b64 s[4:5], s[40:41]
-; GFX9-DPP-NEXT:    buffer_store_dword v4, off, s[52:55], 0 offset:12
-; GFX9-DPP-NEXT:    buffer_store_dword v3, off, s[52:55], 0 offset:8
+; GFX9-DPP-NEXT:    s_mov_b64 s[0:1], s[68:69]
+; GFX9-DPP-NEXT:    buffer_store_dword v2, off, s[68:71], 0 offset:4
+; GFX9-DPP-NEXT:    buffer_store_dword v1, off, s[68:71], 0
+; GFX9-DPP-NEXT:    s_mov_b64 s[4:5], s[48:49]
+; GFX9-DPP-NEXT:    buffer_store_dword v4, off, s[68:71], 0 offset:12
+; GFX9-DPP-NEXT:    buffer_store_dword v3, off, s[68:71], 0 offset:8
 ; GFX9-DPP-NEXT:    s_mov_b64 s[6:7], s[38:39]
 ; GFX9-DPP-NEXT:    s_mov_b64 s[10:11], s[34:35]
-; GFX9-DPP-NEXT:    s_mov_b32 s12, s43
-; GFX9-DPP-NEXT:    s_mov_b32 s13, s42
+; GFX9-DPP-NEXT:    s_mov_b32 s12, s51
+; GFX9-DPP-NEXT:    s_mov_b32 s13, s50
 ; GFX9-DPP-NEXT:    s_mov_b32 s14, s33
 ; GFX9-DPP-NEXT:    v_mov_b32_e32 v31, v40
-; GFX9-DPP-NEXT:    s_mov_b64 s[2:3], s[54:55]
+; GFX9-DPP-NEXT:    s_mov_b64 s[2:3], s[70:71]
 ; GFX9-DPP-NEXT:    v_mov_b32_e32 v0, 8
 ; GFX9-DPP-NEXT:    v_mov_b32_e32 v1, 0
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v2, s46
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v3, s47
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v2, s54
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v3, s55
 ; GFX9-DPP-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX9-DPP-NEXT:    v_mov_b32_e32 v5, 8
 ; GFX9-DPP-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX9-DPP-NEXT:    v_mov_b32_e32 v7, 0
 ; GFX9-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
-; GFX9-DPP-NEXT:    buffer_load_dword v1, off, s[52:55], 0
-; GFX9-DPP-NEXT:    buffer_load_dword v2, off, s[52:55], 0 offset:4
+; GFX9-DPP-NEXT:    buffer_load_dword v1, off, s[68:71], 0
+; GFX9-DPP-NEXT:    buffer_load_dword v2, off, s[68:71], 0 offset:4
 ; GFX9-DPP-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX9-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX9-DPP-NEXT:    s_or_b64 s[48:49], vcc, s[48:49]
-; GFX9-DPP-NEXT:    s_andn2_b64 exec, exec, s[48:49]
+; GFX9-DPP-NEXT:    s_or_b64 s[64:65], vcc, s[64:65]
+; GFX9-DPP-NEXT:    s_andn2_b64 exec, exec, s[64:65]
 ; GFX9-DPP-NEXT:    s_cbranch_execnz .LBB17_2
 ; GFX9-DPP-NEXT:  .LBB17_3:
 ; GFX9-DPP-NEXT:    s_endpgm
 ;
 ; GFX1064-DPP-LABEL: global_atomic_fsub_double_uni_address_div_value_default_scope_strictfp:
 ; GFX1064-DPP:       ; %bb.0:
-; GFX1064-DPP-NEXT:    s_mov_b32 s48, SCRATCH_RSRC_DWORD0
-; GFX1064-DPP-NEXT:    s_mov_b32 s49, SCRATCH_RSRC_DWORD1
-; GFX1064-DPP-NEXT:    s_mov_b32 s50, -1
-; GFX1064-DPP-NEXT:    s_mov_b32 s51, 0x31e16000
-; GFX1064-DPP-NEXT:    s_add_u32 s48, s48, s11
+; GFX1064-DPP-NEXT:    s_mov_b32 s64, SCRATCH_RSRC_DWORD0
+; GFX1064-DPP-NEXT:    s_mov_b32 s65, SCRATCH_RSRC_DWORD1
+; GFX1064-DPP-NEXT:    s_mov_b32 s66, -1
+; GFX1064-DPP-NEXT:    s_mov_b32 s67, 0x31e16000
+; GFX1064-DPP-NEXT:    s_add_u32 s64, s64, s11
 ; GFX1064-DPP-NEXT:    s_mov_b64 s[34:35], s[4:5]
-; GFX1064-DPP-NEXT:    s_addc_u32 s49, s49, 0
-; GFX1064-DPP-NEXT:    s_mov_b32 s43, s8
+; GFX1064-DPP-NEXT:    s_addc_u32 s65, s65, 0
+; GFX1064-DPP-NEXT:    s_mov_b32 s51, s8
 ; GFX1064-DPP-NEXT:    s_add_u32 s8, s34, 44
-; GFX1064-DPP-NEXT:    s_mov_b32 s42, s9
+; GFX1064-DPP-NEXT:    s_mov_b32 s50, s9
 ; GFX1064-DPP-NEXT:    s_addc_u32 s9, s35, 0
-; GFX1064-DPP-NEXT:    s_mov_b64 s[40:41], s[0:1]
+; GFX1064-DPP-NEXT:    s_mov_b64 s[48:49], s[0:1]
 ; GFX1064-DPP-NEXT:    s_getpc_b64 s[0:1]
 ; GFX1064-DPP-NEXT:    s_add_u32 s0, s0, div.float.value at gotpcrel32@lo+4
 ; GFX1064-DPP-NEXT:    s_addc_u32 s1, s1, div.float.value at gotpcrel32@hi+12
@@ -13911,14 +13911,14 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau
 ; GFX1064-DPP-NEXT:    s_mov_b64 s[38:39], s[2:3]
 ; GFX1064-DPP-NEXT:    s_mov_b64 s[6:7], s[2:3]
 ; GFX1064-DPP-NEXT:    v_or3_b32 v40, v0, v1, v2
-; GFX1064-DPP-NEXT:    s_mov_b64 s[0:1], s[48:49]
-; GFX1064-DPP-NEXT:    s_mov_b64 s[4:5], s[40:41]
+; GFX1064-DPP-NEXT:    s_mov_b64 s[0:1], s[64:65]
+; GFX1064-DPP-NEXT:    s_mov_b64 s[4:5], s[48:49]
 ; GFX1064-DPP-NEXT:    s_mov_b64 s[10:11], s[36:37]
-; GFX1064-DPP-NEXT:    s_mov_b32 s12, s43
+; GFX1064-DPP-NEXT:    s_mov_b32 s12, s51
 ; GFX1064-DPP-NEXT:    v_mov_b32_e32 v31, v40
-; GFX1064-DPP-NEXT:    s_mov_b32 s13, s42
+; GFX1064-DPP-NEXT:    s_mov_b32 s13, s50
 ; GFX1064-DPP-NEXT:    s_mov_b32 s14, s33
-; GFX1064-DPP-NEXT:    s_mov_b64 s[2:3], s[50:51]
+; GFX1064-DPP-NEXT:    s_mov_b64 s[2:3], s[66:67]
 ; GFX1064-DPP-NEXT:    s_movk_i32 s32, 0x800
 ; GFX1064-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1064-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
@@ -13963,10 +13963,10 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau
 ; GFX1064-DPP-NEXT:    s_and_saveexec_b64 s[0:1], vcc
 ; GFX1064-DPP-NEXT:    s_cbranch_execz .LBB17_3
 ; GFX1064-DPP-NEXT:  ; %bb.1:
-; GFX1064-DPP-NEXT:    s_load_dwordx2 s[44:45], s[34:35], 0x24
-; GFX1064-DPP-NEXT:    s_mov_b64 s[46:47], 0
+; GFX1064-DPP-NEXT:    s_load_dwordx2 s[52:53], s[34:35], 0x24
+; GFX1064-DPP-NEXT:    s_mov_b64 s[54:55], 0
 ; GFX1064-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1064-DPP-NEXT:    global_load_dwordx2 v[1:2], v0, s[44:45]
+; GFX1064-DPP-NEXT:    global_load_dwordx2 v[1:2], v0, s[52:53]
 ; GFX1064-DPP-NEXT:  .LBB17_2: ; %atomicrmw.start
 ; GFX1064-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1064-DPP-NEXT:    s_waitcnt vmcnt(0)
@@ -13976,55 +13976,55 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau
 ; GFX1064-DPP-NEXT:    s_getpc_b64 s[0:1]
 ; GFX1064-DPP-NEXT:    s_add_u32 s0, s0, __atomic_compare_exchange at gotpcrel32@lo+4
 ; GFX1064-DPP-NEXT:    s_addc_u32 s1, s1, __atomic_compare_exchange at gotpcrel32@hi+12
-; GFX1064-DPP-NEXT:    buffer_store_dword v2, off, s[48:51], 0 offset:4
-; GFX1064-DPP-NEXT:    buffer_store_dword v1, off, s[48:51], 0
+; GFX1064-DPP-NEXT:    buffer_store_dword v2, off, s[64:67], 0 offset:4
+; GFX1064-DPP-NEXT:    buffer_store_dword v1, off, s[64:67], 0
 ; GFX1064-DPP-NEXT:    s_load_dwordx2 s[16:17], s[0:1], 0x0
 ; GFX1064-DPP-NEXT:    v_mov_b32_e32 v31, v40
 ; GFX1064-DPP-NEXT:    v_mov_b32_e32 v0, 8
 ; GFX1064-DPP-NEXT:    v_mov_b32_e32 v1, 0
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v2, s44
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v2, s52
 ; GFX1064-DPP-NEXT:    v_mov_b32_e32 v5, 8
 ; GFX1064-DPP-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX1064-DPP-NEXT:    v_mov_b32_e32 v7, 0
-; GFX1064-DPP-NEXT:    s_mov_b64 s[0:1], s[48:49]
-; GFX1064-DPP-NEXT:    s_mov_b64 s[4:5], s[40:41]
+; GFX1064-DPP-NEXT:    s_mov_b64 s[0:1], s[64:65]
+; GFX1064-DPP-NEXT:    s_mov_b64 s[4:5], s[48:49]
 ; GFX1064-DPP-NEXT:    s_mov_b64 s[6:7], s[38:39]
 ; GFX1064-DPP-NEXT:    s_mov_b64 s[10:11], s[36:37]
-; GFX1064-DPP-NEXT:    s_mov_b32 s12, s43
-; GFX1064-DPP-NEXT:    s_mov_b32 s13, s42
+; GFX1064-DPP-NEXT:    s_mov_b32 s12, s51
+; GFX1064-DPP-NEXT:    s_mov_b32 s13, s50
 ; GFX1064-DPP-NEXT:    s_mov_b32 s14, s33
-; GFX1064-DPP-NEXT:    s_mov_b64 s[2:3], s[50:51]
-; GFX1064-DPP-NEXT:    buffer_store_dword v4, off, s[48:51], 0 offset:12
-; GFX1064-DPP-NEXT:    buffer_store_dword v3, off, s[48:51], 0 offset:8
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v3, s45
+; GFX1064-DPP-NEXT:    s_mov_b64 s[2:3], s[66:67]
+; GFX1064-DPP-NEXT:    buffer_store_dword v4, off, s[64:67], 0 offset:12
+; GFX1064-DPP-NEXT:    buffer_store_dword v3, off, s[64:67], 0 offset:8
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v3, s53
 ; GFX1064-DPP-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX1064-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1064-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GFX1064-DPP-NEXT:    s_clause 0x1
-; GFX1064-DPP-NEXT:    buffer_load_dword v1, off, s[48:51], 0
-; GFX1064-DPP-NEXT:    buffer_load_dword v2, off, s[48:51], 0 offset:4
+; GFX1064-DPP-NEXT:    buffer_load_dword v1, off, s[64:67], 0
+; GFX1064-DPP-NEXT:    buffer_load_dword v2, off, s[64:67], 0 offset:4
 ; GFX1064-DPP-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX1064-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX1064-DPP-NEXT:    s_or_b64 s[46:47], vcc, s[46:47]
-; GFX1064-DPP-NEXT:    s_andn2_b64 exec, exec, s[46:47]
+; GFX1064-DPP-NEXT:    s_or_b64 s[54:55], vcc, s[54:55]
+; GFX1064-DPP-NEXT:    s_andn2_b64 exec, exec, s[54:55]
 ; GFX1064-DPP-NEXT:    s_cbranch_execnz .LBB17_2
 ; GFX1064-DPP-NEXT:  .LBB17_3:
 ; GFX1064-DPP-NEXT:    s_endpgm
 ;
 ; GFX1032-DPP-LABEL: global_atomic_fsub_double_uni_address_div_value_default_scope_strictfp:
 ; GFX1032-DPP:       ; %bb.0:
-; GFX1032-DPP-NEXT:    s_mov_b32 s48, SCRATCH_RSRC_DWORD0
-; GFX1032-DPP-NEXT:    s_mov_b32 s49, SCRATCH_RSRC_DWORD1
-; GFX1032-DPP-NEXT:    s_mov_b32 s50, -1
-; GFX1032-DPP-NEXT:    s_mov_b32 s51, 0x31c16000
-; GFX1032-DPP-NEXT:    s_add_u32 s48, s48, s11
+; GFX1032-DPP-NEXT:    s_mov_b32 s64, SCRATCH_RSRC_DWORD0
+; GFX1032-DPP-NEXT:    s_mov_b32 s65, SCRATCH_RSRC_DWORD1
+; GFX1032-DPP-NEXT:    s_mov_b32 s66, -1
+; GFX1032-DPP-NEXT:    s_mov_b32 s67, 0x31c16000
+; GFX1032-DPP-NEXT:    s_add_u32 s64, s64, s11
 ; GFX1032-DPP-NEXT:    s_mov_b64 s[34:35], s[4:5]
-; GFX1032-DPP-NEXT:    s_addc_u32 s49, s49, 0
-; GFX1032-DPP-NEXT:    s_mov_b32 s43, s8
+; GFX1032-DPP-NEXT:    s_addc_u32 s65, s65, 0
+; GFX1032-DPP-NEXT:    s_mov_b32 s51, s8
 ; GFX1032-DPP-NEXT:    s_add_u32 s8, s34, 44
-; GFX1032-DPP-NEXT:    s_mov_b32 s42, s9
+; GFX1032-DPP-NEXT:    s_mov_b32 s50, s9
 ; GFX1032-DPP-NEXT:    s_addc_u32 s9, s35, 0
-; GFX1032-DPP-NEXT:    s_mov_b64 s[40:41], s[0:1]
+; GFX1032-DPP-NEXT:    s_mov_b64 s[48:49], s[0:1]
 ; GFX1032-DPP-NEXT:    s_getpc_b64 s[0:1]
 ; GFX1032-DPP-NEXT:    s_add_u32 s0, s0, div.float.value at gotpcrel32@lo+4
 ; GFX1032-DPP-NEXT:    s_addc_u32 s1, s1, div.float.value at gotpcrel32@hi+12
@@ -14036,14 +14036,14 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau
 ; GFX1032-DPP-NEXT:    s_mov_b64 s[38:39], s[2:3]
 ; GFX1032-DPP-NEXT:    s_mov_b64 s[6:7], s[2:3]
 ; GFX1032-DPP-NEXT:    v_or3_b32 v40, v0, v1, v2
-; GFX1032-DPP-NEXT:    s_mov_b64 s[0:1], s[48:49]
-; GFX1032-DPP-NEXT:    s_mov_b64 s[4:5], s[40:41]
+; GFX1032-DPP-NEXT:    s_mov_b64 s[0:1], s[64:65]
+; GFX1032-DPP-NEXT:    s_mov_b64 s[4:5], s[48:49]
 ; GFX1032-DPP-NEXT:    s_mov_b64 s[10:11], s[36:37]
-; GFX1032-DPP-NEXT:    s_mov_b32 s12, s43
+; GFX1032-DPP-NEXT:    s_mov_b32 s12, s51
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v31, v40
-; GFX1032-DPP-NEXT:    s_mov_b32 s13, s42
+; GFX1032-DPP-NEXT:    s_mov_b32 s13, s50
 ; GFX1032-DPP-NEXT:    s_mov_b32 s14, s33
-; GFX1032-DPP-NEXT:    s_mov_b64 s[2:3], s[50:51]
+; GFX1032-DPP-NEXT:    s_mov_b64 s[2:3], s[66:67]
 ; GFX1032-DPP-NEXT:    s_movk_i32 s32, 0x400
 ; GFX1032-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1032-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
@@ -14078,14 +14078,14 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v41, v8
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v42, v9
-; GFX1032-DPP-NEXT:    s_mov_b32 s46, 0
+; GFX1032-DPP-NEXT:    s_mov_b32 s54, 0
 ; GFX1032-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
 ; GFX1032-DPP-NEXT:    s_and_saveexec_b32 s0, vcc_lo
 ; GFX1032-DPP-NEXT:    s_cbranch_execz .LBB17_3
 ; GFX1032-DPP-NEXT:  ; %bb.1:
-; GFX1032-DPP-NEXT:    s_load_dwordx2 s[44:45], s[34:35], 0x24
+; GFX1032-DPP-NEXT:    s_load_dwordx2 s[52:53], s[34:35], 0x24
 ; GFX1032-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT:    global_load_dwordx2 v[1:2], v0, s[44:45]
+; GFX1032-DPP-NEXT:    global_load_dwordx2 v[1:2], v0, s[52:53]
 ; GFX1032-DPP-NEXT:  .LBB17_2: ; %atomicrmw.start
 ; GFX1032-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1032-DPP-NEXT:    s_waitcnt vmcnt(0)
@@ -14095,37 +14095,37 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau
 ; GFX1032-DPP-NEXT:    s_getpc_b64 s[0:1]
 ; GFX1032-DPP-NEXT:    s_add_u32 s0, s0, __atomic_compare_exchange at gotpcrel32@lo+4
 ; GFX1032-DPP-NEXT:    s_addc_u32 s1, s1, __atomic_compare_exchange at gotpcrel32@hi+12
-; GFX1032-DPP-NEXT:    buffer_store_dword v2, off, s[48:51], 0 offset:4
-; GFX1032-DPP-NEXT:    buffer_store_dword v1, off, s[48:51], 0
+; GFX1032-DPP-NEXT:    buffer_store_dword v2, off, s[64:67], 0 offset:4
+; GFX1032-DPP-NEXT:    buffer_store_dword v1, off, s[64:67], 0
 ; GFX1032-DPP-NEXT:    s_load_dwordx2 s[16:17], s[0:1], 0x0
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v31, v40
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v0, 8
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v1, 0
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v2, s44
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v2, s52
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v5, 8
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v7, 0
-; GFX1032-DPP-NEXT:    s_mov_b64 s[0:1], s[48:49]
-; GFX1032-DPP-NEXT:    s_mov_b64 s[4:5], s[40:41]
+; GFX1032-DPP-NEXT:    s_mov_b64 s[0:1], s[64:65]
+; GFX1032-DPP-NEXT:    s_mov_b64 s[4:5], s[48:49]
 ; GFX1032-DPP-NEXT:    s_mov_b64 s[6:7], s[38:39]
 ; GFX1032-DPP-NEXT:    s_mov_b64 s[10:11], s[36:37]
-; GFX1032-DPP-NEXT:    s_mov_b32 s12, s43
-; GFX1032-DPP-NEXT:    s_mov_b32 s13, s42
+; GFX1032-DPP-NEXT:    s_mov_b32 s12, s51
+; GFX1032-DPP-NEXT:    s_mov_b32 s13, s50
 ; GFX1032-DPP-NEXT:    s_mov_b32 s14, s33
-; GFX1032-DPP-NEXT:    s_mov_b64 s[2:3], s[50:51]
-; GFX1032-DPP-NEXT:    buffer_store_dword v4, off, s[48:51], 0 offset:12
-; GFX1032-DPP-NEXT:    buffer_store_dword v3, off, s[48:51], 0 offset:8
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v3, s45
+; GFX1032-DPP-NEXT:    s_mov_b64 s[2:3], s[66:67]
+; GFX1032-DPP-NEXT:    buffer_store_dword v4, off, s[64:67], 0 offset:12
+; GFX1032-DPP-NEXT:    buffer_store_dword v3, off, s[64:67], 0 offset:8
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v3, s53
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX1032-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1032-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GFX1032-DPP-NEXT:    s_clause 0x1
-; GFX1032-DPP-NEXT:    buffer_load_dword v1, off, s[48:51], 0
-; GFX1032-DPP-NEXT:    buffer_load_dword v2, off, s[48:51], 0 offset:4
+; GFX1032-DPP-NEXT:    buffer_load_dword v1, off, s[64:67], 0
+; GFX1032-DPP-NEXT:    buffer_load_dword v2, off, s[64:67], 0 offset:4
 ; GFX1032-DPP-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX1032-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX1032-DPP-NEXT:    s_or_b32 s46, vcc_lo, s46
-; GFX1032-DPP-NEXT:    s_andn2_b32 exec_lo, exec_lo, s46
+; GFX1032-DPP-NEXT:    s_or_b32 s54, vcc_lo, s54
+; GFX1032-DPP-NEXT:    s_andn2_b32 exec_lo, exec_lo, s54
 ; GFX1032-DPP-NEXT:    s_cbranch_execnz .LBB17_2
 ; GFX1032-DPP-NEXT:  .LBB17_3:
 ; GFX1032-DPP-NEXT:    s_endpgm
@@ -14133,11 +14133,11 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau
 ; GFX1164-DPP-LABEL: global_atomic_fsub_double_uni_address_div_value_default_scope_strictfp:
 ; GFX1164-DPP:       ; %bb.0:
 ; GFX1164-DPP-NEXT:    s_mov_b64 s[34:35], s[4:5]
-; GFX1164-DPP-NEXT:    s_mov_b32 s43, s8
+; GFX1164-DPP-NEXT:    s_mov_b32 s51, s8
 ; GFX1164-DPP-NEXT:    s_add_u32 s8, s34, 44
-; GFX1164-DPP-NEXT:    s_mov_b32 s42, s9
+; GFX1164-DPP-NEXT:    s_mov_b32 s50, s9
 ; GFX1164-DPP-NEXT:    s_addc_u32 s9, s35, 0
-; GFX1164-DPP-NEXT:    s_mov_b64 s[40:41], s[0:1]
+; GFX1164-DPP-NEXT:    s_mov_b64 s[48:49], s[0:1]
 ; GFX1164-DPP-NEXT:    s_getpc_b64 s[0:1]
 ; GFX1164-DPP-NEXT:    s_add_u32 s0, s0, div.float.value at gotpcrel32@lo+4
 ; GFX1164-DPP-NEXT:    s_addc_u32 s1, s1, div.float.value at gotpcrel32@hi+12
@@ -14145,11 +14145,11 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau
 ; GFX1164-DPP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
 ; GFX1164-DPP-NEXT:    s_mov_b32 s33, s10
 ; GFX1164-DPP-NEXT:    s_mov_b64 s[36:37], s[6:7]
-; GFX1164-DPP-NEXT:    s_mov_b64 s[4:5], s[40:41]
+; GFX1164-DPP-NEXT:    s_mov_b64 s[4:5], s[48:49]
 ; GFX1164-DPP-NEXT:    s_mov_b64 s[6:7], s[2:3]
 ; GFX1164-DPP-NEXT:    s_mov_b64 s[10:11], s[36:37]
-; GFX1164-DPP-NEXT:    s_mov_b32 s12, s43
-; GFX1164-DPP-NEXT:    s_mov_b32 s13, s42
+; GFX1164-DPP-NEXT:    s_mov_b32 s12, s51
+; GFX1164-DPP-NEXT:    s_mov_b32 s13, s50
 ; GFX1164-DPP-NEXT:    s_mov_b32 s14, s33
 ; GFX1164-DPP-NEXT:    s_mov_b32 s32, 32
 ; GFX1164-DPP-NEXT:    v_mov_b32_e32 v40, v0
@@ -14207,10 +14207,10 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau
 ; GFX1164-DPP-NEXT:    v_cmpx_eq_u32_e32 0, v1
 ; GFX1164-DPP-NEXT:    s_cbranch_execz .LBB17_3
 ; GFX1164-DPP-NEXT:  ; %bb.1:
-; GFX1164-DPP-NEXT:    s_load_b64 s[44:45], s[34:35], 0x24
-; GFX1164-DPP-NEXT:    s_mov_b64 s[46:47], 0
+; GFX1164-DPP-NEXT:    s_load_b64 s[52:53], s[34:35], 0x24
+; GFX1164-DPP-NEXT:    s_mov_b64 s[54:55], 0
 ; GFX1164-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT:    global_load_b64 v[1:2], v0, s[44:45]
+; GFX1164-DPP-NEXT:    global_load_b64 v[1:2], v0, s[52:53]
 ; GFX1164-DPP-NEXT:    s_set_inst_prefetch_distance 0x1
 ; GFX1164-DPP-NEXT:    .p2align 6
 ; GFX1164-DPP-NEXT:  .LBB17_2: ; %atomicrmw.start
@@ -14227,18 +14227,18 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau
 ; GFX1164-DPP-NEXT:    v_mov_b32_e32 v31, v40
 ; GFX1164-DPP-NEXT:    v_mov_b32_e32 v0, 8
 ; GFX1164-DPP-NEXT:    v_mov_b32_e32 v1, 0
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v2, s44
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v2, s52
 ; GFX1164-DPP-NEXT:    v_mov_b32_e32 v5, 8
 ; GFX1164-DPP-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX1164-DPP-NEXT:    v_mov_b32_e32 v7, 0
-; GFX1164-DPP-NEXT:    s_mov_b64 s[4:5], s[40:41]
+; GFX1164-DPP-NEXT:    s_mov_b64 s[4:5], s[48:49]
 ; GFX1164-DPP-NEXT:    s_mov_b64 s[6:7], s[38:39]
 ; GFX1164-DPP-NEXT:    s_mov_b64 s[10:11], s[36:37]
-; GFX1164-DPP-NEXT:    s_mov_b32 s12, s43
-; GFX1164-DPP-NEXT:    s_mov_b32 s13, s42
+; GFX1164-DPP-NEXT:    s_mov_b32 s12, s51
+; GFX1164-DPP-NEXT:    s_mov_b32 s13, s50
 ; GFX1164-DPP-NEXT:    s_mov_b32 s14, s33
 ; GFX1164-DPP-NEXT:    scratch_store_b64 off, v[3:4], off offset:8
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v3, s45
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v3, s53
 ; GFX1164-DPP-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX1164-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1164-DPP-NEXT:    s_swappc_b64 s[30:31], s[0:1]
@@ -14246,8 +14246,8 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau
 ; GFX1164-DPP-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
 ; GFX1164-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX1164-DPP-NEXT:    s_or_b64 s[46:47], vcc, s[46:47]
-; GFX1164-DPP-NEXT:    s_and_not1_b64 exec, exec, s[46:47]
+; GFX1164-DPP-NEXT:    s_or_b64 s[54:55], vcc, s[54:55]
+; GFX1164-DPP-NEXT:    s_and_not1_b64 exec, exec, s[54:55]
 ; GFX1164-DPP-NEXT:    s_cbranch_execnz .LBB17_2
 ; GFX1164-DPP-NEXT:  .LBB17_3:
 ; GFX1164-DPP-NEXT:    s_set_inst_prefetch_distance 0x2
@@ -14256,7 +14256,7 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau
 ; GFX1132-DPP-LABEL: global_atomic_fsub_double_uni_address_div_value_default_scope_strictfp:
 ; GFX1132-DPP:       ; %bb.0:
 ; GFX1132-DPP-NEXT:    s_mov_b64 s[34:35], s[4:5]
-; GFX1132-DPP-NEXT:    s_mov_b64 s[40:41], s[0:1]
+; GFX1132-DPP-NEXT:    s_mov_b64 s[48:49], s[0:1]
 ; GFX1132-DPP-NEXT:    s_add_u32 s8, s34, 44
 ; GFX1132-DPP-NEXT:    s_addc_u32 s9, s35, 0
 ; GFX1132-DPP-NEXT:    s_getpc_b64 s[0:1]
@@ -14265,9 +14265,9 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau
 ; GFX1132-DPP-NEXT:    v_mov_b32_e32 v31, v0
 ; GFX1132-DPP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
 ; GFX1132-DPP-NEXT:    s_mov_b64 s[36:37], s[6:7]
-; GFX1132-DPP-NEXT:    s_mov_b32 s42, s14
-; GFX1132-DPP-NEXT:    s_mov_b32 s43, s13
-; GFX1132-DPP-NEXT:    s_mov_b64 s[4:5], s[40:41]
+; GFX1132-DPP-NEXT:    s_mov_b32 s50, s14
+; GFX1132-DPP-NEXT:    s_mov_b32 s51, s13
+; GFX1132-DPP-NEXT:    s_mov_b64 s[4:5], s[48:49]
 ; GFX1132-DPP-NEXT:    s_mov_b64 s[6:7], s[2:3]
 ; GFX1132-DPP-NEXT:    s_mov_b64 s[10:11], s[36:37]
 ; GFX1132-DPP-NEXT:    s_mov_b32 s12, s13
@@ -14318,14 +14318,14 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau
 ; GFX1132-DPP-NEXT:    v_mbcnt_lo_u32_b32 v1, exec_lo, 0
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2)
 ; GFX1132-DPP-NEXT:    v_mov_b32_e32 v42, v9
-; GFX1132-DPP-NEXT:    s_mov_b32 s46, 0
+; GFX1132-DPP-NEXT:    s_mov_b32 s54, 0
 ; GFX1132-DPP-NEXT:    s_mov_b32 s0, exec_lo
 ; GFX1132-DPP-NEXT:    v_cmpx_eq_u32_e32 0, v1
 ; GFX1132-DPP-NEXT:    s_cbranch_execz .LBB17_3
 ; GFX1132-DPP-NEXT:  ; %bb.1:
-; GFX1132-DPP-NEXT:    s_load_b64 s[44:45], s[34:35], 0x24
+; GFX1132-DPP-NEXT:    s_load_b64 s[52:53], s[34:35], 0x24
 ; GFX1132-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT:    global_load_b64 v[1:2], v0, s[44:45]
+; GFX1132-DPP-NEXT:    global_load_b64 v[1:2], v0, s[52:53]
 ; GFX1132-DPP-NEXT:    s_set_inst_prefetch_distance 0x1
 ; GFX1132-DPP-NEXT:    .p2align 6
 ; GFX1132-DPP-NEXT:  .LBB17_2: ; %atomicrmw.start
@@ -14340,25 +14340,25 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau
 ; GFX1132-DPP-NEXT:    scratch_store_b64 off, v[1:2], off
 ; GFX1132-DPP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
 ; GFX1132-DPP-NEXT:    v_dual_mov_b32 v31, v40 :: v_dual_mov_b32 v0, 8
-; GFX1132-DPP-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s44
+; GFX1132-DPP-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s52
 ; GFX1132-DPP-NEXT:    v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0
 ; GFX1132-DPP-NEXT:    v_mov_b32_e32 v7, 0
-; GFX1132-DPP-NEXT:    s_mov_b64 s[4:5], s[40:41]
+; GFX1132-DPP-NEXT:    s_mov_b64 s[4:5], s[48:49]
 ; GFX1132-DPP-NEXT:    s_mov_b64 s[6:7], s[38:39]
 ; GFX1132-DPP-NEXT:    s_mov_b64 s[10:11], s[36:37]
-; GFX1132-DPP-NEXT:    s_mov_b32 s12, s43
-; GFX1132-DPP-NEXT:    s_mov_b32 s13, s42
+; GFX1132-DPP-NEXT:    s_mov_b32 s12, s51
+; GFX1132-DPP-NEXT:    s_mov_b32 s13, s50
 ; GFX1132-DPP-NEXT:    s_mov_b32 s14, s33
 ; GFX1132-DPP-NEXT:    scratch_store_b64 off, v[3:4], off offset:8
-; GFX1132-DPP-NEXT:    v_dual_mov_b32 v3, s45 :: v_dual_mov_b32 v4, 0
+; GFX1132-DPP-NEXT:    v_dual_mov_b32 v3, s53 :: v_dual_mov_b32 v4, 0
 ; GFX1132-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132-DPP-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX1132-DPP-NEXT:    scratch_load_b64 v[1:2], off, off
 ; GFX1132-DPP-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
 ; GFX1132-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX1132-DPP-NEXT:    s_or_b32 s46, vcc_lo, s46
-; GFX1132-DPP-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s46
+; GFX1132-DPP-NEXT:    s_or_b32 s54, vcc_lo, s54
+; GFX1132-DPP-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s54
 ; GFX1132-DPP-NEXT:    s_cbranch_execnz .LBB17_2
 ; GFX1132-DPP-NEXT:  .LBB17_3:
 ; GFX1132-DPP-NEXT:    s_set_inst_prefetch_distance 0x2

diff  --git a/llvm/test/CodeGen/AMDGPU/greedy-alloc-fail-sgpr1024-spill.mir b/llvm/test/CodeGen/AMDGPU/greedy-alloc-fail-sgpr1024-spill.mir
index dde84af57ed25..da1175c02e94a 100644
--- a/llvm/test/CodeGen/AMDGPU/greedy-alloc-fail-sgpr1024-spill.mir
+++ b/llvm/test/CodeGen/AMDGPU/greedy-alloc-fail-sgpr1024-spill.mir
@@ -31,102 +31,105 @@ body:             |
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   renamable $sgpr34_sgpr35 = COPY $sgpr8_sgpr9
   ; CHECK-NEXT:   renamable $sgpr33 = COPY $sgpr15
-  ; CHECK-NEXT:   renamable $sgpr42 = COPY $sgpr14
+  ; CHECK-NEXT:   renamable $sgpr50 = COPY $sgpr14
   ; CHECK-NEXT:   renamable $sgpr36_sgpr37 = COPY $sgpr10_sgpr11
   ; CHECK-NEXT:   renamable $sgpr38_sgpr39 = COPY $sgpr6_sgpr7
-  ; CHECK-NEXT:   renamable $sgpr40_sgpr41 = COPY $sgpr4_sgpr5
-  ; CHECK-NEXT:   renamable $sgpr66_sgpr67 = S_LOAD_DWORDX2_IMM renamable $sgpr34_sgpr35, 0, 0 :: (dereferenceable invariant load (s64), align 16, addrspace 4)
-  ; CHECK-NEXT:   renamable $sgpr44 = S_MOV_B32 0
-  ; CHECK-NEXT:   renamable $sgpr45 = S_MOV_B32 0
-  ; CHECK-NEXT:   renamable $sgpr46 = S_MOV_B32 0
-  ; CHECK-NEXT:   renamable $sgpr47 = S_MOV_B32 0
-  ; CHECK-NEXT:   renamable $sgpr48 = S_MOV_B32 0
-  ; CHECK-NEXT:   renamable $sgpr49 = S_MOV_B32 0
-  ; CHECK-NEXT:   renamable $sgpr50 = S_MOV_B32 0
-  ; CHECK-NEXT:   renamable $sgpr51 = S_MOV_B32 0
-  ; CHECK-NEXT:   renamable $sgpr52 = S_MOV_B32 0
-  ; CHECK-NEXT:   renamable $sgpr53 = S_MOV_B32 0
-  ; CHECK-NEXT:   renamable $sgpr54 = S_MOV_B32 0
-  ; CHECK-NEXT:   renamable $sgpr55 = S_MOV_B32 0
-  ; CHECK-NEXT:   renamable $sgpr56 = S_MOV_B32 0
-  ; CHECK-NEXT:   renamable $sgpr57 = S_MOV_B32 0
-  ; CHECK-NEXT:   renamable $sgpr58 = S_MOV_B32 0
-  ; CHECK-NEXT:   renamable $sgpr59 = S_MOV_B32 0
-  ; CHECK-NEXT:   renamable $sgpr60 = S_MOV_B32 0
-  ; CHECK-NEXT:   renamable $sgpr61 = S_MOV_B32 0
-  ; CHECK-NEXT:   renamable $sgpr62 = S_MOV_B32 0
-  ; CHECK-NEXT:   renamable $sgpr63 = S_MOV_B32 0
-  ; CHECK-NEXT:   renamable $sgpr64 = S_MOV_B32 0
-  ; CHECK-NEXT:   renamable $sgpr68_sgpr69 = IMPLICIT_DEF
+  ; CHECK-NEXT:   renamable $sgpr48_sgpr49 = COPY $sgpr4_sgpr5
+  ; CHECK-NEXT:   renamable $sgpr64_sgpr65 = S_LOAD_DWORDX2_IMM renamable $sgpr34_sgpr35, 0, 0 :: (dereferenceable invariant load (s64), align 16, addrspace 4)
+  ; CHECK-NEXT:   renamable $sgpr68 = S_MOV_B32 0
+  ; CHECK-NEXT:   renamable $sgpr69 = S_MOV_B32 0
+  ; CHECK-NEXT:   renamable $sgpr70 = S_MOV_B32 0
+  ; CHECK-NEXT:   renamable $sgpr71 = S_MOV_B32 0
+  ; CHECK-NEXT:   renamable $sgpr72 = S_MOV_B32 0
+  ; CHECK-NEXT:   renamable $sgpr73 = S_MOV_B32 0
+  ; CHECK-NEXT:   renamable $sgpr74 = S_MOV_B32 0
+  ; CHECK-NEXT:   renamable $sgpr75 = S_MOV_B32 0
+  ; CHECK-NEXT:   renamable $sgpr76 = S_MOV_B32 0
+  ; CHECK-NEXT:   renamable $sgpr77 = S_MOV_B32 0
+  ; CHECK-NEXT:   renamable $sgpr78 = S_MOV_B32 0
+  ; CHECK-NEXT:   renamable $sgpr79 = S_MOV_B32 0
+  ; CHECK-NEXT:   renamable $sgpr80 = S_MOV_B32 0
+  ; CHECK-NEXT:   renamable $sgpr81 = S_MOV_B32 0
+  ; CHECK-NEXT:   renamable $sgpr82 = S_MOV_B32 0
+  ; CHECK-NEXT:   renamable $sgpr83 = S_MOV_B32 0
+  ; CHECK-NEXT:   renamable $sgpr84 = S_MOV_B32 0
+  ; CHECK-NEXT:   renamable $sgpr85 = S_MOV_B32 0
+  ; CHECK-NEXT:   renamable $sgpr86 = S_MOV_B32 0
+  ; CHECK-NEXT:   renamable $sgpr87 = S_MOV_B32 0
+  ; CHECK-NEXT:   renamable $sgpr88 = S_MOV_B32 0
+  ; CHECK-NEXT:   SI_SPILL_S1024_SAVE renamable $sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95_sgpr96_sgpr97_sgpr98_sgpr99, %stack.0, implicit $exec, implicit $sgpr32 :: (store (s1024) into %stack.0, align 4, addrspace 5)
+  ; CHECK-NEXT:   renamable $sgpr52_sgpr53 = IMPLICIT_DEF
   ; CHECK-NEXT:   ADJCALLSTACKUP 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32
-  ; CHECK-NEXT:   dead $sgpr30_sgpr31 = SI_CALL renamable $sgpr68_sgpr69, 0, csr_amdgpu, implicit $sgpr0_sgpr1_sgpr2_sgpr3
+  ; CHECK-NEXT:   dead $sgpr30_sgpr31 = SI_CALL renamable $sgpr52_sgpr53, 0, csr_amdgpu, implicit $sgpr0_sgpr1_sgpr2_sgpr3
   ; CHECK-NEXT:   ADJCALLSTACKDOWN 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32
   ; CHECK-NEXT:   ADJCALLSTACKUP 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32
-  ; CHECK-NEXT:   $sgpr4_sgpr5 = COPY killed renamable $sgpr40_sgpr41
+  ; CHECK-NEXT:   $sgpr4_sgpr5 = COPY killed renamable $sgpr48_sgpr49
   ; CHECK-NEXT:   $sgpr6_sgpr7 = COPY killed renamable $sgpr38_sgpr39
   ; CHECK-NEXT:   $sgpr8_sgpr9 = COPY killed renamable $sgpr34_sgpr35
   ; CHECK-NEXT:   $sgpr10_sgpr11 = COPY killed renamable $sgpr36_sgpr37
-  ; CHECK-NEXT:   $sgpr12 = COPY killed renamable $sgpr42
+  ; CHECK-NEXT:   $sgpr12 = COPY killed renamable $sgpr50
   ; CHECK-NEXT:   $sgpr13 = COPY killed renamable $sgpr33
-  ; CHECK-NEXT:   dead $sgpr30_sgpr31 = SI_CALL killed renamable $sgpr68_sgpr69, 0, csr_amdgpu, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr0_sgpr1_sgpr2_sgpr3
+  ; CHECK-NEXT:   dead $sgpr30_sgpr31 = SI_CALL killed renamable $sgpr52_sgpr53, 0, csr_amdgpu, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr0_sgpr1_sgpr2_sgpr3
   ; CHECK-NEXT:   ADJCALLSTACKDOWN 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32
   ; CHECK-NEXT:   renamable $sgpr4_sgpr5 = COPY $exec, implicit-def $exec
   ; CHECK-NEXT:   dead renamable $sgpr6_sgpr7 = IMPLICIT_DEF
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.1:
   ; CHECK-NEXT:   successors: %bb.2(0x40000000), %bb.4(0x40000000)
-  ; CHECK-NEXT:   liveins: $sgpr4_sgpr5, $sgpr66_sgpr67:0x000000000000000F, $sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55_sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63_sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75:0x000003FFFFFFFFFF
+  ; CHECK-NEXT:   liveins: $sgpr4_sgpr5, $sgpr64_sgpr65:0x000000000000000F
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   renamable $sgpr6_sgpr7 = COPY $exec, implicit-def $exec
   ; CHECK-NEXT:   S_CBRANCH_EXECZ %bb.4, implicit $exec
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.2:
   ; CHECK-NEXT:   successors: %bb.3(0x80000000)
-  ; CHECK-NEXT:   liveins: $sgpr4_sgpr5, $sgpr66_sgpr67:0x000000000000000F, $sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55_sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63_sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75:0x000003FFFFFFFFFF
+  ; CHECK-NEXT:   liveins: $sgpr4_sgpr5, $sgpr64_sgpr65:0x000000000000000F
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:vreg_1024 = COPY renamable $sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55_sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63_sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75
-  ; CHECK-NEXT:   renamable $sgpr6 = S_LSHL_B32 renamable $sgpr67, 1, implicit-def dead $scc
+  ; CHECK-NEXT:   renamable $sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95_sgpr96_sgpr97_sgpr98_sgpr99 = SI_SPILL_S1024_RESTORE %stack.0, implicit $exec, implicit $sgpr32 :: (load (s1024) from %stack.0, align 4, addrspace 5)
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:vreg_1024 = COPY renamable $sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95_sgpr96_sgpr97_sgpr98_sgpr99
+  ; CHECK-NEXT:   renamable $sgpr6 = S_LSHL_B32 renamable $sgpr65, 1, implicit-def dead $scc
   ; CHECK-NEXT:   dead [[COPY:%[0-9]+]]:vreg_1024 = V_INDIRECT_REG_WRITE_GPR_IDX_B32_V32 [[COPY]], 0, killed $sgpr6, 3, implicit-def $m0, implicit $m0, implicit $exec
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.3:
   ; CHECK-NEXT:   successors: %bb.5(0x40000000), %bb.1(0x40000000)
-  ; CHECK-NEXT:   liveins: $sgpr4_sgpr5, $sgpr66_sgpr67:0x000000000000000F, $sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55_sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63_sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75:0x000003FFFFFFFFFF
+  ; CHECK-NEXT:   liveins: $sgpr4_sgpr5, $sgpr64_sgpr65:0x000000000000000F, $sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95_sgpr96_sgpr97_sgpr98_sgpr99:0x0000000000000003
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   renamable $sgpr6_sgpr7 = S_OR_SAVEEXEC_B64 renamable $sgpr4_sgpr5, implicit-def $exec, implicit-def $scc, implicit $exec
-  ; CHECK-NEXT:   renamable $sgpr68 = COPY renamable $sgpr44
-  ; CHECK-NEXT:   renamable $sgpr69 = COPY renamable $sgpr44
-  ; CHECK-NEXT:   renamable $sgpr70 = COPY renamable $sgpr44
-  ; CHECK-NEXT:   renamable $sgpr71 = COPY renamable $sgpr44
-  ; CHECK-NEXT:   renamable $sgpr72 = COPY renamable $sgpr44
-  ; CHECK-NEXT:   renamable $sgpr73 = COPY renamable $sgpr44
-  ; CHECK-NEXT:   renamable $sgpr74 = COPY renamable $sgpr44
-  ; CHECK-NEXT:   renamable $sgpr75 = COPY renamable $sgpr44
-  ; CHECK-NEXT:   renamable $sgpr76 = COPY renamable $sgpr44
-  ; CHECK-NEXT:   renamable $sgpr77 = COPY renamable $sgpr44
-  ; CHECK-NEXT:   renamable $sgpr78 = COPY renamable $sgpr44
-  ; CHECK-NEXT:   renamable $sgpr79 = COPY renamable $sgpr44
-  ; CHECK-NEXT:   renamable $sgpr80 = COPY renamable $sgpr44
-  ; CHECK-NEXT:   renamable $sgpr81 = COPY renamable $sgpr44
-  ; CHECK-NEXT:   renamable $sgpr82 = COPY renamable $sgpr44
-  ; CHECK-NEXT:   renamable $sgpr83 = COPY renamable $sgpr44
-  ; CHECK-NEXT:   renamable $sgpr84 = COPY renamable $sgpr44
-  ; CHECK-NEXT:   renamable $sgpr85 = COPY renamable $sgpr44
-  ; CHECK-NEXT:   renamable $sgpr86 = COPY renamable $sgpr44
-  ; CHECK-NEXT:   renamable $sgpr87 = COPY renamable $sgpr44
-  ; CHECK-NEXT:   renamable $sgpr88 = COPY renamable $sgpr44
-  ; CHECK-NEXT:   renamable $sgpr89 = COPY renamable $sgpr44
-  ; CHECK-NEXT:   dead [[COPY1:%[0-9]+]]:vreg_1024 = COPY renamable $sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95_sgpr96_sgpr97_sgpr98_sgpr99, implicit $exec
+  ; CHECK-NEXT:   renamable $sgpr36 = COPY renamable $sgpr68
+  ; CHECK-NEXT:   renamable $sgpr37 = COPY renamable $sgpr68
+  ; CHECK-NEXT:   renamable $sgpr38 = COPY renamable $sgpr68
+  ; CHECK-NEXT:   renamable $sgpr39 = COPY renamable $sgpr68
+  ; CHECK-NEXT:   renamable $sgpr40 = COPY renamable $sgpr68
+  ; CHECK-NEXT:   renamable $sgpr41 = COPY renamable $sgpr68
+  ; CHECK-NEXT:   renamable $sgpr42 = COPY renamable $sgpr68
+  ; CHECK-NEXT:   renamable $sgpr43 = COPY renamable $sgpr68
+  ; CHECK-NEXT:   renamable $sgpr44 = COPY renamable $sgpr68
+  ; CHECK-NEXT:   renamable $sgpr45 = COPY renamable $sgpr68
+  ; CHECK-NEXT:   renamable $sgpr46 = COPY renamable $sgpr68
+  ; CHECK-NEXT:   renamable $sgpr47 = COPY renamable $sgpr68
+  ; CHECK-NEXT:   renamable $sgpr48 = COPY renamable $sgpr68
+  ; CHECK-NEXT:   renamable $sgpr49 = COPY renamable $sgpr68
+  ; CHECK-NEXT:   renamable $sgpr50 = COPY renamable $sgpr68
+  ; CHECK-NEXT:   renamable $sgpr51 = COPY renamable $sgpr68
+  ; CHECK-NEXT:   renamable $sgpr52 = COPY renamable $sgpr68
+  ; CHECK-NEXT:   renamable $sgpr53 = COPY renamable $sgpr68
+  ; CHECK-NEXT:   renamable $sgpr54 = COPY renamable $sgpr68
+  ; CHECK-NEXT:   renamable $sgpr55 = COPY renamable $sgpr68
+  ; CHECK-NEXT:   renamable $sgpr56 = COPY renamable $sgpr68
+  ; CHECK-NEXT:   renamable $sgpr57 = COPY killed renamable $sgpr68
+  ; CHECK-NEXT:   dead [[COPY1:%[0-9]+]]:vreg_1024 = COPY renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55_sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63_sgpr64_sgpr65_sgpr66_sgpr67, implicit $exec
   ; CHECK-NEXT:   $exec = S_XOR_B64_term $exec, killed renamable $sgpr6_sgpr7, implicit-def $scc
   ; CHECK-NEXT:   S_CBRANCH_EXECZ %bb.5, implicit $exec
   ; CHECK-NEXT:   S_BRANCH %bb.1
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.4:
   ; CHECK-NEXT:   successors: %bb.5(0x80000000)
-  ; CHECK-NEXT:   liveins: $sgpr6_sgpr7, $sgpr66_sgpr67:0x0000000000000003, $sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55_sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63_sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75:0x000003FFFFFFFFFF
+  ; CHECK-NEXT:   liveins: $sgpr6_sgpr7, $sgpr64_sgpr65:0x0000000000000003
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   $exec = S_OR_B64 $exec, killed renamable $sgpr6_sgpr7, implicit-def $scc
-  ; CHECK-NEXT:   dead renamable $sgpr4 = S_LSHL_B32 killed renamable $sgpr66, 1, implicit-def dead $scc
-  ; CHECK-NEXT:   dead [[COPY2:%[0-9]+]]:vreg_1024 = COPY renamable $sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55_sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63_sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75
+  ; CHECK-NEXT:   dead renamable $sgpr4 = S_LSHL_B32 killed renamable $sgpr64, 1, implicit-def dead $scc
+  ; CHECK-NEXT:   renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55_sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63_sgpr64_sgpr65_sgpr66_sgpr67 = SI_SPILL_S1024_RESTORE %stack.0, implicit $exec, implicit $sgpr32 :: (load (s1024) from %stack.0, align 4, addrspace 5)
+  ; CHECK-NEXT:   dead [[COPY2:%[0-9]+]]:vreg_1024 = COPY killed renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55_sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63_sgpr64_sgpr65_sgpr66_sgpr67
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.5:
   bb.0:
@@ -211,7 +214,6 @@ body:             |
     %15.sub19:sgpr_1024 = COPY %7.sub0
     %15.sub20:sgpr_1024 = COPY %7.sub0
     %15.sub21:sgpr_1024 = COPY %7.sub0
-    ; Spill code ends up getting inserted here, and we end up with many unspillable sgpr1024 ranges
     %16:vreg_1024 = COPY %15, implicit $exec
     $exec = S_XOR_B64_term $exec, %14, implicit-def $scc
     S_CBRANCH_EXECZ %bb.5, implicit $exec

diff  --git a/llvm/test/CodeGen/AMDGPU/identical-subrange-spill-infloop.ll b/llvm/test/CodeGen/AMDGPU/identical-subrange-spill-infloop.ll
index 81eac63ae5bdf..8dbd6c5d133ea 100644
--- a/llvm/test/CodeGen/AMDGPU/identical-subrange-spill-infloop.ll
+++ b/llvm/test/CodeGen/AMDGPU/identical-subrange-spill-infloop.ll
@@ -16,28 +16,20 @@ define void @main(i1 %arg) #0 {
 ; CHECK-NEXT:    v_writelane_b32 v5, s36, 4
 ; CHECK-NEXT:    v_writelane_b32 v5, s37, 5
 ; CHECK-NEXT:    v_writelane_b32 v5, s38, 6
-; CHECK-NEXT:    v_writelane_b32 v5, s39, 7
-; CHECK-NEXT:    v_writelane_b32 v5, s40, 8
-; CHECK-NEXT:    v_writelane_b32 v5, s41, 9
-; CHECK-NEXT:    v_writelane_b32 v5, s42, 10
-; CHECK-NEXT:    v_writelane_b32 v5, s43, 11
-; CHECK-NEXT:    v_writelane_b32 v5, s44, 12
-; CHECK-NEXT:    v_writelane_b32 v5, s45, 13
-; CHECK-NEXT:    v_writelane_b32 v5, s46, 14
 ; CHECK-NEXT:    s_getpc_b64 s[24:25]
-; CHECK-NEXT:    v_writelane_b32 v5, s47, 15
+; CHECK-NEXT:    v_writelane_b32 v5, s39, 7
 ; CHECK-NEXT:    s_movk_i32 s20, 0xf0
 ; CHECK-NEXT:    s_mov_b32 s21, s24
-; CHECK-NEXT:    v_writelane_b32 v5, s48, 16
+; CHECK-NEXT:    v_writelane_b32 v5, s48, 8
 ; CHECK-NEXT:    s_load_dwordx16 s[4:19], s[20:21], 0x0
 ; CHECK-NEXT:    s_mov_b64 s[20:21], 0
-; CHECK-NEXT:    v_writelane_b32 v5, s49, 17
+; CHECK-NEXT:    v_writelane_b32 v5, s49, 9
 ; CHECK-NEXT:    s_load_dwordx4 s[20:23], s[20:21], 0x0
-; CHECK-NEXT:    v_writelane_b32 v5, s50, 18
+; CHECK-NEXT:    v_writelane_b32 v5, s50, 10
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_movk_i32 s22, 0x130
 ; CHECK-NEXT:    s_mov_b32 s23, s24
-; CHECK-NEXT:    v_writelane_b32 v5, s51, 19
+; CHECK-NEXT:    v_writelane_b32 v5, s51, 11
 ; CHECK-NEXT:    s_load_dwordx16 s[36:51], s[22:23], 0x0
 ; CHECK-NEXT:    s_mov_b32 s28, 0
 ; CHECK-NEXT:    v_mov_b32_e32 v1, 0
@@ -49,60 +41,51 @@ define void @main(i1 %arg) #0 {
 ; CHECK-NEXT:    image_sample_lz v3, v[2:3], s[12:19], s[28:31] dmask:0x1
 ; CHECK-NEXT:    v_mov_b32_e32 v2, v1
 ; CHECK-NEXT:    ; implicit-def: $vgpr6 : SGPR spill to VGPR lane
-; CHECK-NEXT:    v_writelane_b32 v5, s52, 20
+; CHECK-NEXT:    v_writelane_b32 v5, s52, 12
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    v_writelane_b32 v6, s36, 0
-; CHECK-NEXT:    v_writelane_b32 v5, s53, 21
-; CHECK-NEXT:    v_writelane_b32 v5, s54, 22
-; CHECK-NEXT:    v_writelane_b32 v5, s55, 23
-; CHECK-NEXT:    v_writelane_b32 v5, s56, 24
 ; CHECK-NEXT:    v_writelane_b32 v6, s37, 1
-; CHECK-NEXT:    image_sample_lz v4, v[1:2], s[36:43], s[28:31] dmask:0x1
-; CHECK-NEXT:    v_writelane_b32 v5, s57, 25
 ; CHECK-NEXT:    v_writelane_b32 v6, s38, 2
-; CHECK-NEXT:    v_writelane_b32 v5, s58, 26
 ; CHECK-NEXT:    v_writelane_b32 v6, s39, 3
-; CHECK-NEXT:    v_writelane_b32 v5, s59, 27
 ; CHECK-NEXT:    v_writelane_b32 v6, s40, 4
-; CHECK-NEXT:    v_writelane_b32 v5, s60, 28
 ; CHECK-NEXT:    v_writelane_b32 v6, s41, 5
-; CHECK-NEXT:    v_writelane_b32 v5, s61, 29
+; CHECK-NEXT:    image_sample_lz v4, v[1:2], s[36:43], s[28:31] dmask:0x1
 ; CHECK-NEXT:    v_writelane_b32 v6, s42, 6
-; CHECK-NEXT:    v_writelane_b32 v5, s62, 30
 ; CHECK-NEXT:    v_writelane_b32 v6, s43, 7
-; CHECK-NEXT:    v_writelane_b32 v5, s63, 31
 ; CHECK-NEXT:    v_writelane_b32 v6, s44, 8
-; CHECK-NEXT:    v_writelane_b32 v5, s64, 32
 ; CHECK-NEXT:    v_writelane_b32 v6, s45, 9
-; CHECK-NEXT:    v_writelane_b32 v5, s65, 33
+; CHECK-NEXT:    v_writelane_b32 v5, s53, 13
 ; CHECK-NEXT:    v_writelane_b32 v6, s46, 10
-; CHECK-NEXT:    v_writelane_b32 v5, s66, 34
+; CHECK-NEXT:    v_writelane_b32 v5, s54, 14
 ; CHECK-NEXT:    v_writelane_b32 v6, s47, 11
-; CHECK-NEXT:    v_writelane_b32 v5, s67, 35
+; CHECK-NEXT:    v_writelane_b32 v5, s55, 15
 ; CHECK-NEXT:    v_writelane_b32 v6, s48, 12
-; CHECK-NEXT:    v_writelane_b32 v5, s68, 36
+; CHECK-NEXT:    v_writelane_b32 v5, s64, 16
 ; CHECK-NEXT:    v_writelane_b32 v6, s49, 13
-; CHECK-NEXT:    v_writelane_b32 v5, s69, 37
+; CHECK-NEXT:    v_writelane_b32 v5, s65, 17
 ; CHECK-NEXT:    v_writelane_b32 v6, s50, 14
-; CHECK-NEXT:    s_mov_b32 s34, 48
-; CHECK-NEXT:    s_movk_i32 s52, 0x1f0
-; CHECK-NEXT:    s_movk_i32 s68, 0x2f0
-; CHECK-NEXT:    s_mov_b32 s35, s24
-; CHECK-NEXT:    s_mov_b32 s53, s24
-; CHECK-NEXT:    s_mov_b32 s69, s24
+; CHECK-NEXT:    v_writelane_b32 v5, s66, 18
 ; CHECK-NEXT:    v_writelane_b32 v6, s51, 15
-; CHECK-NEXT:    s_load_dwordx8 s[20:27], s[34:35], 0x0
-; CHECK-NEXT:    s_load_dwordx16 s[36:51], s[52:53], 0x0
+; CHECK-NEXT:    s_mov_b32 s40, 48
+; CHECK-NEXT:    s_movk_i32 s56, 0x1f0
+; CHECK-NEXT:    s_movk_i32 s34, 0x2f0
+; CHECK-NEXT:    s_mov_b32 s41, s24
+; CHECK-NEXT:    s_mov_b32 s57, s24
+; CHECK-NEXT:    s_mov_b32 s35, s24
+; CHECK-NEXT:    v_writelane_b32 v5, s67, 19
+; CHECK-NEXT:    s_load_dwordx8 s[20:27], s[40:41], 0x0
+; CHECK-NEXT:    s_nop 0
+; CHECK-NEXT:    s_load_dwordx16 s[36:51], s[56:57], 0x0
 ; CHECK-NEXT:    v_and_b32_e32 v0, 1, v0
-; CHECK-NEXT:    s_load_dwordx16 s[52:67], s[68:69], 0x0
+; CHECK-NEXT:    s_load_dwordx16 s[52:67], s[34:35], 0x0
 ; CHECK-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; CHECK-NEXT:    v_writelane_b32 v5, s70, 38
-; CHECK-NEXT:    s_xor_b64 s[34:35], vcc, -1
-; CHECK-NEXT:    v_writelane_b32 v5, s71, 39
+; CHECK-NEXT:    v_writelane_b32 v5, s68, 20
+; CHECK-NEXT:    s_xor_b64 s[72:73], vcc, -1
+; CHECK-NEXT:    v_writelane_b32 v5, s69, 21
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
 ; CHECK-NEXT:    v_mul_f32_e32 v0, v4, v3
-; CHECK-NEXT:    s_and_saveexec_b64 vcc, s[34:35]
-; CHECK-NEXT:    s_xor_b64 s[68:69], exec, vcc
+; CHECK-NEXT:    s_and_saveexec_b64 vcc, s[72:73]
+; CHECK-NEXT:    s_xor_b64 s[34:35], exec, vcc
 ; CHECK-NEXT:    s_cbranch_execz .LBB0_3
 ; CHECK-NEXT:  ; %bb.1: ; %bb48
 ; CHECK-NEXT:    image_sample_lz v3, v[1:2], s[12:19], s[28:31] dmask:0x1
@@ -124,10 +107,10 @@ define void @main(i1 %arg) #0 {
 ; CHECK-NEXT:    s_mov_b64 vcc, vcc
 ; CHECK-NEXT:    s_cbranch_vccnz .LBB0_2
 ; CHECK-NEXT:  .LBB0_3: ; %Flow14
-; CHECK-NEXT:    s_andn2_saveexec_b64 s[12:13], s[68:69]
+; CHECK-NEXT:    s_andn2_saveexec_b64 s[12:13], s[34:35]
 ; CHECK-NEXT:    s_cbranch_execz .LBB0_10
 ; CHECK-NEXT:  ; %bb.4: ; %bb32
-; CHECK-NEXT:    s_and_saveexec_b64 s[14:15], s[34:35]
+; CHECK-NEXT:    s_and_saveexec_b64 s[14:15], s[72:73]
 ; CHECK-NEXT:    s_xor_b64 s[14:15], exec, s[14:15]
 ; CHECK-NEXT:    s_cbranch_execz .LBB0_6
 ; CHECK-NEXT:  ; %bb.5: ; %bb43
@@ -201,39 +184,21 @@ define void @main(i1 %arg) #0 {
 ; CHECK-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; CHECK-NEXT:  .LBB0_10: ; %UnifiedReturnBlock
 ; CHECK-NEXT:    s_or_b64 exec, exec, s[12:13]
-; CHECK-NEXT:    v_readlane_b32 s71, v5, 39
-; CHECK-NEXT:    v_readlane_b32 s70, v5, 38
-; CHECK-NEXT:    v_readlane_b32 s69, v5, 37
-; CHECK-NEXT:    v_readlane_b32 s68, v5, 36
+; CHECK-NEXT:    v_readlane_b32 s69, v5, 21
+; CHECK-NEXT:    v_readlane_b32 s68, v5, 20
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
-; CHECK-NEXT:    v_readlane_b32 s67, v5, 35
-; CHECK-NEXT:    v_readlane_b32 s66, v5, 34
-; CHECK-NEXT:    v_readlane_b32 s65, v5, 33
-; CHECK-NEXT:    v_readlane_b32 s64, v5, 32
-; CHECK-NEXT:    v_readlane_b32 s63, v5, 31
-; CHECK-NEXT:    v_readlane_b32 s62, v5, 30
-; CHECK-NEXT:    v_readlane_b32 s61, v5, 29
-; CHECK-NEXT:    v_readlane_b32 s60, v5, 28
-; CHECK-NEXT:    v_readlane_b32 s59, v5, 27
-; CHECK-NEXT:    v_readlane_b32 s58, v5, 26
-; CHECK-NEXT:    v_readlane_b32 s57, v5, 25
-; CHECK-NEXT:    v_readlane_b32 s56, v5, 24
-; CHECK-NEXT:    v_readlane_b32 s55, v5, 23
-; CHECK-NEXT:    v_readlane_b32 s54, v5, 22
-; CHECK-NEXT:    v_readlane_b32 s53, v5, 21
-; CHECK-NEXT:    v_readlane_b32 s52, v5, 20
-; CHECK-NEXT:    v_readlane_b32 s51, v5, 19
-; CHECK-NEXT:    v_readlane_b32 s50, v5, 18
-; CHECK-NEXT:    v_readlane_b32 s49, v5, 17
-; CHECK-NEXT:    v_readlane_b32 s48, v5, 16
-; CHECK-NEXT:    v_readlane_b32 s47, v5, 15
-; CHECK-NEXT:    v_readlane_b32 s46, v5, 14
-; CHECK-NEXT:    v_readlane_b32 s45, v5, 13
-; CHECK-NEXT:    v_readlane_b32 s44, v5, 12
-; CHECK-NEXT:    v_readlane_b32 s43, v5, 11
-; CHECK-NEXT:    v_readlane_b32 s42, v5, 10
-; CHECK-NEXT:    v_readlane_b32 s41, v5, 9
-; CHECK-NEXT:    v_readlane_b32 s40, v5, 8
+; CHECK-NEXT:    v_readlane_b32 s67, v5, 19
+; CHECK-NEXT:    v_readlane_b32 s66, v5, 18
+; CHECK-NEXT:    v_readlane_b32 s65, v5, 17
+; CHECK-NEXT:    v_readlane_b32 s64, v5, 16
+; CHECK-NEXT:    v_readlane_b32 s55, v5, 15
+; CHECK-NEXT:    v_readlane_b32 s54, v5, 14
+; CHECK-NEXT:    v_readlane_b32 s53, v5, 13
+; CHECK-NEXT:    v_readlane_b32 s52, v5, 12
+; CHECK-NEXT:    v_readlane_b32 s51, v5, 11
+; CHECK-NEXT:    v_readlane_b32 s50, v5, 10
+; CHECK-NEXT:    v_readlane_b32 s49, v5, 9
+; CHECK-NEXT:    v_readlane_b32 s48, v5, 8
 ; CHECK-NEXT:    v_readlane_b32 s39, v5, 7
 ; CHECK-NEXT:    v_readlane_b32 s38, v5, 6
 ; CHECK-NEXT:    v_readlane_b32 s37, v5, 5

diff  --git a/llvm/test/CodeGen/AMDGPU/indirect-call.ll b/llvm/test/CodeGen/AMDGPU/indirect-call.ll
index 55da485b91f67..d7c4f6afbdade 100644
--- a/llvm/test/CodeGen/AMDGPU/indirect-call.ll
+++ b/llvm/test/CodeGen/AMDGPU/indirect-call.ll
@@ -136,55 +136,55 @@ define void @test_indirect_call_vgpr_ptr(ptr %fptr) {
 ; GCN-NEXT:    v_writelane_b32 v40, s37, 5
 ; GCN-NEXT:    v_writelane_b32 v40, s38, 6
 ; GCN-NEXT:    v_writelane_b32 v40, s39, 7
-; GCN-NEXT:    v_writelane_b32 v40, s40, 8
-; GCN-NEXT:    v_writelane_b32 v40, s41, 9
-; GCN-NEXT:    v_writelane_b32 v40, s42, 10
-; GCN-NEXT:    v_writelane_b32 v40, s43, 11
-; GCN-NEXT:    v_writelane_b32 v40, s44, 12
-; GCN-NEXT:    v_writelane_b32 v40, s45, 13
-; GCN-NEXT:    v_writelane_b32 v40, s46, 14
-; GCN-NEXT:    v_writelane_b32 v40, s47, 15
-; GCN-NEXT:    v_writelane_b32 v40, s48, 16
-; GCN-NEXT:    v_writelane_b32 v40, s49, 17
-; GCN-NEXT:    s_mov_b32 s42, s15
-; GCN-NEXT:    s_mov_b32 s43, s14
-; GCN-NEXT:    s_mov_b32 s44, s13
-; GCN-NEXT:    s_mov_b32 s45, s12
+; GCN-NEXT:    v_writelane_b32 v40, s48, 8
+; GCN-NEXT:    v_writelane_b32 v40, s49, 9
+; GCN-NEXT:    v_writelane_b32 v40, s50, 10
+; GCN-NEXT:    v_writelane_b32 v40, s51, 11
+; GCN-NEXT:    v_writelane_b32 v40, s52, 12
+; GCN-NEXT:    v_writelane_b32 v40, s53, 13
+; GCN-NEXT:    v_writelane_b32 v40, s54, 14
+; GCN-NEXT:    v_writelane_b32 v40, s55, 15
+; GCN-NEXT:    v_writelane_b32 v40, s64, 16
+; GCN-NEXT:    v_writelane_b32 v40, s65, 17
+; GCN-NEXT:    s_mov_b32 s50, s15
+; GCN-NEXT:    s_mov_b32 s51, s14
+; GCN-NEXT:    s_mov_b32 s52, s13
+; GCN-NEXT:    s_mov_b32 s53, s12
 ; GCN-NEXT:    s_mov_b64 s[34:35], s[10:11]
 ; GCN-NEXT:    s_mov_b64 s[36:37], s[8:9]
 ; GCN-NEXT:    s_mov_b64 s[38:39], s[6:7]
-; GCN-NEXT:    s_mov_b64 s[40:41], s[4:5]
-; GCN-NEXT:    s_mov_b64 s[46:47], exec
+; GCN-NEXT:    s_mov_b64 s[48:49], s[4:5]
+; GCN-NEXT:    s_mov_b64 s[54:55], exec
 ; GCN-NEXT:  .LBB2_1: ; =>This Inner Loop Header: Depth=1
 ; GCN-NEXT:    v_readfirstlane_b32 s16, v0
 ; GCN-NEXT:    v_readfirstlane_b32 s17, v1
 ; GCN-NEXT:    v_cmp_eq_u64_e32 vcc, s[16:17], v[0:1]
-; GCN-NEXT:    s_and_saveexec_b64 s[48:49], vcc
-; GCN-NEXT:    s_mov_b64 s[4:5], s[40:41]
+; GCN-NEXT:    s_and_saveexec_b64 s[64:65], vcc
+; GCN-NEXT:    s_mov_b64 s[4:5], s[48:49]
 ; GCN-NEXT:    s_mov_b64 s[6:7], s[38:39]
 ; GCN-NEXT:    s_mov_b64 s[8:9], s[36:37]
 ; GCN-NEXT:    s_mov_b64 s[10:11], s[34:35]
-; GCN-NEXT:    s_mov_b32 s12, s45
-; GCN-NEXT:    s_mov_b32 s13, s44
-; GCN-NEXT:    s_mov_b32 s14, s43
-; GCN-NEXT:    s_mov_b32 s15, s42
+; GCN-NEXT:    s_mov_b32 s12, s53
+; GCN-NEXT:    s_mov_b32 s13, s52
+; GCN-NEXT:    s_mov_b32 s14, s51
+; GCN-NEXT:    s_mov_b32 s15, s50
 ; GCN-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GCN-NEXT:    ; implicit-def: $vgpr0_vgpr1
 ; GCN-NEXT:    ; implicit-def: $vgpr31
-; GCN-NEXT:    s_xor_b64 exec, exec, s[48:49]
+; GCN-NEXT:    s_xor_b64 exec, exec, s[64:65]
 ; GCN-NEXT:    s_cbranch_execnz .LBB2_1
 ; GCN-NEXT:  ; %bb.2:
-; GCN-NEXT:    s_mov_b64 exec, s[46:47]
-; GCN-NEXT:    v_readlane_b32 s49, v40, 17
-; GCN-NEXT:    v_readlane_b32 s48, v40, 16
-; GCN-NEXT:    v_readlane_b32 s47, v40, 15
-; GCN-NEXT:    v_readlane_b32 s46, v40, 14
-; GCN-NEXT:    v_readlane_b32 s45, v40, 13
-; GCN-NEXT:    v_readlane_b32 s44, v40, 12
-; GCN-NEXT:    v_readlane_b32 s43, v40, 11
-; GCN-NEXT:    v_readlane_b32 s42, v40, 10
-; GCN-NEXT:    v_readlane_b32 s41, v40, 9
-; GCN-NEXT:    v_readlane_b32 s40, v40, 8
+; GCN-NEXT:    s_mov_b64 exec, s[54:55]
+; GCN-NEXT:    v_readlane_b32 s65, v40, 17
+; GCN-NEXT:    v_readlane_b32 s64, v40, 16
+; GCN-NEXT:    v_readlane_b32 s55, v40, 15
+; GCN-NEXT:    v_readlane_b32 s54, v40, 14
+; GCN-NEXT:    v_readlane_b32 s53, v40, 13
+; GCN-NEXT:    v_readlane_b32 s52, v40, 12
+; GCN-NEXT:    v_readlane_b32 s51, v40, 11
+; GCN-NEXT:    v_readlane_b32 s50, v40, 10
+; GCN-NEXT:    v_readlane_b32 s49, v40, 9
+; GCN-NEXT:    v_readlane_b32 s48, v40, 8
 ; GCN-NEXT:    v_readlane_b32 s39, v40, 7
 ; GCN-NEXT:    v_readlane_b32 s38, v40, 6
 ; GCN-NEXT:    v_readlane_b32 s37, v40, 5
@@ -220,55 +220,55 @@ define void @test_indirect_call_vgpr_ptr(ptr %fptr) {
 ; GISEL-NEXT:    v_writelane_b32 v40, s37, 5
 ; GISEL-NEXT:    v_writelane_b32 v40, s38, 6
 ; GISEL-NEXT:    v_writelane_b32 v40, s39, 7
-; GISEL-NEXT:    v_writelane_b32 v40, s40, 8
-; GISEL-NEXT:    v_writelane_b32 v40, s41, 9
-; GISEL-NEXT:    v_writelane_b32 v40, s42, 10
-; GISEL-NEXT:    v_writelane_b32 v40, s43, 11
-; GISEL-NEXT:    v_writelane_b32 v40, s44, 12
-; GISEL-NEXT:    v_writelane_b32 v40, s45, 13
-; GISEL-NEXT:    v_writelane_b32 v40, s46, 14
-; GISEL-NEXT:    v_writelane_b32 v40, s47, 15
-; GISEL-NEXT:    v_writelane_b32 v40, s48, 16
-; GISEL-NEXT:    v_writelane_b32 v40, s49, 17
-; GISEL-NEXT:    s_mov_b32 s42, s15
-; GISEL-NEXT:    s_mov_b32 s43, s14
-; GISEL-NEXT:    s_mov_b32 s44, s13
-; GISEL-NEXT:    s_mov_b32 s45, s12
+; GISEL-NEXT:    v_writelane_b32 v40, s48, 8
+; GISEL-NEXT:    v_writelane_b32 v40, s49, 9
+; GISEL-NEXT:    v_writelane_b32 v40, s50, 10
+; GISEL-NEXT:    v_writelane_b32 v40, s51, 11
+; GISEL-NEXT:    v_writelane_b32 v40, s52, 12
+; GISEL-NEXT:    v_writelane_b32 v40, s53, 13
+; GISEL-NEXT:    v_writelane_b32 v40, s54, 14
+; GISEL-NEXT:    v_writelane_b32 v40, s55, 15
+; GISEL-NEXT:    v_writelane_b32 v40, s64, 16
+; GISEL-NEXT:    v_writelane_b32 v40, s65, 17
+; GISEL-NEXT:    s_mov_b32 s50, s15
+; GISEL-NEXT:    s_mov_b32 s51, s14
+; GISEL-NEXT:    s_mov_b32 s52, s13
+; GISEL-NEXT:    s_mov_b32 s53, s12
 ; GISEL-NEXT:    s_mov_b64 s[34:35], s[10:11]
 ; GISEL-NEXT:    s_mov_b64 s[36:37], s[8:9]
 ; GISEL-NEXT:    s_mov_b64 s[38:39], s[6:7]
-; GISEL-NEXT:    s_mov_b64 s[40:41], s[4:5]
-; GISEL-NEXT:    s_mov_b64 s[46:47], exec
+; GISEL-NEXT:    s_mov_b64 s[48:49], s[4:5]
+; GISEL-NEXT:    s_mov_b64 s[54:55], exec
 ; GISEL-NEXT:  .LBB2_1: ; =>This Inner Loop Header: Depth=1
 ; GISEL-NEXT:    v_readfirstlane_b32 s16, v0
 ; GISEL-NEXT:    v_readfirstlane_b32 s17, v1
 ; GISEL-NEXT:    v_cmp_eq_u64_e32 vcc, s[16:17], v[0:1]
-; GISEL-NEXT:    s_and_saveexec_b64 s[48:49], vcc
-; GISEL-NEXT:    s_mov_b64 s[4:5], s[40:41]
+; GISEL-NEXT:    s_and_saveexec_b64 s[64:65], vcc
+; GISEL-NEXT:    s_mov_b64 s[4:5], s[48:49]
 ; GISEL-NEXT:    s_mov_b64 s[6:7], s[38:39]
 ; GISEL-NEXT:    s_mov_b64 s[8:9], s[36:37]
 ; GISEL-NEXT:    s_mov_b64 s[10:11], s[34:35]
-; GISEL-NEXT:    s_mov_b32 s12, s45
-; GISEL-NEXT:    s_mov_b32 s13, s44
-; GISEL-NEXT:    s_mov_b32 s14, s43
-; GISEL-NEXT:    s_mov_b32 s15, s42
+; GISEL-NEXT:    s_mov_b32 s12, s53
+; GISEL-NEXT:    s_mov_b32 s13, s52
+; GISEL-NEXT:    s_mov_b32 s14, s51
+; GISEL-NEXT:    s_mov_b32 s15, s50
 ; GISEL-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GISEL-NEXT:    ; implicit-def: $vgpr0
 ; GISEL-NEXT:    ; implicit-def: $vgpr31
-; GISEL-NEXT:    s_xor_b64 exec, exec, s[48:49]
+; GISEL-NEXT:    s_xor_b64 exec, exec, s[64:65]
 ; GISEL-NEXT:    s_cbranch_execnz .LBB2_1
 ; GISEL-NEXT:  ; %bb.2:
-; GISEL-NEXT:    s_mov_b64 exec, s[46:47]
-; GISEL-NEXT:    v_readlane_b32 s49, v40, 17
-; GISEL-NEXT:    v_readlane_b32 s48, v40, 16
-; GISEL-NEXT:    v_readlane_b32 s47, v40, 15
-; GISEL-NEXT:    v_readlane_b32 s46, v40, 14
-; GISEL-NEXT:    v_readlane_b32 s45, v40, 13
-; GISEL-NEXT:    v_readlane_b32 s44, v40, 12
-; GISEL-NEXT:    v_readlane_b32 s43, v40, 11
-; GISEL-NEXT:    v_readlane_b32 s42, v40, 10
-; GISEL-NEXT:    v_readlane_b32 s41, v40, 9
-; GISEL-NEXT:    v_readlane_b32 s40, v40, 8
+; GISEL-NEXT:    s_mov_b64 exec, s[54:55]
+; GISEL-NEXT:    v_readlane_b32 s65, v40, 17
+; GISEL-NEXT:    v_readlane_b32 s64, v40, 16
+; GISEL-NEXT:    v_readlane_b32 s55, v40, 15
+; GISEL-NEXT:    v_readlane_b32 s54, v40, 14
+; GISEL-NEXT:    v_readlane_b32 s53, v40, 13
+; GISEL-NEXT:    v_readlane_b32 s52, v40, 12
+; GISEL-NEXT:    v_readlane_b32 s51, v40, 11
+; GISEL-NEXT:    v_readlane_b32 s50, v40, 10
+; GISEL-NEXT:    v_readlane_b32 s49, v40, 9
+; GISEL-NEXT:    v_readlane_b32 s48, v40, 8
 ; GISEL-NEXT:    v_readlane_b32 s39, v40, 7
 ; GISEL-NEXT:    v_readlane_b32 s38, v40, 6
 ; GISEL-NEXT:    v_readlane_b32 s37, v40, 5
@@ -308,58 +308,58 @@ define void @test_indirect_call_vgpr_ptr_arg(ptr %fptr) {
 ; GCN-NEXT:    v_writelane_b32 v40, s37, 5
 ; GCN-NEXT:    v_writelane_b32 v40, s38, 6
 ; GCN-NEXT:    v_writelane_b32 v40, s39, 7
-; GCN-NEXT:    v_writelane_b32 v40, s40, 8
-; GCN-NEXT:    v_writelane_b32 v40, s41, 9
-; GCN-NEXT:    v_writelane_b32 v40, s42, 10
-; GCN-NEXT:    v_writelane_b32 v40, s43, 11
-; GCN-NEXT:    v_writelane_b32 v40, s44, 12
-; GCN-NEXT:    v_writelane_b32 v40, s45, 13
-; GCN-NEXT:    v_writelane_b32 v40, s46, 14
-; GCN-NEXT:    v_writelane_b32 v40, s47, 15
-; GCN-NEXT:    v_writelane_b32 v40, s48, 16
-; GCN-NEXT:    v_writelane_b32 v40, s49, 17
-; GCN-NEXT:    s_mov_b32 s42, s15
-; GCN-NEXT:    s_mov_b32 s43, s14
-; GCN-NEXT:    s_mov_b32 s44, s13
-; GCN-NEXT:    s_mov_b32 s45, s12
+; GCN-NEXT:    v_writelane_b32 v40, s48, 8
+; GCN-NEXT:    v_writelane_b32 v40, s49, 9
+; GCN-NEXT:    v_writelane_b32 v40, s50, 10
+; GCN-NEXT:    v_writelane_b32 v40, s51, 11
+; GCN-NEXT:    v_writelane_b32 v40, s52, 12
+; GCN-NEXT:    v_writelane_b32 v40, s53, 13
+; GCN-NEXT:    v_writelane_b32 v40, s54, 14
+; GCN-NEXT:    v_writelane_b32 v40, s55, 15
+; GCN-NEXT:    v_writelane_b32 v40, s64, 16
+; GCN-NEXT:    v_writelane_b32 v40, s65, 17
+; GCN-NEXT:    s_mov_b32 s50, s15
+; GCN-NEXT:    s_mov_b32 s51, s14
+; GCN-NEXT:    s_mov_b32 s52, s13
+; GCN-NEXT:    s_mov_b32 s53, s12
 ; GCN-NEXT:    s_mov_b64 s[34:35], s[10:11]
 ; GCN-NEXT:    s_mov_b64 s[36:37], s[8:9]
 ; GCN-NEXT:    s_mov_b64 s[38:39], s[6:7]
-; GCN-NEXT:    s_mov_b64 s[40:41], s[4:5]
-; GCN-NEXT:    s_mov_b64 s[46:47], exec
+; GCN-NEXT:    s_mov_b64 s[48:49], s[4:5]
+; GCN-NEXT:    s_mov_b64 s[54:55], exec
 ; GCN-NEXT:    v_mov_b32_e32 v2, 0x7b
 ; GCN-NEXT:  .LBB3_1: ; =>This Inner Loop Header: Depth=1
 ; GCN-NEXT:    v_readfirstlane_b32 s16, v0
 ; GCN-NEXT:    v_readfirstlane_b32 s17, v1
 ; GCN-NEXT:    v_cmp_eq_u64_e32 vcc, s[16:17], v[0:1]
-; GCN-NEXT:    s_and_saveexec_b64 s[48:49], vcc
-; GCN-NEXT:    s_mov_b64 s[4:5], s[40:41]
+; GCN-NEXT:    s_and_saveexec_b64 s[64:65], vcc
+; GCN-NEXT:    s_mov_b64 s[4:5], s[48:49]
 ; GCN-NEXT:    s_mov_b64 s[6:7], s[38:39]
 ; GCN-NEXT:    s_mov_b64 s[8:9], s[36:37]
 ; GCN-NEXT:    s_mov_b64 s[10:11], s[34:35]
-; GCN-NEXT:    s_mov_b32 s12, s45
-; GCN-NEXT:    s_mov_b32 s13, s44
-; GCN-NEXT:    s_mov_b32 s14, s43
-; GCN-NEXT:    s_mov_b32 s15, s42
+; GCN-NEXT:    s_mov_b32 s12, s53
+; GCN-NEXT:    s_mov_b32 s13, s52
+; GCN-NEXT:    s_mov_b32 s14, s51
+; GCN-NEXT:    s_mov_b32 s15, s50
 ; GCN-NEXT:    v_mov_b32_e32 v0, v2
 ; GCN-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GCN-NEXT:    ; implicit-def: $vgpr0_vgpr1
 ; GCN-NEXT:    ; implicit-def: $vgpr31
 ; GCN-NEXT:    ; implicit-def: $vgpr2
-; GCN-NEXT:    s_xor_b64 exec, exec, s[48:49]
+; GCN-NEXT:    s_xor_b64 exec, exec, s[64:65]
 ; GCN-NEXT:    s_cbranch_execnz .LBB3_1
 ; GCN-NEXT:  ; %bb.2:
-; GCN-NEXT:    s_mov_b64 exec, s[46:47]
-; GCN-NEXT:    v_readlane_b32 s49, v40, 17
-; GCN-NEXT:    v_readlane_b32 s48, v40, 16
-; GCN-NEXT:    v_readlane_b32 s47, v40, 15
-; GCN-NEXT:    v_readlane_b32 s46, v40, 14
-; GCN-NEXT:    v_readlane_b32 s45, v40, 13
-; GCN-NEXT:    v_readlane_b32 s44, v40, 12
-; GCN-NEXT:    v_readlane_b32 s43, v40, 11
-; GCN-NEXT:    v_readlane_b32 s42, v40, 10
-; GCN-NEXT:    v_readlane_b32 s41, v40, 9
-; GCN-NEXT:    v_readlane_b32 s40, v40, 8
+; GCN-NEXT:    s_mov_b64 exec, s[54:55]
+; GCN-NEXT:    v_readlane_b32 s65, v40, 17
+; GCN-NEXT:    v_readlane_b32 s64, v40, 16
+; GCN-NEXT:    v_readlane_b32 s55, v40, 15
+; GCN-NEXT:    v_readlane_b32 s54, v40, 14
+; GCN-NEXT:    v_readlane_b32 s53, v40, 13
+; GCN-NEXT:    v_readlane_b32 s52, v40, 12
+; GCN-NEXT:    v_readlane_b32 s51, v40, 11
+; GCN-NEXT:    v_readlane_b32 s50, v40, 10
+; GCN-NEXT:    v_readlane_b32 s49, v40, 9
+; GCN-NEXT:    v_readlane_b32 s48, v40, 8
 ; GCN-NEXT:    v_readlane_b32 s39, v40, 7
 ; GCN-NEXT:    v_readlane_b32 s38, v40, 6
 ; GCN-NEXT:    v_readlane_b32 s37, v40, 5
@@ -395,56 +395,56 @@ define void @test_indirect_call_vgpr_ptr_arg(ptr %fptr) {
 ; GISEL-NEXT:    v_writelane_b32 v40, s37, 5
 ; GISEL-NEXT:    v_writelane_b32 v40, s38, 6
 ; GISEL-NEXT:    v_writelane_b32 v40, s39, 7
-; GISEL-NEXT:    v_writelane_b32 v40, s40, 8
-; GISEL-NEXT:    v_writelane_b32 v40, s41, 9
-; GISEL-NEXT:    v_writelane_b32 v40, s42, 10
-; GISEL-NEXT:    v_writelane_b32 v40, s43, 11
-; GISEL-NEXT:    v_writelane_b32 v40, s44, 12
-; GISEL-NEXT:    v_writelane_b32 v40, s45, 13
-; GISEL-NEXT:    v_writelane_b32 v40, s46, 14
-; GISEL-NEXT:    v_writelane_b32 v40, s47, 15
-; GISEL-NEXT:    v_writelane_b32 v40, s48, 16
-; GISEL-NEXT:    v_writelane_b32 v40, s49, 17
-; GISEL-NEXT:    s_mov_b32 s42, s15
-; GISEL-NEXT:    s_mov_b32 s43, s14
-; GISEL-NEXT:    s_mov_b32 s44, s13
-; GISEL-NEXT:    s_mov_b32 s45, s12
+; GISEL-NEXT:    v_writelane_b32 v40, s48, 8
+; GISEL-NEXT:    v_writelane_b32 v40, s49, 9
+; GISEL-NEXT:    v_writelane_b32 v40, s50, 10
+; GISEL-NEXT:    v_writelane_b32 v40, s51, 11
+; GISEL-NEXT:    v_writelane_b32 v40, s52, 12
+; GISEL-NEXT:    v_writelane_b32 v40, s53, 13
+; GISEL-NEXT:    v_writelane_b32 v40, s54, 14
+; GISEL-NEXT:    v_writelane_b32 v40, s55, 15
+; GISEL-NEXT:    v_writelane_b32 v40, s64, 16
+; GISEL-NEXT:    v_writelane_b32 v40, s65, 17
+; GISEL-NEXT:    s_mov_b32 s50, s15
+; GISEL-NEXT:    s_mov_b32 s51, s14
+; GISEL-NEXT:    s_mov_b32 s52, s13
+; GISEL-NEXT:    s_mov_b32 s53, s12
 ; GISEL-NEXT:    s_mov_b64 s[34:35], s[10:11]
 ; GISEL-NEXT:    s_mov_b64 s[36:37], s[8:9]
 ; GISEL-NEXT:    s_mov_b64 s[38:39], s[6:7]
-; GISEL-NEXT:    s_mov_b64 s[40:41], s[4:5]
-; GISEL-NEXT:    s_mov_b64 s[46:47], exec
+; GISEL-NEXT:    s_mov_b64 s[48:49], s[4:5]
+; GISEL-NEXT:    s_mov_b64 s[54:55], exec
 ; GISEL-NEXT:  .LBB3_1: ; =>This Inner Loop Header: Depth=1
 ; GISEL-NEXT:    v_readfirstlane_b32 s16, v0
 ; GISEL-NEXT:    v_readfirstlane_b32 s17, v1
 ; GISEL-NEXT:    v_cmp_eq_u64_e32 vcc, s[16:17], v[0:1]
-; GISEL-NEXT:    s_and_saveexec_b64 s[48:49], vcc
+; GISEL-NEXT:    s_and_saveexec_b64 s[64:65], vcc
 ; GISEL-NEXT:    v_mov_b32_e32 v0, 0x7b
-; GISEL-NEXT:    s_mov_b64 s[4:5], s[40:41]
+; GISEL-NEXT:    s_mov_b64 s[4:5], s[48:49]
 ; GISEL-NEXT:    s_mov_b64 s[6:7], s[38:39]
 ; GISEL-NEXT:    s_mov_b64 s[8:9], s[36:37]
 ; GISEL-NEXT:    s_mov_b64 s[10:11], s[34:35]
-; GISEL-NEXT:    s_mov_b32 s12, s45
-; GISEL-NEXT:    s_mov_b32 s13, s44
-; GISEL-NEXT:    s_mov_b32 s14, s43
-; GISEL-NEXT:    s_mov_b32 s15, s42
+; GISEL-NEXT:    s_mov_b32 s12, s53
+; GISEL-NEXT:    s_mov_b32 s13, s52
+; GISEL-NEXT:    s_mov_b32 s14, s51
+; GISEL-NEXT:    s_mov_b32 s15, s50
 ; GISEL-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GISEL-NEXT:    ; implicit-def: $vgpr0
 ; GISEL-NEXT:    ; implicit-def: $vgpr31
-; GISEL-NEXT:    s_xor_b64 exec, exec, s[48:49]
+; GISEL-NEXT:    s_xor_b64 exec, exec, s[64:65]
 ; GISEL-NEXT:    s_cbranch_execnz .LBB3_1
 ; GISEL-NEXT:  ; %bb.2:
-; GISEL-NEXT:    s_mov_b64 exec, s[46:47]
-; GISEL-NEXT:    v_readlane_b32 s49, v40, 17
-; GISEL-NEXT:    v_readlane_b32 s48, v40, 16
-; GISEL-NEXT:    v_readlane_b32 s47, v40, 15
-; GISEL-NEXT:    v_readlane_b32 s46, v40, 14
-; GISEL-NEXT:    v_readlane_b32 s45, v40, 13
-; GISEL-NEXT:    v_readlane_b32 s44, v40, 12
-; GISEL-NEXT:    v_readlane_b32 s43, v40, 11
-; GISEL-NEXT:    v_readlane_b32 s42, v40, 10
-; GISEL-NEXT:    v_readlane_b32 s41, v40, 9
-; GISEL-NEXT:    v_readlane_b32 s40, v40, 8
+; GISEL-NEXT:    s_mov_b64 exec, s[54:55]
+; GISEL-NEXT:    v_readlane_b32 s65, v40, 17
+; GISEL-NEXT:    v_readlane_b32 s64, v40, 16
+; GISEL-NEXT:    v_readlane_b32 s55, v40, 15
+; GISEL-NEXT:    v_readlane_b32 s54, v40, 14
+; GISEL-NEXT:    v_readlane_b32 s53, v40, 13
+; GISEL-NEXT:    v_readlane_b32 s52, v40, 12
+; GISEL-NEXT:    v_readlane_b32 s51, v40, 11
+; GISEL-NEXT:    v_readlane_b32 s50, v40, 10
+; GISEL-NEXT:    v_readlane_b32 s49, v40, 9
+; GISEL-NEXT:    v_readlane_b32 s48, v40, 8
 ; GISEL-NEXT:    v_readlane_b32 s39, v40, 7
 ; GISEL-NEXT:    v_readlane_b32 s38, v40, 6
 ; GISEL-NEXT:    v_readlane_b32 s37, v40, 5
@@ -484,57 +484,57 @@ define i32 @test_indirect_call_vgpr_ptr_ret(ptr %fptr) {
 ; GCN-NEXT:    v_writelane_b32 v40, s37, 5
 ; GCN-NEXT:    v_writelane_b32 v40, s38, 6
 ; GCN-NEXT:    v_writelane_b32 v40, s39, 7
-; GCN-NEXT:    v_writelane_b32 v40, s40, 8
-; GCN-NEXT:    v_writelane_b32 v40, s41, 9
-; GCN-NEXT:    v_writelane_b32 v40, s42, 10
-; GCN-NEXT:    v_writelane_b32 v40, s43, 11
-; GCN-NEXT:    v_writelane_b32 v40, s44, 12
-; GCN-NEXT:    v_writelane_b32 v40, s45, 13
-; GCN-NEXT:    v_writelane_b32 v40, s46, 14
-; GCN-NEXT:    v_writelane_b32 v40, s47, 15
-; GCN-NEXT:    v_writelane_b32 v40, s48, 16
-; GCN-NEXT:    v_writelane_b32 v40, s49, 17
-; GCN-NEXT:    s_mov_b32 s42, s15
-; GCN-NEXT:    s_mov_b32 s43, s14
-; GCN-NEXT:    s_mov_b32 s44, s13
-; GCN-NEXT:    s_mov_b32 s45, s12
+; GCN-NEXT:    v_writelane_b32 v40, s48, 8
+; GCN-NEXT:    v_writelane_b32 v40, s49, 9
+; GCN-NEXT:    v_writelane_b32 v40, s50, 10
+; GCN-NEXT:    v_writelane_b32 v40, s51, 11
+; GCN-NEXT:    v_writelane_b32 v40, s52, 12
+; GCN-NEXT:    v_writelane_b32 v40, s53, 13
+; GCN-NEXT:    v_writelane_b32 v40, s54, 14
+; GCN-NEXT:    v_writelane_b32 v40, s55, 15
+; GCN-NEXT:    v_writelane_b32 v40, s64, 16
+; GCN-NEXT:    v_writelane_b32 v40, s65, 17
+; GCN-NEXT:    s_mov_b32 s50, s15
+; GCN-NEXT:    s_mov_b32 s51, s14
+; GCN-NEXT:    s_mov_b32 s52, s13
+; GCN-NEXT:    s_mov_b32 s53, s12
 ; GCN-NEXT:    s_mov_b64 s[34:35], s[10:11]
 ; GCN-NEXT:    s_mov_b64 s[36:37], s[8:9]
 ; GCN-NEXT:    s_mov_b64 s[38:39], s[6:7]
-; GCN-NEXT:    s_mov_b64 s[40:41], s[4:5]
-; GCN-NEXT:    s_mov_b64 s[46:47], exec
+; GCN-NEXT:    s_mov_b64 s[48:49], s[4:5]
+; GCN-NEXT:    s_mov_b64 s[54:55], exec
 ; GCN-NEXT:  .LBB4_1: ; =>This Inner Loop Header: Depth=1
 ; GCN-NEXT:    v_readfirstlane_b32 s16, v0
 ; GCN-NEXT:    v_readfirstlane_b32 s17, v1
 ; GCN-NEXT:    v_cmp_eq_u64_e32 vcc, s[16:17], v[0:1]
-; GCN-NEXT:    s_and_saveexec_b64 s[48:49], vcc
-; GCN-NEXT:    s_mov_b64 s[4:5], s[40:41]
+; GCN-NEXT:    s_and_saveexec_b64 s[64:65], vcc
+; GCN-NEXT:    s_mov_b64 s[4:5], s[48:49]
 ; GCN-NEXT:    s_mov_b64 s[6:7], s[38:39]
 ; GCN-NEXT:    s_mov_b64 s[8:9], s[36:37]
 ; GCN-NEXT:    s_mov_b64 s[10:11], s[34:35]
-; GCN-NEXT:    s_mov_b32 s12, s45
-; GCN-NEXT:    s_mov_b32 s13, s44
-; GCN-NEXT:    s_mov_b32 s14, s43
-; GCN-NEXT:    s_mov_b32 s15, s42
+; GCN-NEXT:    s_mov_b32 s12, s53
+; GCN-NEXT:    s_mov_b32 s13, s52
+; GCN-NEXT:    s_mov_b32 s14, s51
+; GCN-NEXT:    s_mov_b32 s15, s50
 ; GCN-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GCN-NEXT:    v_mov_b32_e32 v2, v0
 ; GCN-NEXT:    ; implicit-def: $vgpr0_vgpr1
 ; GCN-NEXT:    ; implicit-def: $vgpr31
-; GCN-NEXT:    s_xor_b64 exec, exec, s[48:49]
+; GCN-NEXT:    s_xor_b64 exec, exec, s[64:65]
 ; GCN-NEXT:    s_cbranch_execnz .LBB4_1
 ; GCN-NEXT:  ; %bb.2:
-; GCN-NEXT:    s_mov_b64 exec, s[46:47]
+; GCN-NEXT:    s_mov_b64 exec, s[54:55]
 ; GCN-NEXT:    v_add_i32_e32 v0, vcc, 1, v2
-; GCN-NEXT:    v_readlane_b32 s49, v40, 17
-; GCN-NEXT:    v_readlane_b32 s48, v40, 16
-; GCN-NEXT:    v_readlane_b32 s47, v40, 15
-; GCN-NEXT:    v_readlane_b32 s46, v40, 14
-; GCN-NEXT:    v_readlane_b32 s45, v40, 13
-; GCN-NEXT:    v_readlane_b32 s44, v40, 12
-; GCN-NEXT:    v_readlane_b32 s43, v40, 11
-; GCN-NEXT:    v_readlane_b32 s42, v40, 10
-; GCN-NEXT:    v_readlane_b32 s41, v40, 9
-; GCN-NEXT:    v_readlane_b32 s40, v40, 8
+; GCN-NEXT:    v_readlane_b32 s65, v40, 17
+; GCN-NEXT:    v_readlane_b32 s64, v40, 16
+; GCN-NEXT:    v_readlane_b32 s55, v40, 15
+; GCN-NEXT:    v_readlane_b32 s54, v40, 14
+; GCN-NEXT:    v_readlane_b32 s53, v40, 13
+; GCN-NEXT:    v_readlane_b32 s52, v40, 12
+; GCN-NEXT:    v_readlane_b32 s51, v40, 11
+; GCN-NEXT:    v_readlane_b32 s50, v40, 10
+; GCN-NEXT:    v_readlane_b32 s49, v40, 9
+; GCN-NEXT:    v_readlane_b32 s48, v40, 8
 ; GCN-NEXT:    v_readlane_b32 s39, v40, 7
 ; GCN-NEXT:    v_readlane_b32 s38, v40, 6
 ; GCN-NEXT:    v_readlane_b32 s37, v40, 5
@@ -570,57 +570,57 @@ define i32 @test_indirect_call_vgpr_ptr_ret(ptr %fptr) {
 ; GISEL-NEXT:    v_writelane_b32 v40, s37, 5
 ; GISEL-NEXT:    v_writelane_b32 v40, s38, 6
 ; GISEL-NEXT:    v_writelane_b32 v40, s39, 7
-; GISEL-NEXT:    v_writelane_b32 v40, s40, 8
-; GISEL-NEXT:    v_writelane_b32 v40, s41, 9
-; GISEL-NEXT:    v_writelane_b32 v40, s42, 10
-; GISEL-NEXT:    v_writelane_b32 v40, s43, 11
-; GISEL-NEXT:    v_writelane_b32 v40, s44, 12
-; GISEL-NEXT:    v_writelane_b32 v40, s45, 13
-; GISEL-NEXT:    v_writelane_b32 v40, s46, 14
-; GISEL-NEXT:    v_writelane_b32 v40, s47, 15
-; GISEL-NEXT:    v_writelane_b32 v40, s48, 16
-; GISEL-NEXT:    v_writelane_b32 v40, s49, 17
-; GISEL-NEXT:    s_mov_b32 s42, s15
-; GISEL-NEXT:    s_mov_b32 s43, s14
-; GISEL-NEXT:    s_mov_b32 s44, s13
-; GISEL-NEXT:    s_mov_b32 s45, s12
+; GISEL-NEXT:    v_writelane_b32 v40, s48, 8
+; GISEL-NEXT:    v_writelane_b32 v40, s49, 9
+; GISEL-NEXT:    v_writelane_b32 v40, s50, 10
+; GISEL-NEXT:    v_writelane_b32 v40, s51, 11
+; GISEL-NEXT:    v_writelane_b32 v40, s52, 12
+; GISEL-NEXT:    v_writelane_b32 v40, s53, 13
+; GISEL-NEXT:    v_writelane_b32 v40, s54, 14
+; GISEL-NEXT:    v_writelane_b32 v40, s55, 15
+; GISEL-NEXT:    v_writelane_b32 v40, s64, 16
+; GISEL-NEXT:    v_writelane_b32 v40, s65, 17
+; GISEL-NEXT:    s_mov_b32 s50, s15
+; GISEL-NEXT:    s_mov_b32 s51, s14
+; GISEL-NEXT:    s_mov_b32 s52, s13
+; GISEL-NEXT:    s_mov_b32 s53, s12
 ; GISEL-NEXT:    s_mov_b64 s[34:35], s[10:11]
 ; GISEL-NEXT:    s_mov_b64 s[36:37], s[8:9]
 ; GISEL-NEXT:    s_mov_b64 s[38:39], s[6:7]
-; GISEL-NEXT:    s_mov_b64 s[40:41], s[4:5]
-; GISEL-NEXT:    s_mov_b64 s[46:47], exec
+; GISEL-NEXT:    s_mov_b64 s[48:49], s[4:5]
+; GISEL-NEXT:    s_mov_b64 s[54:55], exec
 ; GISEL-NEXT:  .LBB4_1: ; =>This Inner Loop Header: Depth=1
 ; GISEL-NEXT:    v_readfirstlane_b32 s16, v0
 ; GISEL-NEXT:    v_readfirstlane_b32 s17, v1
 ; GISEL-NEXT:    v_cmp_eq_u64_e32 vcc, s[16:17], v[0:1]
-; GISEL-NEXT:    s_and_saveexec_b64 s[48:49], vcc
-; GISEL-NEXT:    s_mov_b64 s[4:5], s[40:41]
+; GISEL-NEXT:    s_and_saveexec_b64 s[64:65], vcc
+; GISEL-NEXT:    s_mov_b64 s[4:5], s[48:49]
 ; GISEL-NEXT:    s_mov_b64 s[6:7], s[38:39]
 ; GISEL-NEXT:    s_mov_b64 s[8:9], s[36:37]
 ; GISEL-NEXT:    s_mov_b64 s[10:11], s[34:35]
-; GISEL-NEXT:    s_mov_b32 s12, s45
-; GISEL-NEXT:    s_mov_b32 s13, s44
-; GISEL-NEXT:    s_mov_b32 s14, s43
-; GISEL-NEXT:    s_mov_b32 s15, s42
+; GISEL-NEXT:    s_mov_b32 s12, s53
+; GISEL-NEXT:    s_mov_b32 s13, s52
+; GISEL-NEXT:    s_mov_b32 s14, s51
+; GISEL-NEXT:    s_mov_b32 s15, s50
 ; GISEL-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GISEL-NEXT:    v_mov_b32_e32 v1, v0
 ; GISEL-NEXT:    ; implicit-def: $vgpr0
 ; GISEL-NEXT:    ; implicit-def: $vgpr31
-; GISEL-NEXT:    s_xor_b64 exec, exec, s[48:49]
+; GISEL-NEXT:    s_xor_b64 exec, exec, s[64:65]
 ; GISEL-NEXT:    s_cbranch_execnz .LBB4_1
 ; GISEL-NEXT:  ; %bb.2:
-; GISEL-NEXT:    s_mov_b64 exec, s[46:47]
+; GISEL-NEXT:    s_mov_b64 exec, s[54:55]
 ; GISEL-NEXT:    v_add_i32_e32 v0, vcc, 1, v1
-; GISEL-NEXT:    v_readlane_b32 s49, v40, 17
-; GISEL-NEXT:    v_readlane_b32 s48, v40, 16
-; GISEL-NEXT:    v_readlane_b32 s47, v40, 15
-; GISEL-NEXT:    v_readlane_b32 s46, v40, 14
-; GISEL-NEXT:    v_readlane_b32 s45, v40, 13
-; GISEL-NEXT:    v_readlane_b32 s44, v40, 12
-; GISEL-NEXT:    v_readlane_b32 s43, v40, 11
-; GISEL-NEXT:    v_readlane_b32 s42, v40, 10
-; GISEL-NEXT:    v_readlane_b32 s41, v40, 9
-; GISEL-NEXT:    v_readlane_b32 s40, v40, 8
+; GISEL-NEXT:    v_readlane_b32 s65, v40, 17
+; GISEL-NEXT:    v_readlane_b32 s64, v40, 16
+; GISEL-NEXT:    v_readlane_b32 s55, v40, 15
+; GISEL-NEXT:    v_readlane_b32 s54, v40, 14
+; GISEL-NEXT:    v_readlane_b32 s53, v40, 13
+; GISEL-NEXT:    v_readlane_b32 s52, v40, 12
+; GISEL-NEXT:    v_readlane_b32 s51, v40, 11
+; GISEL-NEXT:    v_readlane_b32 s50, v40, 10
+; GISEL-NEXT:    v_readlane_b32 s49, v40, 9
+; GISEL-NEXT:    v_readlane_b32 s48, v40, 8
 ; GISEL-NEXT:    v_readlane_b32 s39, v40, 7
 ; GISEL-NEXT:    v_readlane_b32 s38, v40, 6
 ; GISEL-NEXT:    v_readlane_b32 s37, v40, 5
@@ -661,66 +661,66 @@ define void @test_indirect_call_vgpr_ptr_in_branch(ptr %fptr, i1 %cond) {
 ; GCN-NEXT:    v_writelane_b32 v40, s37, 5
 ; GCN-NEXT:    v_writelane_b32 v40, s38, 6
 ; GCN-NEXT:    v_writelane_b32 v40, s39, 7
-; GCN-NEXT:    v_writelane_b32 v40, s40, 8
-; GCN-NEXT:    v_writelane_b32 v40, s41, 9
-; GCN-NEXT:    v_writelane_b32 v40, s42, 10
-; GCN-NEXT:    v_writelane_b32 v40, s43, 11
-; GCN-NEXT:    v_writelane_b32 v40, s44, 12
-; GCN-NEXT:    v_writelane_b32 v40, s45, 13
-; GCN-NEXT:    v_writelane_b32 v40, s46, 14
-; GCN-NEXT:    v_writelane_b32 v40, s47, 15
-; GCN-NEXT:    v_writelane_b32 v40, s48, 16
-; GCN-NEXT:    v_writelane_b32 v40, s49, 17
-; GCN-NEXT:    v_writelane_b32 v40, s50, 18
-; GCN-NEXT:    v_writelane_b32 v40, s51, 19
-; GCN-NEXT:    s_mov_b32 s42, s15
-; GCN-NEXT:    s_mov_b32 s43, s14
-; GCN-NEXT:    s_mov_b32 s44, s13
-; GCN-NEXT:    s_mov_b32 s45, s12
+; GCN-NEXT:    v_writelane_b32 v40, s48, 8
+; GCN-NEXT:    v_writelane_b32 v40, s49, 9
+; GCN-NEXT:    v_writelane_b32 v40, s50, 10
+; GCN-NEXT:    v_writelane_b32 v40, s51, 11
+; GCN-NEXT:    v_writelane_b32 v40, s52, 12
+; GCN-NEXT:    v_writelane_b32 v40, s53, 13
+; GCN-NEXT:    v_writelane_b32 v40, s54, 14
+; GCN-NEXT:    v_writelane_b32 v40, s55, 15
+; GCN-NEXT:    v_writelane_b32 v40, s64, 16
+; GCN-NEXT:    v_writelane_b32 v40, s65, 17
+; GCN-NEXT:    v_writelane_b32 v40, s66, 18
+; GCN-NEXT:    v_writelane_b32 v40, s67, 19
+; GCN-NEXT:    s_mov_b32 s50, s15
+; GCN-NEXT:    s_mov_b32 s51, s14
+; GCN-NEXT:    s_mov_b32 s52, s13
+; GCN-NEXT:    s_mov_b32 s53, s12
 ; GCN-NEXT:    s_mov_b64 s[34:35], s[10:11]
 ; GCN-NEXT:    s_mov_b64 s[36:37], s[8:9]
 ; GCN-NEXT:    s_mov_b64 s[38:39], s[6:7]
-; GCN-NEXT:    s_mov_b64 s[40:41], s[4:5]
+; GCN-NEXT:    s_mov_b64 s[48:49], s[4:5]
 ; GCN-NEXT:    v_and_b32_e32 v2, 1, v2
 ; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v2
-; GCN-NEXT:    s_and_saveexec_b64 s[46:47], vcc
+; GCN-NEXT:    s_and_saveexec_b64 s[54:55], vcc
 ; GCN-NEXT:    s_cbranch_execz .LBB5_4
 ; GCN-NEXT:  ; %bb.1: ; %bb1
-; GCN-NEXT:    s_mov_b64 s[48:49], exec
+; GCN-NEXT:    s_mov_b64 s[64:65], exec
 ; GCN-NEXT:  .LBB5_2: ; =>This Inner Loop Header: Depth=1
 ; GCN-NEXT:    v_readfirstlane_b32 s16, v0
 ; GCN-NEXT:    v_readfirstlane_b32 s17, v1
 ; GCN-NEXT:    v_cmp_eq_u64_e32 vcc, s[16:17], v[0:1]
-; GCN-NEXT:    s_and_saveexec_b64 s[50:51], vcc
-; GCN-NEXT:    s_mov_b64 s[4:5], s[40:41]
+; GCN-NEXT:    s_and_saveexec_b64 s[66:67], vcc
+; GCN-NEXT:    s_mov_b64 s[4:5], s[48:49]
 ; GCN-NEXT:    s_mov_b64 s[6:7], s[38:39]
 ; GCN-NEXT:    s_mov_b64 s[8:9], s[36:37]
 ; GCN-NEXT:    s_mov_b64 s[10:11], s[34:35]
-; GCN-NEXT:    s_mov_b32 s12, s45
-; GCN-NEXT:    s_mov_b32 s13, s44
-; GCN-NEXT:    s_mov_b32 s14, s43
-; GCN-NEXT:    s_mov_b32 s15, s42
+; GCN-NEXT:    s_mov_b32 s12, s53
+; GCN-NEXT:    s_mov_b32 s13, s52
+; GCN-NEXT:    s_mov_b32 s14, s51
+; GCN-NEXT:    s_mov_b32 s15, s50
 ; GCN-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GCN-NEXT:    ; implicit-def: $vgpr0_vgpr1
 ; GCN-NEXT:    ; implicit-def: $vgpr31
-; GCN-NEXT:    s_xor_b64 exec, exec, s[50:51]
+; GCN-NEXT:    s_xor_b64 exec, exec, s[66:67]
 ; GCN-NEXT:    s_cbranch_execnz .LBB5_2
 ; GCN-NEXT:  ; %bb.3:
-; GCN-NEXT:    s_mov_b64 exec, s[48:49]
+; GCN-NEXT:    s_mov_b64 exec, s[64:65]
 ; GCN-NEXT:  .LBB5_4: ; %bb2
-; GCN-NEXT:    s_or_b64 exec, exec, s[46:47]
-; GCN-NEXT:    v_readlane_b32 s51, v40, 19
-; GCN-NEXT:    v_readlane_b32 s50, v40, 18
-; GCN-NEXT:    v_readlane_b32 s49, v40, 17
-; GCN-NEXT:    v_readlane_b32 s48, v40, 16
-; GCN-NEXT:    v_readlane_b32 s47, v40, 15
-; GCN-NEXT:    v_readlane_b32 s46, v40, 14
-; GCN-NEXT:    v_readlane_b32 s45, v40, 13
-; GCN-NEXT:    v_readlane_b32 s44, v40, 12
-; GCN-NEXT:    v_readlane_b32 s43, v40, 11
-; GCN-NEXT:    v_readlane_b32 s42, v40, 10
-; GCN-NEXT:    v_readlane_b32 s41, v40, 9
-; GCN-NEXT:    v_readlane_b32 s40, v40, 8
+; GCN-NEXT:    s_or_b64 exec, exec, s[54:55]
+; GCN-NEXT:    v_readlane_b32 s67, v40, 19
+; GCN-NEXT:    v_readlane_b32 s66, v40, 18
+; GCN-NEXT:    v_readlane_b32 s65, v40, 17
+; GCN-NEXT:    v_readlane_b32 s64, v40, 16
+; GCN-NEXT:    v_readlane_b32 s55, v40, 15
+; GCN-NEXT:    v_readlane_b32 s54, v40, 14
+; GCN-NEXT:    v_readlane_b32 s53, v40, 13
+; GCN-NEXT:    v_readlane_b32 s52, v40, 12
+; GCN-NEXT:    v_readlane_b32 s51, v40, 11
+; GCN-NEXT:    v_readlane_b32 s50, v40, 10
+; GCN-NEXT:    v_readlane_b32 s49, v40, 9
+; GCN-NEXT:    v_readlane_b32 s48, v40, 8
 ; GCN-NEXT:    v_readlane_b32 s39, v40, 7
 ; GCN-NEXT:    v_readlane_b32 s38, v40, 6
 ; GCN-NEXT:    v_readlane_b32 s37, v40, 5
@@ -756,66 +756,66 @@ define void @test_indirect_call_vgpr_ptr_in_branch(ptr %fptr, i1 %cond) {
 ; GISEL-NEXT:    v_writelane_b32 v40, s37, 5
 ; GISEL-NEXT:    v_writelane_b32 v40, s38, 6
 ; GISEL-NEXT:    v_writelane_b32 v40, s39, 7
-; GISEL-NEXT:    v_writelane_b32 v40, s40, 8
-; GISEL-NEXT:    v_writelane_b32 v40, s41, 9
-; GISEL-NEXT:    v_writelane_b32 v40, s42, 10
-; GISEL-NEXT:    v_writelane_b32 v40, s43, 11
-; GISEL-NEXT:    v_writelane_b32 v40, s44, 12
-; GISEL-NEXT:    v_writelane_b32 v40, s45, 13
-; GISEL-NEXT:    v_writelane_b32 v40, s46, 14
-; GISEL-NEXT:    v_writelane_b32 v40, s47, 15
-; GISEL-NEXT:    v_writelane_b32 v40, s48, 16
-; GISEL-NEXT:    v_writelane_b32 v40, s49, 17
-; GISEL-NEXT:    v_writelane_b32 v40, s50, 18
-; GISEL-NEXT:    v_writelane_b32 v40, s51, 19
-; GISEL-NEXT:    s_mov_b32 s42, s15
-; GISEL-NEXT:    s_mov_b32 s43, s14
-; GISEL-NEXT:    s_mov_b32 s44, s13
-; GISEL-NEXT:    s_mov_b32 s45, s12
+; GISEL-NEXT:    v_writelane_b32 v40, s48, 8
+; GISEL-NEXT:    v_writelane_b32 v40, s49, 9
+; GISEL-NEXT:    v_writelane_b32 v40, s50, 10
+; GISEL-NEXT:    v_writelane_b32 v40, s51, 11
+; GISEL-NEXT:    v_writelane_b32 v40, s52, 12
+; GISEL-NEXT:    v_writelane_b32 v40, s53, 13
+; GISEL-NEXT:    v_writelane_b32 v40, s54, 14
+; GISEL-NEXT:    v_writelane_b32 v40, s55, 15
+; GISEL-NEXT:    v_writelane_b32 v40, s64, 16
+; GISEL-NEXT:    v_writelane_b32 v40, s65, 17
+; GISEL-NEXT:    v_writelane_b32 v40, s66, 18
+; GISEL-NEXT:    v_writelane_b32 v40, s67, 19
+; GISEL-NEXT:    s_mov_b32 s50, s15
+; GISEL-NEXT:    s_mov_b32 s51, s14
+; GISEL-NEXT:    s_mov_b32 s52, s13
+; GISEL-NEXT:    s_mov_b32 s53, s12
 ; GISEL-NEXT:    s_mov_b64 s[34:35], s[10:11]
 ; GISEL-NEXT:    s_mov_b64 s[36:37], s[8:9]
 ; GISEL-NEXT:    s_mov_b64 s[38:39], s[6:7]
-; GISEL-NEXT:    s_mov_b64 s[40:41], s[4:5]
+; GISEL-NEXT:    s_mov_b64 s[48:49], s[4:5]
 ; GISEL-NEXT:    v_and_b32_e32 v2, 1, v2
 ; GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v2
-; GISEL-NEXT:    s_and_saveexec_b64 s[46:47], vcc
+; GISEL-NEXT:    s_and_saveexec_b64 s[54:55], vcc
 ; GISEL-NEXT:    s_cbranch_execz .LBB5_4
 ; GISEL-NEXT:  ; %bb.1: ; %bb1
-; GISEL-NEXT:    s_mov_b64 s[48:49], exec
+; GISEL-NEXT:    s_mov_b64 s[64:65], exec
 ; GISEL-NEXT:  .LBB5_2: ; =>This Inner Loop Header: Depth=1
 ; GISEL-NEXT:    v_readfirstlane_b32 s16, v0
 ; GISEL-NEXT:    v_readfirstlane_b32 s17, v1
 ; GISEL-NEXT:    v_cmp_eq_u64_e32 vcc, s[16:17], v[0:1]
-; GISEL-NEXT:    s_and_saveexec_b64 s[50:51], vcc
-; GISEL-NEXT:    s_mov_b64 s[4:5], s[40:41]
+; GISEL-NEXT:    s_and_saveexec_b64 s[66:67], vcc
+; GISEL-NEXT:    s_mov_b64 s[4:5], s[48:49]
 ; GISEL-NEXT:    s_mov_b64 s[6:7], s[38:39]
 ; GISEL-NEXT:    s_mov_b64 s[8:9], s[36:37]
 ; GISEL-NEXT:    s_mov_b64 s[10:11], s[34:35]
-; GISEL-NEXT:    s_mov_b32 s12, s45
-; GISEL-NEXT:    s_mov_b32 s13, s44
-; GISEL-NEXT:    s_mov_b32 s14, s43
-; GISEL-NEXT:    s_mov_b32 s15, s42
+; GISEL-NEXT:    s_mov_b32 s12, s53
+; GISEL-NEXT:    s_mov_b32 s13, s52
+; GISEL-NEXT:    s_mov_b32 s14, s51
+; GISEL-NEXT:    s_mov_b32 s15, s50
 ; GISEL-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GISEL-NEXT:    ; implicit-def: $vgpr0
 ; GISEL-NEXT:    ; implicit-def: $vgpr31
-; GISEL-NEXT:    s_xor_b64 exec, exec, s[50:51]
+; GISEL-NEXT:    s_xor_b64 exec, exec, s[66:67]
 ; GISEL-NEXT:    s_cbranch_execnz .LBB5_2
 ; GISEL-NEXT:  ; %bb.3:
-; GISEL-NEXT:    s_mov_b64 exec, s[48:49]
+; GISEL-NEXT:    s_mov_b64 exec, s[64:65]
 ; GISEL-NEXT:  .LBB5_4: ; %bb2
-; GISEL-NEXT:    s_or_b64 exec, exec, s[46:47]
-; GISEL-NEXT:    v_readlane_b32 s51, v40, 19
-; GISEL-NEXT:    v_readlane_b32 s50, v40, 18
-; GISEL-NEXT:    v_readlane_b32 s49, v40, 17
-; GISEL-NEXT:    v_readlane_b32 s48, v40, 16
-; GISEL-NEXT:    v_readlane_b32 s47, v40, 15
-; GISEL-NEXT:    v_readlane_b32 s46, v40, 14
-; GISEL-NEXT:    v_readlane_b32 s45, v40, 13
-; GISEL-NEXT:    v_readlane_b32 s44, v40, 12
-; GISEL-NEXT:    v_readlane_b32 s43, v40, 11
-; GISEL-NEXT:    v_readlane_b32 s42, v40, 10
-; GISEL-NEXT:    v_readlane_b32 s41, v40, 9
-; GISEL-NEXT:    v_readlane_b32 s40, v40, 8
+; GISEL-NEXT:    s_or_b64 exec, exec, s[54:55]
+; GISEL-NEXT:    v_readlane_b32 s67, v40, 19
+; GISEL-NEXT:    v_readlane_b32 s66, v40, 18
+; GISEL-NEXT:    v_readlane_b32 s65, v40, 17
+; GISEL-NEXT:    v_readlane_b32 s64, v40, 16
+; GISEL-NEXT:    v_readlane_b32 s55, v40, 15
+; GISEL-NEXT:    v_readlane_b32 s54, v40, 14
+; GISEL-NEXT:    v_readlane_b32 s53, v40, 13
+; GISEL-NEXT:    v_readlane_b32 s52, v40, 12
+; GISEL-NEXT:    v_readlane_b32 s51, v40, 11
+; GISEL-NEXT:    v_readlane_b32 s50, v40, 10
+; GISEL-NEXT:    v_readlane_b32 s49, v40, 9
+; GISEL-NEXT:    v_readlane_b32 s48, v40, 8
 ; GISEL-NEXT:    v_readlane_b32 s39, v40, 7
 ; GISEL-NEXT:    v_readlane_b32 s38, v40, 6
 ; GISEL-NEXT:    v_readlane_b32 s37, v40, 5
@@ -861,30 +861,14 @@ define void @test_indirect_call_vgpr_ptr_inreg_arg(ptr %fptr) {
 ; GCN-NEXT:    v_writelane_b32 v40, s37, 5
 ; GCN-NEXT:    v_writelane_b32 v40, s38, 6
 ; GCN-NEXT:    v_writelane_b32 v40, s39, 7
-; GCN-NEXT:    v_writelane_b32 v40, s40, 8
-; GCN-NEXT:    v_writelane_b32 v40, s41, 9
-; GCN-NEXT:    v_writelane_b32 v40, s42, 10
-; GCN-NEXT:    v_writelane_b32 v40, s43, 11
-; GCN-NEXT:    v_writelane_b32 v40, s44, 12
-; GCN-NEXT:    v_writelane_b32 v40, s45, 13
-; GCN-NEXT:    v_writelane_b32 v40, s46, 14
-; GCN-NEXT:    v_writelane_b32 v40, s47, 15
-; GCN-NEXT:    v_writelane_b32 v40, s48, 16
-; GCN-NEXT:    v_writelane_b32 v40, s49, 17
-; GCN-NEXT:    v_writelane_b32 v40, s50, 18
-; GCN-NEXT:    v_writelane_b32 v40, s51, 19
-; GCN-NEXT:    v_writelane_b32 v40, s52, 20
-; GCN-NEXT:    v_writelane_b32 v40, s53, 21
-; GCN-NEXT:    v_writelane_b32 v40, s54, 22
-; GCN-NEXT:    v_writelane_b32 v40, s55, 23
-; GCN-NEXT:    v_writelane_b32 v40, s56, 24
-; GCN-NEXT:    v_writelane_b32 v40, s57, 25
-; GCN-NEXT:    v_writelane_b32 v40, s58, 26
-; GCN-NEXT:    v_writelane_b32 v40, s59, 27
-; GCN-NEXT:    v_writelane_b32 v40, s60, 28
-; GCN-NEXT:    v_writelane_b32 v40, s61, 29
-; GCN-NEXT:    v_writelane_b32 v40, s62, 30
-; GCN-NEXT:    v_writelane_b32 v40, s63, 31
+; GCN-NEXT:    v_writelane_b32 v40, s48, 8
+; GCN-NEXT:    v_writelane_b32 v40, s49, 9
+; GCN-NEXT:    v_writelane_b32 v40, s50, 10
+; GCN-NEXT:    v_writelane_b32 v40, s51, 11
+; GCN-NEXT:    v_writelane_b32 v40, s52, 12
+; GCN-NEXT:    v_writelane_b32 v40, s53, 13
+; GCN-NEXT:    v_writelane_b32 v40, s54, 14
+; GCN-NEXT:    v_writelane_b32 v40, s55, 15
 ; GCN-NEXT:    s_mov_b64 s[6:7], exec
 ; GCN-NEXT:  .LBB6_1: ; =>This Inner Loop Header: Depth=1
 ; GCN-NEXT:    v_readfirstlane_b32 s8, v0
@@ -898,30 +882,14 @@ define void @test_indirect_call_vgpr_ptr_inreg_arg(ptr %fptr) {
 ; GCN-NEXT:    s_cbranch_execnz .LBB6_1
 ; GCN-NEXT:  ; %bb.2:
 ; GCN-NEXT:    s_mov_b64 exec, s[6:7]
-; GCN-NEXT:    v_readlane_b32 s63, v40, 31
-; GCN-NEXT:    v_readlane_b32 s62, v40, 30
-; GCN-NEXT:    v_readlane_b32 s61, v40, 29
-; GCN-NEXT:    v_readlane_b32 s60, v40, 28
-; GCN-NEXT:    v_readlane_b32 s59, v40, 27
-; GCN-NEXT:    v_readlane_b32 s58, v40, 26
-; GCN-NEXT:    v_readlane_b32 s57, v40, 25
-; GCN-NEXT:    v_readlane_b32 s56, v40, 24
-; GCN-NEXT:    v_readlane_b32 s55, v40, 23
-; GCN-NEXT:    v_readlane_b32 s54, v40, 22
-; GCN-NEXT:    v_readlane_b32 s53, v40, 21
-; GCN-NEXT:    v_readlane_b32 s52, v40, 20
-; GCN-NEXT:    v_readlane_b32 s51, v40, 19
-; GCN-NEXT:    v_readlane_b32 s50, v40, 18
-; GCN-NEXT:    v_readlane_b32 s49, v40, 17
-; GCN-NEXT:    v_readlane_b32 s48, v40, 16
-; GCN-NEXT:    v_readlane_b32 s47, v40, 15
-; GCN-NEXT:    v_readlane_b32 s46, v40, 14
-; GCN-NEXT:    v_readlane_b32 s45, v40, 13
-; GCN-NEXT:    v_readlane_b32 s44, v40, 12
-; GCN-NEXT:    v_readlane_b32 s43, v40, 11
-; GCN-NEXT:    v_readlane_b32 s42, v40, 10
-; GCN-NEXT:    v_readlane_b32 s41, v40, 9
-; GCN-NEXT:    v_readlane_b32 s40, v40, 8
+; GCN-NEXT:    v_readlane_b32 s55, v40, 15
+; GCN-NEXT:    v_readlane_b32 s54, v40, 14
+; GCN-NEXT:    v_readlane_b32 s53, v40, 13
+; GCN-NEXT:    v_readlane_b32 s52, v40, 12
+; GCN-NEXT:    v_readlane_b32 s51, v40, 11
+; GCN-NEXT:    v_readlane_b32 s50, v40, 10
+; GCN-NEXT:    v_readlane_b32 s49, v40, 9
+; GCN-NEXT:    v_readlane_b32 s48, v40, 8
 ; GCN-NEXT:    v_readlane_b32 s39, v40, 7
 ; GCN-NEXT:    v_readlane_b32 s38, v40, 6
 ; GCN-NEXT:    v_readlane_b32 s37, v40, 5
@@ -955,30 +923,14 @@ define void @test_indirect_call_vgpr_ptr_inreg_arg(ptr %fptr) {
 ; GISEL-NEXT:    v_writelane_b32 v40, s37, 5
 ; GISEL-NEXT:    v_writelane_b32 v40, s38, 6
 ; GISEL-NEXT:    v_writelane_b32 v40, s39, 7
-; GISEL-NEXT:    v_writelane_b32 v40, s40, 8
-; GISEL-NEXT:    v_writelane_b32 v40, s41, 9
-; GISEL-NEXT:    v_writelane_b32 v40, s42, 10
-; GISEL-NEXT:    v_writelane_b32 v40, s43, 11
-; GISEL-NEXT:    v_writelane_b32 v40, s44, 12
-; GISEL-NEXT:    v_writelane_b32 v40, s45, 13
-; GISEL-NEXT:    v_writelane_b32 v40, s46, 14
-; GISEL-NEXT:    v_writelane_b32 v40, s47, 15
-; GISEL-NEXT:    v_writelane_b32 v40, s48, 16
-; GISEL-NEXT:    v_writelane_b32 v40, s49, 17
-; GISEL-NEXT:    v_writelane_b32 v40, s50, 18
-; GISEL-NEXT:    v_writelane_b32 v40, s51, 19
-; GISEL-NEXT:    v_writelane_b32 v40, s52, 20
-; GISEL-NEXT:    v_writelane_b32 v40, s53, 21
-; GISEL-NEXT:    v_writelane_b32 v40, s54, 22
-; GISEL-NEXT:    v_writelane_b32 v40, s55, 23
-; GISEL-NEXT:    v_writelane_b32 v40, s56, 24
-; GISEL-NEXT:    v_writelane_b32 v40, s57, 25
-; GISEL-NEXT:    v_writelane_b32 v40, s58, 26
-; GISEL-NEXT:    v_writelane_b32 v40, s59, 27
-; GISEL-NEXT:    v_writelane_b32 v40, s60, 28
-; GISEL-NEXT:    v_writelane_b32 v40, s61, 29
-; GISEL-NEXT:    v_writelane_b32 v40, s62, 30
-; GISEL-NEXT:    v_writelane_b32 v40, s63, 31
+; GISEL-NEXT:    v_writelane_b32 v40, s48, 8
+; GISEL-NEXT:    v_writelane_b32 v40, s49, 9
+; GISEL-NEXT:    v_writelane_b32 v40, s50, 10
+; GISEL-NEXT:    v_writelane_b32 v40, s51, 11
+; GISEL-NEXT:    v_writelane_b32 v40, s52, 12
+; GISEL-NEXT:    v_writelane_b32 v40, s53, 13
+; GISEL-NEXT:    v_writelane_b32 v40, s54, 14
+; GISEL-NEXT:    v_writelane_b32 v40, s55, 15
 ; GISEL-NEXT:    s_mov_b64 s[6:7], exec
 ; GISEL-NEXT:  .LBB6_1: ; =>This Inner Loop Header: Depth=1
 ; GISEL-NEXT:    v_readfirstlane_b32 s8, v0
@@ -992,30 +944,14 @@ define void @test_indirect_call_vgpr_ptr_inreg_arg(ptr %fptr) {
 ; GISEL-NEXT:    s_cbranch_execnz .LBB6_1
 ; GISEL-NEXT:  ; %bb.2:
 ; GISEL-NEXT:    s_mov_b64 exec, s[6:7]
-; GISEL-NEXT:    v_readlane_b32 s63, v40, 31
-; GISEL-NEXT:    v_readlane_b32 s62, v40, 30
-; GISEL-NEXT:    v_readlane_b32 s61, v40, 29
-; GISEL-NEXT:    v_readlane_b32 s60, v40, 28
-; GISEL-NEXT:    v_readlane_b32 s59, v40, 27
-; GISEL-NEXT:    v_readlane_b32 s58, v40, 26
-; GISEL-NEXT:    v_readlane_b32 s57, v40, 25
-; GISEL-NEXT:    v_readlane_b32 s56, v40, 24
-; GISEL-NEXT:    v_readlane_b32 s55, v40, 23
-; GISEL-NEXT:    v_readlane_b32 s54, v40, 22
-; GISEL-NEXT:    v_readlane_b32 s53, v40, 21
-; GISEL-NEXT:    v_readlane_b32 s52, v40, 20
-; GISEL-NEXT:    v_readlane_b32 s51, v40, 19
-; GISEL-NEXT:    v_readlane_b32 s50, v40, 18
-; GISEL-NEXT:    v_readlane_b32 s49, v40, 17
-; GISEL-NEXT:    v_readlane_b32 s48, v40, 16
-; GISEL-NEXT:    v_readlane_b32 s47, v40, 15
-; GISEL-NEXT:    v_readlane_b32 s46, v40, 14
-; GISEL-NEXT:    v_readlane_b32 s45, v40, 13
-; GISEL-NEXT:    v_readlane_b32 s44, v40, 12
-; GISEL-NEXT:    v_readlane_b32 s43, v40, 11
-; GISEL-NEXT:    v_readlane_b32 s42, v40, 10
-; GISEL-NEXT:    v_readlane_b32 s41, v40, 9
-; GISEL-NEXT:    v_readlane_b32 s40, v40, 8
+; GISEL-NEXT:    v_readlane_b32 s55, v40, 15
+; GISEL-NEXT:    v_readlane_b32 s54, v40, 14
+; GISEL-NEXT:    v_readlane_b32 s53, v40, 13
+; GISEL-NEXT:    v_readlane_b32 s52, v40, 12
+; GISEL-NEXT:    v_readlane_b32 s51, v40, 11
+; GISEL-NEXT:    v_readlane_b32 s50, v40, 10
+; GISEL-NEXT:    v_readlane_b32 s49, v40, 9
+; GISEL-NEXT:    v_readlane_b32 s48, v40, 8
 ; GISEL-NEXT:    v_readlane_b32 s39, v40, 7
 ; GISEL-NEXT:    v_readlane_b32 s38, v40, 6
 ; GISEL-NEXT:    v_readlane_b32 s37, v40, 5
@@ -1054,30 +990,14 @@ define i32 @test_indirect_call_vgpr_ptr_arg_and_reuse(i32 %i, ptr %fptr) {
 ; GCN-NEXT:    v_writelane_b32 v41, s37, 5
 ; GCN-NEXT:    v_writelane_b32 v41, s38, 6
 ; GCN-NEXT:    v_writelane_b32 v41, s39, 7
-; GCN-NEXT:    v_writelane_b32 v41, s40, 8
-; GCN-NEXT:    v_writelane_b32 v41, s41, 9
-; GCN-NEXT:    v_writelane_b32 v41, s42, 10
-; GCN-NEXT:    v_writelane_b32 v41, s43, 11
-; GCN-NEXT:    v_writelane_b32 v41, s44, 12
-; GCN-NEXT:    v_writelane_b32 v41, s45, 13
-; GCN-NEXT:    v_writelane_b32 v41, s46, 14
-; GCN-NEXT:    v_writelane_b32 v41, s47, 15
-; GCN-NEXT:    v_writelane_b32 v41, s48, 16
-; GCN-NEXT:    v_writelane_b32 v41, s49, 17
-; GCN-NEXT:    v_writelane_b32 v41, s50, 18
-; GCN-NEXT:    v_writelane_b32 v41, s51, 19
-; GCN-NEXT:    v_writelane_b32 v41, s52, 20
-; GCN-NEXT:    v_writelane_b32 v41, s53, 21
-; GCN-NEXT:    v_writelane_b32 v41, s54, 22
-; GCN-NEXT:    v_writelane_b32 v41, s55, 23
-; GCN-NEXT:    v_writelane_b32 v41, s56, 24
-; GCN-NEXT:    v_writelane_b32 v41, s57, 25
-; GCN-NEXT:    v_writelane_b32 v41, s58, 26
-; GCN-NEXT:    v_writelane_b32 v41, s59, 27
-; GCN-NEXT:    v_writelane_b32 v41, s60, 28
-; GCN-NEXT:    v_writelane_b32 v41, s61, 29
-; GCN-NEXT:    v_writelane_b32 v41, s62, 30
-; GCN-NEXT:    v_writelane_b32 v41, s63, 31
+; GCN-NEXT:    v_writelane_b32 v41, s48, 8
+; GCN-NEXT:    v_writelane_b32 v41, s49, 9
+; GCN-NEXT:    v_writelane_b32 v41, s50, 10
+; GCN-NEXT:    v_writelane_b32 v41, s51, 11
+; GCN-NEXT:    v_writelane_b32 v41, s52, 12
+; GCN-NEXT:    v_writelane_b32 v41, s53, 13
+; GCN-NEXT:    v_writelane_b32 v41, s54, 14
+; GCN-NEXT:    v_writelane_b32 v41, s55, 15
 ; GCN-NEXT:    v_mov_b32_e32 v40, v0
 ; GCN-NEXT:    s_mov_b64 s[4:5], exec
 ; GCN-NEXT:  .LBB7_1: ; =>This Inner Loop Header: Depth=1
@@ -1093,30 +1013,14 @@ define i32 @test_indirect_call_vgpr_ptr_arg_and_reuse(i32 %i, ptr %fptr) {
 ; GCN-NEXT:  ; %bb.2:
 ; GCN-NEXT:    s_mov_b64 exec, s[4:5]
 ; GCN-NEXT:    v_mov_b32_e32 v0, v40
-; GCN-NEXT:    v_readlane_b32 s63, v41, 31
-; GCN-NEXT:    v_readlane_b32 s62, v41, 30
-; GCN-NEXT:    v_readlane_b32 s61, v41, 29
-; GCN-NEXT:    v_readlane_b32 s60, v41, 28
-; GCN-NEXT:    v_readlane_b32 s59, v41, 27
-; GCN-NEXT:    v_readlane_b32 s58, v41, 26
-; GCN-NEXT:    v_readlane_b32 s57, v41, 25
-; GCN-NEXT:    v_readlane_b32 s56, v41, 24
-; GCN-NEXT:    v_readlane_b32 s55, v41, 23
-; GCN-NEXT:    v_readlane_b32 s54, v41, 22
-; GCN-NEXT:    v_readlane_b32 s53, v41, 21
-; GCN-NEXT:    v_readlane_b32 s52, v41, 20
-; GCN-NEXT:    v_readlane_b32 s51, v41, 19
-; GCN-NEXT:    v_readlane_b32 s50, v41, 18
-; GCN-NEXT:    v_readlane_b32 s49, v41, 17
-; GCN-NEXT:    v_readlane_b32 s48, v41, 16
-; GCN-NEXT:    v_readlane_b32 s47, v41, 15
-; GCN-NEXT:    v_readlane_b32 s46, v41, 14
-; GCN-NEXT:    v_readlane_b32 s45, v41, 13
-; GCN-NEXT:    v_readlane_b32 s44, v41, 12
-; GCN-NEXT:    v_readlane_b32 s43, v41, 11
-; GCN-NEXT:    v_readlane_b32 s42, v41, 10
-; GCN-NEXT:    v_readlane_b32 s41, v41, 9
-; GCN-NEXT:    v_readlane_b32 s40, v41, 8
+; GCN-NEXT:    v_readlane_b32 s55, v41, 15
+; GCN-NEXT:    v_readlane_b32 s54, v41, 14
+; GCN-NEXT:    v_readlane_b32 s53, v41, 13
+; GCN-NEXT:    v_readlane_b32 s52, v41, 12
+; GCN-NEXT:    v_readlane_b32 s51, v41, 11
+; GCN-NEXT:    v_readlane_b32 s50, v41, 10
+; GCN-NEXT:    v_readlane_b32 s49, v41, 9
+; GCN-NEXT:    v_readlane_b32 s48, v41, 8
 ; GCN-NEXT:    v_readlane_b32 s39, v41, 7
 ; GCN-NEXT:    v_readlane_b32 s38, v41, 6
 ; GCN-NEXT:    v_readlane_b32 s37, v41, 5
@@ -1152,30 +1056,14 @@ define i32 @test_indirect_call_vgpr_ptr_arg_and_reuse(i32 %i, ptr %fptr) {
 ; GISEL-NEXT:    v_writelane_b32 v41, s37, 5
 ; GISEL-NEXT:    v_writelane_b32 v41, s38, 6
 ; GISEL-NEXT:    v_writelane_b32 v41, s39, 7
-; GISEL-NEXT:    v_writelane_b32 v41, s40, 8
-; GISEL-NEXT:    v_writelane_b32 v41, s41, 9
-; GISEL-NEXT:    v_writelane_b32 v41, s42, 10
-; GISEL-NEXT:    v_writelane_b32 v41, s43, 11
-; GISEL-NEXT:    v_writelane_b32 v41, s44, 12
-; GISEL-NEXT:    v_writelane_b32 v41, s45, 13
-; GISEL-NEXT:    v_writelane_b32 v41, s46, 14
-; GISEL-NEXT:    v_writelane_b32 v41, s47, 15
-; GISEL-NEXT:    v_writelane_b32 v41, s48, 16
-; GISEL-NEXT:    v_writelane_b32 v41, s49, 17
-; GISEL-NEXT:    v_writelane_b32 v41, s50, 18
-; GISEL-NEXT:    v_writelane_b32 v41, s51, 19
-; GISEL-NEXT:    v_writelane_b32 v41, s52, 20
-; GISEL-NEXT:    v_writelane_b32 v41, s53, 21
-; GISEL-NEXT:    v_writelane_b32 v41, s54, 22
-; GISEL-NEXT:    v_writelane_b32 v41, s55, 23
-; GISEL-NEXT:    v_writelane_b32 v41, s56, 24
-; GISEL-NEXT:    v_writelane_b32 v41, s57, 25
-; GISEL-NEXT:    v_writelane_b32 v41, s58, 26
-; GISEL-NEXT:    v_writelane_b32 v41, s59, 27
-; GISEL-NEXT:    v_writelane_b32 v41, s60, 28
-; GISEL-NEXT:    v_writelane_b32 v41, s61, 29
-; GISEL-NEXT:    v_writelane_b32 v41, s62, 30
-; GISEL-NEXT:    v_writelane_b32 v41, s63, 31
+; GISEL-NEXT:    v_writelane_b32 v41, s48, 8
+; GISEL-NEXT:    v_writelane_b32 v41, s49, 9
+; GISEL-NEXT:    v_writelane_b32 v41, s50, 10
+; GISEL-NEXT:    v_writelane_b32 v41, s51, 11
+; GISEL-NEXT:    v_writelane_b32 v41, s52, 12
+; GISEL-NEXT:    v_writelane_b32 v41, s53, 13
+; GISEL-NEXT:    v_writelane_b32 v41, s54, 14
+; GISEL-NEXT:    v_writelane_b32 v41, s55, 15
 ; GISEL-NEXT:    v_mov_b32_e32 v40, v0
 ; GISEL-NEXT:    s_mov_b64 s[4:5], exec
 ; GISEL-NEXT:  .LBB7_1: ; =>This Inner Loop Header: Depth=1
@@ -1191,30 +1079,14 @@ define i32 @test_indirect_call_vgpr_ptr_arg_and_reuse(i32 %i, ptr %fptr) {
 ; GISEL-NEXT:  ; %bb.2:
 ; GISEL-NEXT:    s_mov_b64 exec, s[4:5]
 ; GISEL-NEXT:    v_mov_b32_e32 v0, v40
-; GISEL-NEXT:    v_readlane_b32 s63, v41, 31
-; GISEL-NEXT:    v_readlane_b32 s62, v41, 30
-; GISEL-NEXT:    v_readlane_b32 s61, v41, 29
-; GISEL-NEXT:    v_readlane_b32 s60, v41, 28
-; GISEL-NEXT:    v_readlane_b32 s59, v41, 27
-; GISEL-NEXT:    v_readlane_b32 s58, v41, 26
-; GISEL-NEXT:    v_readlane_b32 s57, v41, 25
-; GISEL-NEXT:    v_readlane_b32 s56, v41, 24
-; GISEL-NEXT:    v_readlane_b32 s55, v41, 23
-; GISEL-NEXT:    v_readlane_b32 s54, v41, 22
-; GISEL-NEXT:    v_readlane_b32 s53, v41, 21
-; GISEL-NEXT:    v_readlane_b32 s52, v41, 20
-; GISEL-NEXT:    v_readlane_b32 s51, v41, 19
-; GISEL-NEXT:    v_readlane_b32 s50, v41, 18
-; GISEL-NEXT:    v_readlane_b32 s49, v41, 17
-; GISEL-NEXT:    v_readlane_b32 s48, v41, 16
-; GISEL-NEXT:    v_readlane_b32 s47, v41, 15
-; GISEL-NEXT:    v_readlane_b32 s46, v41, 14
-; GISEL-NEXT:    v_readlane_b32 s45, v41, 13
-; GISEL-NEXT:    v_readlane_b32 s44, v41, 12
-; GISEL-NEXT:    v_readlane_b32 s43, v41, 11
-; GISEL-NEXT:    v_readlane_b32 s42, v41, 10
-; GISEL-NEXT:    v_readlane_b32 s41, v41, 9
-; GISEL-NEXT:    v_readlane_b32 s40, v41, 8
+; GISEL-NEXT:    v_readlane_b32 s55, v41, 15
+; GISEL-NEXT:    v_readlane_b32 s54, v41, 14
+; GISEL-NEXT:    v_readlane_b32 s53, v41, 13
+; GISEL-NEXT:    v_readlane_b32 s52, v41, 12
+; GISEL-NEXT:    v_readlane_b32 s51, v41, 11
+; GISEL-NEXT:    v_readlane_b32 s50, v41, 10
+; GISEL-NEXT:    v_readlane_b32 s49, v41, 9
+; GISEL-NEXT:    v_readlane_b32 s48, v41, 8
 ; GISEL-NEXT:    v_readlane_b32 s39, v41, 7
 ; GISEL-NEXT:    v_readlane_b32 s38, v41, 6
 ; GISEL-NEXT:    v_readlane_b32 s37, v41, 5
@@ -1257,30 +1129,14 @@ define i32 @test_indirect_call_vgpr_ptr_arg_and_return(i32 %i, ptr %fptr) {
 ; GCN-NEXT:    v_writelane_b32 v40, s37, 5
 ; GCN-NEXT:    v_writelane_b32 v40, s38, 6
 ; GCN-NEXT:    v_writelane_b32 v40, s39, 7
-; GCN-NEXT:    v_writelane_b32 v40, s40, 8
-; GCN-NEXT:    v_writelane_b32 v40, s41, 9
-; GCN-NEXT:    v_writelane_b32 v40, s42, 10
-; GCN-NEXT:    v_writelane_b32 v40, s43, 11
-; GCN-NEXT:    v_writelane_b32 v40, s44, 12
-; GCN-NEXT:    v_writelane_b32 v40, s45, 13
-; GCN-NEXT:    v_writelane_b32 v40, s46, 14
-; GCN-NEXT:    v_writelane_b32 v40, s47, 15
-; GCN-NEXT:    v_writelane_b32 v40, s48, 16
-; GCN-NEXT:    v_writelane_b32 v40, s49, 17
-; GCN-NEXT:    v_writelane_b32 v40, s50, 18
-; GCN-NEXT:    v_writelane_b32 v40, s51, 19
-; GCN-NEXT:    v_writelane_b32 v40, s52, 20
-; GCN-NEXT:    v_writelane_b32 v40, s53, 21
-; GCN-NEXT:    v_writelane_b32 v40, s54, 22
-; GCN-NEXT:    v_writelane_b32 v40, s55, 23
-; GCN-NEXT:    v_writelane_b32 v40, s56, 24
-; GCN-NEXT:    v_writelane_b32 v40, s57, 25
-; GCN-NEXT:    v_writelane_b32 v40, s58, 26
-; GCN-NEXT:    v_writelane_b32 v40, s59, 27
-; GCN-NEXT:    v_writelane_b32 v40, s60, 28
-; GCN-NEXT:    v_writelane_b32 v40, s61, 29
-; GCN-NEXT:    v_writelane_b32 v40, s62, 30
-; GCN-NEXT:    v_writelane_b32 v40, s63, 31
+; GCN-NEXT:    v_writelane_b32 v40, s48, 8
+; GCN-NEXT:    v_writelane_b32 v40, s49, 9
+; GCN-NEXT:    v_writelane_b32 v40, s50, 10
+; GCN-NEXT:    v_writelane_b32 v40, s51, 11
+; GCN-NEXT:    v_writelane_b32 v40, s52, 12
+; GCN-NEXT:    v_writelane_b32 v40, s53, 13
+; GCN-NEXT:    v_writelane_b32 v40, s54, 14
+; GCN-NEXT:    v_writelane_b32 v40, s55, 15
 ; GCN-NEXT:    s_mov_b64 s[4:5], exec
 ; GCN-NEXT:  .LBB8_1: ; =>This Inner Loop Header: Depth=1
 ; GCN-NEXT:    v_readfirstlane_b32 s8, v1
@@ -1296,30 +1152,14 @@ define i32 @test_indirect_call_vgpr_ptr_arg_and_return(i32 %i, ptr %fptr) {
 ; GCN-NEXT:  ; %bb.2:
 ; GCN-NEXT:    s_mov_b64 exec, s[4:5]
 ; GCN-NEXT:    v_mov_b32_e32 v0, v3
-; GCN-NEXT:    v_readlane_b32 s63, v40, 31
-; GCN-NEXT:    v_readlane_b32 s62, v40, 30
-; GCN-NEXT:    v_readlane_b32 s61, v40, 29
-; GCN-NEXT:    v_readlane_b32 s60, v40, 28
-; GCN-NEXT:    v_readlane_b32 s59, v40, 27
-; GCN-NEXT:    v_readlane_b32 s58, v40, 26
-; GCN-NEXT:    v_readlane_b32 s57, v40, 25
-; GCN-NEXT:    v_readlane_b32 s56, v40, 24
-; GCN-NEXT:    v_readlane_b32 s55, v40, 23
-; GCN-NEXT:    v_readlane_b32 s54, v40, 22
-; GCN-NEXT:    v_readlane_b32 s53, v40, 21
-; GCN-NEXT:    v_readlane_b32 s52, v40, 20
-; GCN-NEXT:    v_readlane_b32 s51, v40, 19
-; GCN-NEXT:    v_readlane_b32 s50, v40, 18
-; GCN-NEXT:    v_readlane_b32 s49, v40, 17
-; GCN-NEXT:    v_readlane_b32 s48, v40, 16
-; GCN-NEXT:    v_readlane_b32 s47, v40, 15
-; GCN-NEXT:    v_readlane_b32 s46, v40, 14
-; GCN-NEXT:    v_readlane_b32 s45, v40, 13
-; GCN-NEXT:    v_readlane_b32 s44, v40, 12
-; GCN-NEXT:    v_readlane_b32 s43, v40, 11
-; GCN-NEXT:    v_readlane_b32 s42, v40, 10
-; GCN-NEXT:    v_readlane_b32 s41, v40, 9
-; GCN-NEXT:    v_readlane_b32 s40, v40, 8
+; GCN-NEXT:    v_readlane_b32 s55, v40, 15
+; GCN-NEXT:    v_readlane_b32 s54, v40, 14
+; GCN-NEXT:    v_readlane_b32 s53, v40, 13
+; GCN-NEXT:    v_readlane_b32 s52, v40, 12
+; GCN-NEXT:    v_readlane_b32 s51, v40, 11
+; GCN-NEXT:    v_readlane_b32 s50, v40, 10
+; GCN-NEXT:    v_readlane_b32 s49, v40, 9
+; GCN-NEXT:    v_readlane_b32 s48, v40, 8
 ; GCN-NEXT:    v_readlane_b32 s39, v40, 7
 ; GCN-NEXT:    v_readlane_b32 s38, v40, 6
 ; GCN-NEXT:    v_readlane_b32 s37, v40, 5
@@ -1353,30 +1193,14 @@ define i32 @test_indirect_call_vgpr_ptr_arg_and_return(i32 %i, ptr %fptr) {
 ; GISEL-NEXT:    v_writelane_b32 v40, s37, 5
 ; GISEL-NEXT:    v_writelane_b32 v40, s38, 6
 ; GISEL-NEXT:    v_writelane_b32 v40, s39, 7
-; GISEL-NEXT:    v_writelane_b32 v40, s40, 8
-; GISEL-NEXT:    v_writelane_b32 v40, s41, 9
-; GISEL-NEXT:    v_writelane_b32 v40, s42, 10
-; GISEL-NEXT:    v_writelane_b32 v40, s43, 11
-; GISEL-NEXT:    v_writelane_b32 v40, s44, 12
-; GISEL-NEXT:    v_writelane_b32 v40, s45, 13
-; GISEL-NEXT:    v_writelane_b32 v40, s46, 14
-; GISEL-NEXT:    v_writelane_b32 v40, s47, 15
-; GISEL-NEXT:    v_writelane_b32 v40, s48, 16
-; GISEL-NEXT:    v_writelane_b32 v40, s49, 17
-; GISEL-NEXT:    v_writelane_b32 v40, s50, 18
-; GISEL-NEXT:    v_writelane_b32 v40, s51, 19
-; GISEL-NEXT:    v_writelane_b32 v40, s52, 20
-; GISEL-NEXT:    v_writelane_b32 v40, s53, 21
-; GISEL-NEXT:    v_writelane_b32 v40, s54, 22
-; GISEL-NEXT:    v_writelane_b32 v40, s55, 23
-; GISEL-NEXT:    v_writelane_b32 v40, s56, 24
-; GISEL-NEXT:    v_writelane_b32 v40, s57, 25
-; GISEL-NEXT:    v_writelane_b32 v40, s58, 26
-; GISEL-NEXT:    v_writelane_b32 v40, s59, 27
-; GISEL-NEXT:    v_writelane_b32 v40, s60, 28
-; GISEL-NEXT:    v_writelane_b32 v40, s61, 29
-; GISEL-NEXT:    v_writelane_b32 v40, s62, 30
-; GISEL-NEXT:    v_writelane_b32 v40, s63, 31
+; GISEL-NEXT:    v_writelane_b32 v40, s48, 8
+; GISEL-NEXT:    v_writelane_b32 v40, s49, 9
+; GISEL-NEXT:    v_writelane_b32 v40, s50, 10
+; GISEL-NEXT:    v_writelane_b32 v40, s51, 11
+; GISEL-NEXT:    v_writelane_b32 v40, s52, 12
+; GISEL-NEXT:    v_writelane_b32 v40, s53, 13
+; GISEL-NEXT:    v_writelane_b32 v40, s54, 14
+; GISEL-NEXT:    v_writelane_b32 v40, s55, 15
 ; GISEL-NEXT:    s_mov_b64 s[4:5], exec
 ; GISEL-NEXT:  .LBB8_1: ; =>This Inner Loop Header: Depth=1
 ; GISEL-NEXT:    v_readfirstlane_b32 s8, v1
@@ -1392,30 +1216,14 @@ define i32 @test_indirect_call_vgpr_ptr_arg_and_return(i32 %i, ptr %fptr) {
 ; GISEL-NEXT:  ; %bb.2:
 ; GISEL-NEXT:    s_mov_b64 exec, s[4:5]
 ; GISEL-NEXT:    v_mov_b32_e32 v0, v2
-; GISEL-NEXT:    v_readlane_b32 s63, v40, 31
-; GISEL-NEXT:    v_readlane_b32 s62, v40, 30
-; GISEL-NEXT:    v_readlane_b32 s61, v40, 29
-; GISEL-NEXT:    v_readlane_b32 s60, v40, 28
-; GISEL-NEXT:    v_readlane_b32 s59, v40, 27
-; GISEL-NEXT:    v_readlane_b32 s58, v40, 26
-; GISEL-NEXT:    v_readlane_b32 s57, v40, 25
-; GISEL-NEXT:    v_readlane_b32 s56, v40, 24
-; GISEL-NEXT:    v_readlane_b32 s55, v40, 23
-; GISEL-NEXT:    v_readlane_b32 s54, v40, 22
-; GISEL-NEXT:    v_readlane_b32 s53, v40, 21
-; GISEL-NEXT:    v_readlane_b32 s52, v40, 20
-; GISEL-NEXT:    v_readlane_b32 s51, v40, 19
-; GISEL-NEXT:    v_readlane_b32 s50, v40, 18
-; GISEL-NEXT:    v_readlane_b32 s49, v40, 17
-; GISEL-NEXT:    v_readlane_b32 s48, v40, 16
-; GISEL-NEXT:    v_readlane_b32 s47, v40, 15
-; GISEL-NEXT:    v_readlane_b32 s46, v40, 14
-; GISEL-NEXT:    v_readlane_b32 s45, v40, 13
-; GISEL-NEXT:    v_readlane_b32 s44, v40, 12
-; GISEL-NEXT:    v_readlane_b32 s43, v40, 11
-; GISEL-NEXT:    v_readlane_b32 s42, v40, 10
-; GISEL-NEXT:    v_readlane_b32 s41, v40, 9
-; GISEL-NEXT:    v_readlane_b32 s40, v40, 8
+; GISEL-NEXT:    v_readlane_b32 s55, v40, 15
+; GISEL-NEXT:    v_readlane_b32 s54, v40, 14
+; GISEL-NEXT:    v_readlane_b32 s53, v40, 13
+; GISEL-NEXT:    v_readlane_b32 s52, v40, 12
+; GISEL-NEXT:    v_readlane_b32 s51, v40, 11
+; GISEL-NEXT:    v_readlane_b32 s50, v40, 10
+; GISEL-NEXT:    v_readlane_b32 s49, v40, 9
+; GISEL-NEXT:    v_readlane_b32 s48, v40, 8
 ; GISEL-NEXT:    v_readlane_b32 s39, v40, 7
 ; GISEL-NEXT:    v_readlane_b32 s38, v40, 6
 ; GISEL-NEXT:    v_readlane_b32 s37, v40, 5
@@ -1454,30 +1262,14 @@ define void @test_indirect_tail_call_vgpr_ptr(ptr %fptr) {
 ; GCN-NEXT:    v_writelane_b32 v40, s37, 5
 ; GCN-NEXT:    v_writelane_b32 v40, s38, 6
 ; GCN-NEXT:    v_writelane_b32 v40, s39, 7
-; GCN-NEXT:    v_writelane_b32 v40, s40, 8
-; GCN-NEXT:    v_writelane_b32 v40, s41, 9
-; GCN-NEXT:    v_writelane_b32 v40, s42, 10
-; GCN-NEXT:    v_writelane_b32 v40, s43, 11
-; GCN-NEXT:    v_writelane_b32 v40, s44, 12
-; GCN-NEXT:    v_writelane_b32 v40, s45, 13
-; GCN-NEXT:    v_writelane_b32 v40, s46, 14
-; GCN-NEXT:    v_writelane_b32 v40, s47, 15
-; GCN-NEXT:    v_writelane_b32 v40, s48, 16
-; GCN-NEXT:    v_writelane_b32 v40, s49, 17
-; GCN-NEXT:    v_writelane_b32 v40, s50, 18
-; GCN-NEXT:    v_writelane_b32 v40, s51, 19
-; GCN-NEXT:    v_writelane_b32 v40, s52, 20
-; GCN-NEXT:    v_writelane_b32 v40, s53, 21
-; GCN-NEXT:    v_writelane_b32 v40, s54, 22
-; GCN-NEXT:    v_writelane_b32 v40, s55, 23
-; GCN-NEXT:    v_writelane_b32 v40, s56, 24
-; GCN-NEXT:    v_writelane_b32 v40, s57, 25
-; GCN-NEXT:    v_writelane_b32 v40, s58, 26
-; GCN-NEXT:    v_writelane_b32 v40, s59, 27
-; GCN-NEXT:    v_writelane_b32 v40, s60, 28
-; GCN-NEXT:    v_writelane_b32 v40, s61, 29
-; GCN-NEXT:    v_writelane_b32 v40, s62, 30
-; GCN-NEXT:    v_writelane_b32 v40, s63, 31
+; GCN-NEXT:    v_writelane_b32 v40, s48, 8
+; GCN-NEXT:    v_writelane_b32 v40, s49, 9
+; GCN-NEXT:    v_writelane_b32 v40, s50, 10
+; GCN-NEXT:    v_writelane_b32 v40, s51, 11
+; GCN-NEXT:    v_writelane_b32 v40, s52, 12
+; GCN-NEXT:    v_writelane_b32 v40, s53, 13
+; GCN-NEXT:    v_writelane_b32 v40, s54, 14
+; GCN-NEXT:    v_writelane_b32 v40, s55, 15
 ; GCN-NEXT:    s_mov_b64 s[4:5], exec
 ; GCN-NEXT:  .LBB9_1: ; =>This Inner Loop Header: Depth=1
 ; GCN-NEXT:    v_readfirstlane_b32 s6, v0
@@ -1490,30 +1282,14 @@ define void @test_indirect_tail_call_vgpr_ptr(ptr %fptr) {
 ; GCN-NEXT:    s_cbranch_execnz .LBB9_1
 ; GCN-NEXT:  ; %bb.2:
 ; GCN-NEXT:    s_mov_b64 exec, s[4:5]
-; GCN-NEXT:    v_readlane_b32 s63, v40, 31
-; GCN-NEXT:    v_readlane_b32 s62, v40, 30
-; GCN-NEXT:    v_readlane_b32 s61, v40, 29
-; GCN-NEXT:    v_readlane_b32 s60, v40, 28
-; GCN-NEXT:    v_readlane_b32 s59, v40, 27
-; GCN-NEXT:    v_readlane_b32 s58, v40, 26
-; GCN-NEXT:    v_readlane_b32 s57, v40, 25
-; GCN-NEXT:    v_readlane_b32 s56, v40, 24
-; GCN-NEXT:    v_readlane_b32 s55, v40, 23
-; GCN-NEXT:    v_readlane_b32 s54, v40, 22
-; GCN-NEXT:    v_readlane_b32 s53, v40, 21
-; GCN-NEXT:    v_readlane_b32 s52, v40, 20
-; GCN-NEXT:    v_readlane_b32 s51, v40, 19
-; GCN-NEXT:    v_readlane_b32 s50, v40, 18
-; GCN-NEXT:    v_readlane_b32 s49, v40, 17
-; GCN-NEXT:    v_readlane_b32 s48, v40, 16
-; GCN-NEXT:    v_readlane_b32 s47, v40, 15
-; GCN-NEXT:    v_readlane_b32 s46, v40, 14
-; GCN-NEXT:    v_readlane_b32 s45, v40, 13
-; GCN-NEXT:    v_readlane_b32 s44, v40, 12
-; GCN-NEXT:    v_readlane_b32 s43, v40, 11
-; GCN-NEXT:    v_readlane_b32 s42, v40, 10
-; GCN-NEXT:    v_readlane_b32 s41, v40, 9
-; GCN-NEXT:    v_readlane_b32 s40, v40, 8
+; GCN-NEXT:    v_readlane_b32 s55, v40, 15
+; GCN-NEXT:    v_readlane_b32 s54, v40, 14
+; GCN-NEXT:    v_readlane_b32 s53, v40, 13
+; GCN-NEXT:    v_readlane_b32 s52, v40, 12
+; GCN-NEXT:    v_readlane_b32 s51, v40, 11
+; GCN-NEXT:    v_readlane_b32 s50, v40, 10
+; GCN-NEXT:    v_readlane_b32 s49, v40, 9
+; GCN-NEXT:    v_readlane_b32 s48, v40, 8
 ; GCN-NEXT:    v_readlane_b32 s39, v40, 7
 ; GCN-NEXT:    v_readlane_b32 s38, v40, 6
 ; GCN-NEXT:    v_readlane_b32 s37, v40, 5
@@ -1547,30 +1323,14 @@ define void @test_indirect_tail_call_vgpr_ptr(ptr %fptr) {
 ; GISEL-NEXT:    v_writelane_b32 v40, s37, 5
 ; GISEL-NEXT:    v_writelane_b32 v40, s38, 6
 ; GISEL-NEXT:    v_writelane_b32 v40, s39, 7
-; GISEL-NEXT:    v_writelane_b32 v40, s40, 8
-; GISEL-NEXT:    v_writelane_b32 v40, s41, 9
-; GISEL-NEXT:    v_writelane_b32 v40, s42, 10
-; GISEL-NEXT:    v_writelane_b32 v40, s43, 11
-; GISEL-NEXT:    v_writelane_b32 v40, s44, 12
-; GISEL-NEXT:    v_writelane_b32 v40, s45, 13
-; GISEL-NEXT:    v_writelane_b32 v40, s46, 14
-; GISEL-NEXT:    v_writelane_b32 v40, s47, 15
-; GISEL-NEXT:    v_writelane_b32 v40, s48, 16
-; GISEL-NEXT:    v_writelane_b32 v40, s49, 17
-; GISEL-NEXT:    v_writelane_b32 v40, s50, 18
-; GISEL-NEXT:    v_writelane_b32 v40, s51, 19
-; GISEL-NEXT:    v_writelane_b32 v40, s52, 20
-; GISEL-NEXT:    v_writelane_b32 v40, s53, 21
-; GISEL-NEXT:    v_writelane_b32 v40, s54, 22
-; GISEL-NEXT:    v_writelane_b32 v40, s55, 23
-; GISEL-NEXT:    v_writelane_b32 v40, s56, 24
-; GISEL-NEXT:    v_writelane_b32 v40, s57, 25
-; GISEL-NEXT:    v_writelane_b32 v40, s58, 26
-; GISEL-NEXT:    v_writelane_b32 v40, s59, 27
-; GISEL-NEXT:    v_writelane_b32 v40, s60, 28
-; GISEL-NEXT:    v_writelane_b32 v40, s61, 29
-; GISEL-NEXT:    v_writelane_b32 v40, s62, 30
-; GISEL-NEXT:    v_writelane_b32 v40, s63, 31
+; GISEL-NEXT:    v_writelane_b32 v40, s48, 8
+; GISEL-NEXT:    v_writelane_b32 v40, s49, 9
+; GISEL-NEXT:    v_writelane_b32 v40, s50, 10
+; GISEL-NEXT:    v_writelane_b32 v40, s51, 11
+; GISEL-NEXT:    v_writelane_b32 v40, s52, 12
+; GISEL-NEXT:    v_writelane_b32 v40, s53, 13
+; GISEL-NEXT:    v_writelane_b32 v40, s54, 14
+; GISEL-NEXT:    v_writelane_b32 v40, s55, 15
 ; GISEL-NEXT:    s_mov_b64 s[4:5], exec
 ; GISEL-NEXT:  .LBB9_1: ; =>This Inner Loop Header: Depth=1
 ; GISEL-NEXT:    v_readfirstlane_b32 s6, v0
@@ -1583,30 +1343,14 @@ define void @test_indirect_tail_call_vgpr_ptr(ptr %fptr) {
 ; GISEL-NEXT:    s_cbranch_execnz .LBB9_1
 ; GISEL-NEXT:  ; %bb.2:
 ; GISEL-NEXT:    s_mov_b64 exec, s[4:5]
-; GISEL-NEXT:    v_readlane_b32 s63, v40, 31
-; GISEL-NEXT:    v_readlane_b32 s62, v40, 30
-; GISEL-NEXT:    v_readlane_b32 s61, v40, 29
-; GISEL-NEXT:    v_readlane_b32 s60, v40, 28
-; GISEL-NEXT:    v_readlane_b32 s59, v40, 27
-; GISEL-NEXT:    v_readlane_b32 s58, v40, 26
-; GISEL-NEXT:    v_readlane_b32 s57, v40, 25
-; GISEL-NEXT:    v_readlane_b32 s56, v40, 24
-; GISEL-NEXT:    v_readlane_b32 s55, v40, 23
-; GISEL-NEXT:    v_readlane_b32 s54, v40, 22
-; GISEL-NEXT:    v_readlane_b32 s53, v40, 21
-; GISEL-NEXT:    v_readlane_b32 s52, v40, 20
-; GISEL-NEXT:    v_readlane_b32 s51, v40, 19
-; GISEL-NEXT:    v_readlane_b32 s50, v40, 18
-; GISEL-NEXT:    v_readlane_b32 s49, v40, 17
-; GISEL-NEXT:    v_readlane_b32 s48, v40, 16
-; GISEL-NEXT:    v_readlane_b32 s47, v40, 15
-; GISEL-NEXT:    v_readlane_b32 s46, v40, 14
-; GISEL-NEXT:    v_readlane_b32 s45, v40, 13
-; GISEL-NEXT:    v_readlane_b32 s44, v40, 12
-; GISEL-NEXT:    v_readlane_b32 s43, v40, 11
-; GISEL-NEXT:    v_readlane_b32 s42, v40, 10
-; GISEL-NEXT:    v_readlane_b32 s41, v40, 9
-; GISEL-NEXT:    v_readlane_b32 s40, v40, 8
+; GISEL-NEXT:    v_readlane_b32 s55, v40, 15
+; GISEL-NEXT:    v_readlane_b32 s54, v40, 14
+; GISEL-NEXT:    v_readlane_b32 s53, v40, 13
+; GISEL-NEXT:    v_readlane_b32 s52, v40, 12
+; GISEL-NEXT:    v_readlane_b32 s51, v40, 11
+; GISEL-NEXT:    v_readlane_b32 s50, v40, 10
+; GISEL-NEXT:    v_readlane_b32 s49, v40, 9
+; GISEL-NEXT:    v_readlane_b32 s48, v40, 8
 ; GISEL-NEXT:    v_readlane_b32 s39, v40, 7
 ; GISEL-NEXT:    v_readlane_b32 s38, v40, 6
 ; GISEL-NEXT:    v_readlane_b32 s37, v40, 5

diff  --git a/llvm/test/CodeGen/AMDGPU/issue48473.mir b/llvm/test/CodeGen/AMDGPU/issue48473.mir
index dd73e65de7cb6..654461a62fa9f 100644
--- a/llvm/test/CodeGen/AMDGPU/issue48473.mir
+++ b/llvm/test/CodeGen/AMDGPU/issue48473.mir
@@ -43,7 +43,7 @@
 # %25 to $sgpr60_sgpr61_sgpr62_sgpr63_sgpr64_sgpr65_sgpr66_sgpr67
 
 # CHECK-LABEL: name: issue48473
-# CHECK: S_NOP 0, implicit killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, implicit killed renamable $sgpr12_sgpr13_sgpr14_sgpr15, implicit killed renamable $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23, implicit killed renamable $sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31, implicit killed renamable $sgpr84_sgpr85_sgpr86_sgpr87, implicit killed renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43, implicit killed renamable $sgpr4_sgpr5_sgpr6_sgpr7, implicit killed renamable $sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51, implicit killed renamable $sgpr88_sgpr89_sgpr90_sgpr91, implicit killed renamable $sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83, implicit undef $sgpr0_sgpr1_sgpr2_sgpr3, implicit killed renamable $sgpr52_sgpr53_sgpr54_sgpr55_sgpr56_sgpr57_sgpr58_sgpr59, implicit killed renamable $sgpr92_sgpr93_sgpr94_sgpr95, implicit killed renamable $sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75, implicit renamable $sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75, implicit killed renamable $sgpr96_sgpr97_sgpr98_sgpr99, implicit killed renamable $sgpr8_sgpr9_sgpr10_sgpr11, implicit killed renamable $sgpr60_sgpr61_sgpr62_sgpr63_sgpr64_sgpr65_sgpr66_sgpr67
+# CHECK: S_NOP 0, implicit killed renamable $sgpr20_sgpr21_sgpr22_sgpr23, implicit killed renamable $sgpr88_sgpr89_sgpr90_sgpr91, implicit killed renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, implicit killed renamable $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15, implicit killed renamable $sgpr40_sgpr41_sgpr42_sgpr43, implicit killed renamable $sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63, implicit killed renamable $sgpr36_sgpr37_sgpr38_sgpr39, implicit killed renamable $sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79, implicit killed renamable $sgpr80_sgpr81_sgpr82_sgpr83, implicit killed renamable $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71, implicit killed renamable $sgpr16_sgpr17_sgpr18_sgpr19, implicit killed renamable $sgpr92_sgpr93_sgpr94_sgpr95_sgpr96_sgpr97_sgpr98_sgpr99, implicit killed renamable $sgpr84_sgpr85_sgpr86_sgpr87, implicit killed renamable $sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55, implicit renamable $sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55, implicit undef $sgpr0_sgpr1_sgpr2_sgpr3, implicit killed renamable $sgpr44_sgpr45_sgpr46_sgpr47, implicit killed renamable $sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31
 
 ---
 name:            issue48473

diff  --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.ll
index a3bd0aabd5c3f..55fa02a0c582c 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.ll
@@ -815,50 +815,27 @@ define void @test_readfirstlane_v32f32(ptr addrspace(1) %out, <32 x float> %src)
 ; CHECK-SDAG-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; CHECK-SDAG-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
 ; CHECK-SDAG-NEXT:    s_mov_b64 exec, s[4:5]
-; CHECK-SDAG-NEXT:    v_writelane_b32 v31, s36, 0
-; CHECK-SDAG-NEXT:    v_writelane_b32 v31, s37, 1
-; CHECK-SDAG-NEXT:    v_writelane_b32 v31, s38, 2
-; CHECK-SDAG-NEXT:    v_writelane_b32 v31, s39, 3
-; CHECK-SDAG-NEXT:    v_writelane_b32 v31, s40, 4
-; CHECK-SDAG-NEXT:    v_writelane_b32 v31, s41, 5
-; CHECK-SDAG-NEXT:    v_writelane_b32 v31, s42, 6
-; CHECK-SDAG-NEXT:    v_writelane_b32 v31, s43, 7
-; CHECK-SDAG-NEXT:    v_writelane_b32 v31, s44, 8
-; CHECK-SDAG-NEXT:    v_writelane_b32 v31, s45, 9
-; CHECK-SDAG-NEXT:    v_writelane_b32 v31, s46, 10
-; CHECK-SDAG-NEXT:    v_writelane_b32 v31, s47, 11
-; CHECK-SDAG-NEXT:    v_writelane_b32 v31, s48, 12
-; CHECK-SDAG-NEXT:    v_writelane_b32 v31, s49, 13
-; CHECK-SDAG-NEXT:    v_writelane_b32 v31, s50, 14
-; CHECK-SDAG-NEXT:    v_writelane_b32 v31, s51, 15
-; CHECK-SDAG-NEXT:    v_writelane_b32 v31, s52, 16
-; CHECK-SDAG-NEXT:    v_writelane_b32 v31, s53, 17
-; CHECK-SDAG-NEXT:    v_writelane_b32 v31, s54, 18
-; CHECK-SDAG-NEXT:    v_writelane_b32 v31, s55, 19
-; CHECK-SDAG-NEXT:    v_writelane_b32 v31, s56, 20
-; CHECK-SDAG-NEXT:    v_writelane_b32 v31, s57, 21
-; CHECK-SDAG-NEXT:    v_writelane_b32 v31, s58, 22
-; CHECK-SDAG-NEXT:    v_writelane_b32 v31, s59, 23
-; CHECK-SDAG-NEXT:    v_writelane_b32 v31, s60, 24
-; CHECK-SDAG-NEXT:    v_writelane_b32 v31, s61, 25
 ; CHECK-SDAG-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:4
 ; CHECK-SDAG-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:8
 ; CHECK-SDAG-NEXT:    v_readfirstlane_b32 s61, v27
 ; CHECK-SDAG-NEXT:    buffer_load_dword v27, off, s[0:3], s32
-; CHECK-SDAG-NEXT:    v_writelane_b32 v31, s62, 26
-; CHECK-SDAG-NEXT:    v_writelane_b32 v31, s63, 27
-; CHECK-SDAG-NEXT:    v_writelane_b32 v31, s64, 28
-; CHECK-SDAG-NEXT:    v_writelane_b32 v31, s65, 29
-; CHECK-SDAG-NEXT:    v_writelane_b32 v31, s66, 30
-; CHECK-SDAG-NEXT:    v_writelane_b32 v31, s67, 31
+; CHECK-SDAG-NEXT:    v_writelane_b32 v31, s36, 0
+; CHECK-SDAG-NEXT:    v_writelane_b32 v31, s37, 1
+; CHECK-SDAG-NEXT:    v_writelane_b32 v31, s38, 2
+; CHECK-SDAG-NEXT:    v_writelane_b32 v31, s39, 3
+; CHECK-SDAG-NEXT:    v_writelane_b32 v31, s48, 4
+; CHECK-SDAG-NEXT:    v_writelane_b32 v31, s49, 5
+; CHECK-SDAG-NEXT:    v_writelane_b32 v31, s50, 6
+; CHECK-SDAG-NEXT:    v_writelane_b32 v31, s51, 7
+; CHECK-SDAG-NEXT:    v_writelane_b32 v31, s52, 8
+; CHECK-SDAG-NEXT:    v_writelane_b32 v31, s53, 9
+; CHECK-SDAG-NEXT:    v_writelane_b32 v31, s54, 10
+; CHECK-SDAG-NEXT:    v_writelane_b32 v31, s55, 11
+; CHECK-SDAG-NEXT:    v_writelane_b32 v31, s64, 12
+; CHECK-SDAG-NEXT:    v_writelane_b32 v31, s65, 13
+; CHECK-SDAG-NEXT:    v_writelane_b32 v31, s66, 14
+; CHECK-SDAG-NEXT:    v_writelane_b32 v31, s67, 15
 ; CHECK-SDAG-NEXT:    v_readfirstlane_b32 s64, v30
-; CHECK-SDAG-NEXT:    v_readfirstlane_b32 s63, v29
-; CHECK-SDAG-NEXT:    v_readfirstlane_b32 s62, v28
-; CHECK-SDAG-NEXT:    v_readfirstlane_b32 s60, v26
-; CHECK-SDAG-NEXT:    v_readfirstlane_b32 s59, v25
-; CHECK-SDAG-NEXT:    v_readfirstlane_b32 s58, v24
-; CHECK-SDAG-NEXT:    v_readfirstlane_b32 s57, v23
-; CHECK-SDAG-NEXT:    v_readfirstlane_b32 s56, v22
 ; CHECK-SDAG-NEXT:    v_readfirstlane_b32 s55, v21
 ; CHECK-SDAG-NEXT:    v_readfirstlane_b32 s54, v20
 ; CHECK-SDAG-NEXT:    v_readfirstlane_b32 s53, v19
@@ -867,6 +844,17 @@ define void @test_readfirstlane_v32f32(ptr addrspace(1) %out, <32 x float> %src)
 ; CHECK-SDAG-NEXT:    v_readfirstlane_b32 s50, v16
 ; CHECK-SDAG-NEXT:    v_readfirstlane_b32 s49, v15
 ; CHECK-SDAG-NEXT:    v_readfirstlane_b32 s48, v14
+; CHECK-SDAG-NEXT:    v_readfirstlane_b32 s39, v5
+; CHECK-SDAG-NEXT:    v_readfirstlane_b32 s38, v4
+; CHECK-SDAG-NEXT:    v_readfirstlane_b32 s37, v3
+; CHECK-SDAG-NEXT:    v_readfirstlane_b32 s36, v2
+; CHECK-SDAG-NEXT:    v_readfirstlane_b32 s63, v29
+; CHECK-SDAG-NEXT:    v_readfirstlane_b32 s62, v28
+; CHECK-SDAG-NEXT:    v_readfirstlane_b32 s60, v26
+; CHECK-SDAG-NEXT:    v_readfirstlane_b32 s59, v25
+; CHECK-SDAG-NEXT:    v_readfirstlane_b32 s58, v24
+; CHECK-SDAG-NEXT:    v_readfirstlane_b32 s57, v23
+; CHECK-SDAG-NEXT:    v_readfirstlane_b32 s56, v22
 ; CHECK-SDAG-NEXT:    v_readfirstlane_b32 s47, v13
 ; CHECK-SDAG-NEXT:    v_readfirstlane_b32 s46, v12
 ; CHECK-SDAG-NEXT:    v_readfirstlane_b32 s45, v11
@@ -875,10 +863,6 @@ define void @test_readfirstlane_v32f32(ptr addrspace(1) %out, <32 x float> %src)
 ; CHECK-SDAG-NEXT:    v_readfirstlane_b32 s42, v8
 ; CHECK-SDAG-NEXT:    v_readfirstlane_b32 s41, v7
 ; CHECK-SDAG-NEXT:    v_readfirstlane_b32 s40, v6
-; CHECK-SDAG-NEXT:    v_readfirstlane_b32 s39, v5
-; CHECK-SDAG-NEXT:    v_readfirstlane_b32 s38, v4
-; CHECK-SDAG-NEXT:    v_readfirstlane_b32 s37, v3
-; CHECK-SDAG-NEXT:    v_readfirstlane_b32 s36, v2
 ; CHECK-SDAG-NEXT:    s_waitcnt vmcnt(2)
 ; CHECK-SDAG-NEXT:    v_readfirstlane_b32 s66, v0
 ; CHECK-SDAG-NEXT:    s_waitcnt vmcnt(1)
@@ -888,34 +872,18 @@ define void @test_readfirstlane_v32f32(ptr addrspace(1) %out, <32 x float> %src)
 ; CHECK-SDAG-NEXT:    ;;#ASMSTART
 ; CHECK-SDAG-NEXT:    ; use s[36:67]
 ; CHECK-SDAG-NEXT:    ;;#ASMEND
-; CHECK-SDAG-NEXT:    v_readlane_b32 s67, v31, 31
-; CHECK-SDAG-NEXT:    v_readlane_b32 s66, v31, 30
-; CHECK-SDAG-NEXT:    v_readlane_b32 s65, v31, 29
-; CHECK-SDAG-NEXT:    v_readlane_b32 s64, v31, 28
-; CHECK-SDAG-NEXT:    v_readlane_b32 s63, v31, 27
-; CHECK-SDAG-NEXT:    v_readlane_b32 s62, v31, 26
-; CHECK-SDAG-NEXT:    v_readlane_b32 s61, v31, 25
-; CHECK-SDAG-NEXT:    v_readlane_b32 s60, v31, 24
-; CHECK-SDAG-NEXT:    v_readlane_b32 s59, v31, 23
-; CHECK-SDAG-NEXT:    v_readlane_b32 s58, v31, 22
-; CHECK-SDAG-NEXT:    v_readlane_b32 s57, v31, 21
-; CHECK-SDAG-NEXT:    v_readlane_b32 s56, v31, 20
-; CHECK-SDAG-NEXT:    v_readlane_b32 s55, v31, 19
-; CHECK-SDAG-NEXT:    v_readlane_b32 s54, v31, 18
-; CHECK-SDAG-NEXT:    v_readlane_b32 s53, v31, 17
-; CHECK-SDAG-NEXT:    v_readlane_b32 s52, v31, 16
-; CHECK-SDAG-NEXT:    v_readlane_b32 s51, v31, 15
-; CHECK-SDAG-NEXT:    v_readlane_b32 s50, v31, 14
-; CHECK-SDAG-NEXT:    v_readlane_b32 s49, v31, 13
-; CHECK-SDAG-NEXT:    v_readlane_b32 s48, v31, 12
-; CHECK-SDAG-NEXT:    v_readlane_b32 s47, v31, 11
-; CHECK-SDAG-NEXT:    v_readlane_b32 s46, v31, 10
-; CHECK-SDAG-NEXT:    v_readlane_b32 s45, v31, 9
-; CHECK-SDAG-NEXT:    v_readlane_b32 s44, v31, 8
-; CHECK-SDAG-NEXT:    v_readlane_b32 s43, v31, 7
-; CHECK-SDAG-NEXT:    v_readlane_b32 s42, v31, 6
-; CHECK-SDAG-NEXT:    v_readlane_b32 s41, v31, 5
-; CHECK-SDAG-NEXT:    v_readlane_b32 s40, v31, 4
+; CHECK-SDAG-NEXT:    v_readlane_b32 s67, v31, 15
+; CHECK-SDAG-NEXT:    v_readlane_b32 s66, v31, 14
+; CHECK-SDAG-NEXT:    v_readlane_b32 s65, v31, 13
+; CHECK-SDAG-NEXT:    v_readlane_b32 s64, v31, 12
+; CHECK-SDAG-NEXT:    v_readlane_b32 s55, v31, 11
+; CHECK-SDAG-NEXT:    v_readlane_b32 s54, v31, 10
+; CHECK-SDAG-NEXT:    v_readlane_b32 s53, v31, 9
+; CHECK-SDAG-NEXT:    v_readlane_b32 s52, v31, 8
+; CHECK-SDAG-NEXT:    v_readlane_b32 s51, v31, 7
+; CHECK-SDAG-NEXT:    v_readlane_b32 s50, v31, 6
+; CHECK-SDAG-NEXT:    v_readlane_b32 s49, v31, 5
+; CHECK-SDAG-NEXT:    v_readlane_b32 s48, v31, 4
 ; CHECK-SDAG-NEXT:    v_readlane_b32 s39, v31, 3
 ; CHECK-SDAG-NEXT:    v_readlane_b32 s38, v31, 2
 ; CHECK-SDAG-NEXT:    v_readlane_b32 s37, v31, 1
@@ -940,45 +908,21 @@ define void @test_readfirstlane_v32f32(ptr addrspace(1) %out, <32 x float> %src)
 ; CHECK-GISEL-NEXT:    v_writelane_b32 v31, s37, 1
 ; CHECK-GISEL-NEXT:    v_writelane_b32 v31, s38, 2
 ; CHECK-GISEL-NEXT:    v_writelane_b32 v31, s39, 3
-; CHECK-GISEL-NEXT:    v_writelane_b32 v31, s40, 4
-; CHECK-GISEL-NEXT:    v_writelane_b32 v31, s41, 5
-; CHECK-GISEL-NEXT:    v_writelane_b32 v31, s42, 6
-; CHECK-GISEL-NEXT:    v_writelane_b32 v31, s43, 7
-; CHECK-GISEL-NEXT:    v_writelane_b32 v31, s44, 8
-; CHECK-GISEL-NEXT:    v_writelane_b32 v31, s45, 9
-; CHECK-GISEL-NEXT:    v_writelane_b32 v31, s46, 10
-; CHECK-GISEL-NEXT:    v_writelane_b32 v31, s47, 11
-; CHECK-GISEL-NEXT:    v_writelane_b32 v31, s48, 12
-; CHECK-GISEL-NEXT:    v_writelane_b32 v31, s49, 13
-; CHECK-GISEL-NEXT:    v_writelane_b32 v31, s50, 14
-; CHECK-GISEL-NEXT:    v_writelane_b32 v31, s51, 15
-; CHECK-GISEL-NEXT:    v_writelane_b32 v31, s52, 16
-; CHECK-GISEL-NEXT:    v_writelane_b32 v31, s53, 17
-; CHECK-GISEL-NEXT:    v_writelane_b32 v31, s54, 18
-; CHECK-GISEL-NEXT:    v_writelane_b32 v31, s55, 19
-; CHECK-GISEL-NEXT:    v_writelane_b32 v31, s56, 20
-; CHECK-GISEL-NEXT:    v_writelane_b32 v31, s57, 21
-; CHECK-GISEL-NEXT:    v_writelane_b32 v31, s58, 22
-; CHECK-GISEL-NEXT:    v_writelane_b32 v31, s59, 23
-; CHECK-GISEL-NEXT:    v_writelane_b32 v31, s60, 24
-; CHECK-GISEL-NEXT:    v_writelane_b32 v31, s61, 25
-; CHECK-GISEL-NEXT:    v_writelane_b32 v31, s62, 26
-; CHECK-GISEL-NEXT:    v_writelane_b32 v31, s63, 27
-; CHECK-GISEL-NEXT:    v_writelane_b32 v31, s64, 28
-; CHECK-GISEL-NEXT:    v_writelane_b32 v31, s65, 29
-; CHECK-GISEL-NEXT:    v_writelane_b32 v31, s66, 30
-; CHECK-GISEL-NEXT:    v_writelane_b32 v31, s67, 31
+; CHECK-GISEL-NEXT:    v_writelane_b32 v31, s48, 4
+; CHECK-GISEL-NEXT:    v_writelane_b32 v31, s49, 5
+; CHECK-GISEL-NEXT:    v_writelane_b32 v31, s50, 6
+; CHECK-GISEL-NEXT:    v_writelane_b32 v31, s51, 7
+; CHECK-GISEL-NEXT:    v_writelane_b32 v31, s52, 8
+; CHECK-GISEL-NEXT:    v_writelane_b32 v31, s53, 9
+; CHECK-GISEL-NEXT:    v_writelane_b32 v31, s54, 10
+; CHECK-GISEL-NEXT:    v_writelane_b32 v31, s55, 11
+; CHECK-GISEL-NEXT:    v_writelane_b32 v31, s64, 12
+; CHECK-GISEL-NEXT:    v_writelane_b32 v31, s65, 13
+; CHECK-GISEL-NEXT:    v_writelane_b32 v31, s66, 14
+; CHECK-GISEL-NEXT:    v_writelane_b32 v31, s67, 15
 ; CHECK-GISEL-NEXT:    v_readfirstlane_b32 s37, v3
 ; CHECK-GISEL-NEXT:    v_readfirstlane_b32 s38, v4
 ; CHECK-GISEL-NEXT:    v_readfirstlane_b32 s39, v5
-; CHECK-GISEL-NEXT:    v_readfirstlane_b32 s40, v6
-; CHECK-GISEL-NEXT:    v_readfirstlane_b32 s41, v7
-; CHECK-GISEL-NEXT:    v_readfirstlane_b32 s42, v8
-; CHECK-GISEL-NEXT:    v_readfirstlane_b32 s43, v9
-; CHECK-GISEL-NEXT:    v_readfirstlane_b32 s44, v10
-; CHECK-GISEL-NEXT:    v_readfirstlane_b32 s45, v11
-; CHECK-GISEL-NEXT:    v_readfirstlane_b32 s46, v12
-; CHECK-GISEL-NEXT:    v_readfirstlane_b32 s47, v13
 ; CHECK-GISEL-NEXT:    v_readfirstlane_b32 s48, v14
 ; CHECK-GISEL-NEXT:    v_readfirstlane_b32 s49, v15
 ; CHECK-GISEL-NEXT:    v_readfirstlane_b32 s50, v16
@@ -987,6 +931,15 @@ define void @test_readfirstlane_v32f32(ptr addrspace(1) %out, <32 x float> %src)
 ; CHECK-GISEL-NEXT:    v_readfirstlane_b32 s53, v19
 ; CHECK-GISEL-NEXT:    v_readfirstlane_b32 s54, v20
 ; CHECK-GISEL-NEXT:    v_readfirstlane_b32 s55, v21
+; CHECK-GISEL-NEXT:    v_readfirstlane_b32 s64, v30
+; CHECK-GISEL-NEXT:    v_readfirstlane_b32 s40, v6
+; CHECK-GISEL-NEXT:    v_readfirstlane_b32 s41, v7
+; CHECK-GISEL-NEXT:    v_readfirstlane_b32 s42, v8
+; CHECK-GISEL-NEXT:    v_readfirstlane_b32 s43, v9
+; CHECK-GISEL-NEXT:    v_readfirstlane_b32 s44, v10
+; CHECK-GISEL-NEXT:    v_readfirstlane_b32 s45, v11
+; CHECK-GISEL-NEXT:    v_readfirstlane_b32 s46, v12
+; CHECK-GISEL-NEXT:    v_readfirstlane_b32 s47, v13
 ; CHECK-GISEL-NEXT:    v_readfirstlane_b32 s56, v22
 ; CHECK-GISEL-NEXT:    v_readfirstlane_b32 s57, v23
 ; CHECK-GISEL-NEXT:    v_readfirstlane_b32 s58, v24
@@ -995,7 +948,6 @@ define void @test_readfirstlane_v32f32(ptr addrspace(1) %out, <32 x float> %src)
 ; CHECK-GISEL-NEXT:    v_readfirstlane_b32 s61, v27
 ; CHECK-GISEL-NEXT:    v_readfirstlane_b32 s62, v28
 ; CHECK-GISEL-NEXT:    v_readfirstlane_b32 s63, v29
-; CHECK-GISEL-NEXT:    v_readfirstlane_b32 s64, v30
 ; CHECK-GISEL-NEXT:    s_waitcnt vmcnt(2)
 ; CHECK-GISEL-NEXT:    v_readfirstlane_b32 s65, v0
 ; CHECK-GISEL-NEXT:    s_waitcnt vmcnt(1)
@@ -1005,34 +957,18 @@ define void @test_readfirstlane_v32f32(ptr addrspace(1) %out, <32 x float> %src)
 ; CHECK-GISEL-NEXT:    ;;#ASMSTART
 ; CHECK-GISEL-NEXT:    ; use s[36:67]
 ; CHECK-GISEL-NEXT:    ;;#ASMEND
-; CHECK-GISEL-NEXT:    v_readlane_b32 s67, v31, 31
-; CHECK-GISEL-NEXT:    v_readlane_b32 s66, v31, 30
-; CHECK-GISEL-NEXT:    v_readlane_b32 s65, v31, 29
-; CHECK-GISEL-NEXT:    v_readlane_b32 s64, v31, 28
-; CHECK-GISEL-NEXT:    v_readlane_b32 s63, v31, 27
-; CHECK-GISEL-NEXT:    v_readlane_b32 s62, v31, 26
-; CHECK-GISEL-NEXT:    v_readlane_b32 s61, v31, 25
-; CHECK-GISEL-NEXT:    v_readlane_b32 s60, v31, 24
-; CHECK-GISEL-NEXT:    v_readlane_b32 s59, v31, 23
-; CHECK-GISEL-NEXT:    v_readlane_b32 s58, v31, 22
-; CHECK-GISEL-NEXT:    v_readlane_b32 s57, v31, 21
-; CHECK-GISEL-NEXT:    v_readlane_b32 s56, v31, 20
-; CHECK-GISEL-NEXT:    v_readlane_b32 s55, v31, 19
-; CHECK-GISEL-NEXT:    v_readlane_b32 s54, v31, 18
-; CHECK-GISEL-NEXT:    v_readlane_b32 s53, v31, 17
-; CHECK-GISEL-NEXT:    v_readlane_b32 s52, v31, 16
-; CHECK-GISEL-NEXT:    v_readlane_b32 s51, v31, 15
-; CHECK-GISEL-NEXT:    v_readlane_b32 s50, v31, 14
-; CHECK-GISEL-NEXT:    v_readlane_b32 s49, v31, 13
-; CHECK-GISEL-NEXT:    v_readlane_b32 s48, v31, 12
-; CHECK-GISEL-NEXT:    v_readlane_b32 s47, v31, 11
-; CHECK-GISEL-NEXT:    v_readlane_b32 s46, v31, 10
-; CHECK-GISEL-NEXT:    v_readlane_b32 s45, v31, 9
-; CHECK-GISEL-NEXT:    v_readlane_b32 s44, v31, 8
-; CHECK-GISEL-NEXT:    v_readlane_b32 s43, v31, 7
-; CHECK-GISEL-NEXT:    v_readlane_b32 s42, v31, 6
-; CHECK-GISEL-NEXT:    v_readlane_b32 s41, v31, 5
-; CHECK-GISEL-NEXT:    v_readlane_b32 s40, v31, 4
+; CHECK-GISEL-NEXT:    v_readlane_b32 s67, v31, 15
+; CHECK-GISEL-NEXT:    v_readlane_b32 s66, v31, 14
+; CHECK-GISEL-NEXT:    v_readlane_b32 s65, v31, 13
+; CHECK-GISEL-NEXT:    v_readlane_b32 s64, v31, 12
+; CHECK-GISEL-NEXT:    v_readlane_b32 s55, v31, 11
+; CHECK-GISEL-NEXT:    v_readlane_b32 s54, v31, 10
+; CHECK-GISEL-NEXT:    v_readlane_b32 s53, v31, 9
+; CHECK-GISEL-NEXT:    v_readlane_b32 s52, v31, 8
+; CHECK-GISEL-NEXT:    v_readlane_b32 s51, v31, 7
+; CHECK-GISEL-NEXT:    v_readlane_b32 s50, v31, 6
+; CHECK-GISEL-NEXT:    v_readlane_b32 s49, v31, 5
+; CHECK-GISEL-NEXT:    v_readlane_b32 s48, v31, 4
 ; CHECK-GISEL-NEXT:    v_readlane_b32 s39, v31, 3
 ; CHECK-GISEL-NEXT:    v_readlane_b32 s38, v31, 2
 ; CHECK-GISEL-NEXT:    v_readlane_b32 s37, v31, 1
@@ -1324,50 +1260,27 @@ define void @test_readfirstlane_v32i32(ptr addrspace(1) %out, <32 x i32> %src) {
 ; CHECK-SDAG-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; CHECK-SDAG-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
 ; CHECK-SDAG-NEXT:    s_mov_b64 exec, s[4:5]
-; CHECK-SDAG-NEXT:    v_writelane_b32 v31, s36, 0
-; CHECK-SDAG-NEXT:    v_writelane_b32 v31, s37, 1
-; CHECK-SDAG-NEXT:    v_writelane_b32 v31, s38, 2
-; CHECK-SDAG-NEXT:    v_writelane_b32 v31, s39, 3
-; CHECK-SDAG-NEXT:    v_writelane_b32 v31, s40, 4
-; CHECK-SDAG-NEXT:    v_writelane_b32 v31, s41, 5
-; CHECK-SDAG-NEXT:    v_writelane_b32 v31, s42, 6
-; CHECK-SDAG-NEXT:    v_writelane_b32 v31, s43, 7
-; CHECK-SDAG-NEXT:    v_writelane_b32 v31, s44, 8
-; CHECK-SDAG-NEXT:    v_writelane_b32 v31, s45, 9
-; CHECK-SDAG-NEXT:    v_writelane_b32 v31, s46, 10
-; CHECK-SDAG-NEXT:    v_writelane_b32 v31, s47, 11
-; CHECK-SDAG-NEXT:    v_writelane_b32 v31, s48, 12
-; CHECK-SDAG-NEXT:    v_writelane_b32 v31, s49, 13
-; CHECK-SDAG-NEXT:    v_writelane_b32 v31, s50, 14
-; CHECK-SDAG-NEXT:    v_writelane_b32 v31, s51, 15
-; CHECK-SDAG-NEXT:    v_writelane_b32 v31, s52, 16
-; CHECK-SDAG-NEXT:    v_writelane_b32 v31, s53, 17
-; CHECK-SDAG-NEXT:    v_writelane_b32 v31, s54, 18
-; CHECK-SDAG-NEXT:    v_writelane_b32 v31, s55, 19
-; CHECK-SDAG-NEXT:    v_writelane_b32 v31, s56, 20
-; CHECK-SDAG-NEXT:    v_writelane_b32 v31, s57, 21
-; CHECK-SDAG-NEXT:    v_writelane_b32 v31, s58, 22
-; CHECK-SDAG-NEXT:    v_writelane_b32 v31, s59, 23
-; CHECK-SDAG-NEXT:    v_writelane_b32 v31, s60, 24
-; CHECK-SDAG-NEXT:    v_writelane_b32 v31, s61, 25
 ; CHECK-SDAG-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:4
 ; CHECK-SDAG-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:8
 ; CHECK-SDAG-NEXT:    v_readfirstlane_b32 s61, v27
 ; CHECK-SDAG-NEXT:    buffer_load_dword v27, off, s[0:3], s32
-; CHECK-SDAG-NEXT:    v_writelane_b32 v31, s62, 26
-; CHECK-SDAG-NEXT:    v_writelane_b32 v31, s63, 27
-; CHECK-SDAG-NEXT:    v_writelane_b32 v31, s64, 28
-; CHECK-SDAG-NEXT:    v_writelane_b32 v31, s65, 29
-; CHECK-SDAG-NEXT:    v_writelane_b32 v31, s66, 30
-; CHECK-SDAG-NEXT:    v_writelane_b32 v31, s67, 31
+; CHECK-SDAG-NEXT:    v_writelane_b32 v31, s36, 0
+; CHECK-SDAG-NEXT:    v_writelane_b32 v31, s37, 1
+; CHECK-SDAG-NEXT:    v_writelane_b32 v31, s38, 2
+; CHECK-SDAG-NEXT:    v_writelane_b32 v31, s39, 3
+; CHECK-SDAG-NEXT:    v_writelane_b32 v31, s48, 4
+; CHECK-SDAG-NEXT:    v_writelane_b32 v31, s49, 5
+; CHECK-SDAG-NEXT:    v_writelane_b32 v31, s50, 6
+; CHECK-SDAG-NEXT:    v_writelane_b32 v31, s51, 7
+; CHECK-SDAG-NEXT:    v_writelane_b32 v31, s52, 8
+; CHECK-SDAG-NEXT:    v_writelane_b32 v31, s53, 9
+; CHECK-SDAG-NEXT:    v_writelane_b32 v31, s54, 10
+; CHECK-SDAG-NEXT:    v_writelane_b32 v31, s55, 11
+; CHECK-SDAG-NEXT:    v_writelane_b32 v31, s64, 12
+; CHECK-SDAG-NEXT:    v_writelane_b32 v31, s65, 13
+; CHECK-SDAG-NEXT:    v_writelane_b32 v31, s66, 14
+; CHECK-SDAG-NEXT:    v_writelane_b32 v31, s67, 15
 ; CHECK-SDAG-NEXT:    v_readfirstlane_b32 s64, v30
-; CHECK-SDAG-NEXT:    v_readfirstlane_b32 s63, v29
-; CHECK-SDAG-NEXT:    v_readfirstlane_b32 s62, v28
-; CHECK-SDAG-NEXT:    v_readfirstlane_b32 s60, v26
-; CHECK-SDAG-NEXT:    v_readfirstlane_b32 s59, v25
-; CHECK-SDAG-NEXT:    v_readfirstlane_b32 s58, v24
-; CHECK-SDAG-NEXT:    v_readfirstlane_b32 s57, v23
-; CHECK-SDAG-NEXT:    v_readfirstlane_b32 s56, v22
 ; CHECK-SDAG-NEXT:    v_readfirstlane_b32 s55, v21
 ; CHECK-SDAG-NEXT:    v_readfirstlane_b32 s54, v20
 ; CHECK-SDAG-NEXT:    v_readfirstlane_b32 s53, v19
@@ -1376,6 +1289,17 @@ define void @test_readfirstlane_v32i32(ptr addrspace(1) %out, <32 x i32> %src) {
 ; CHECK-SDAG-NEXT:    v_readfirstlane_b32 s50, v16
 ; CHECK-SDAG-NEXT:    v_readfirstlane_b32 s49, v15
 ; CHECK-SDAG-NEXT:    v_readfirstlane_b32 s48, v14
+; CHECK-SDAG-NEXT:    v_readfirstlane_b32 s39, v5
+; CHECK-SDAG-NEXT:    v_readfirstlane_b32 s38, v4
+; CHECK-SDAG-NEXT:    v_readfirstlane_b32 s37, v3
+; CHECK-SDAG-NEXT:    v_readfirstlane_b32 s36, v2
+; CHECK-SDAG-NEXT:    v_readfirstlane_b32 s63, v29
+; CHECK-SDAG-NEXT:    v_readfirstlane_b32 s62, v28
+; CHECK-SDAG-NEXT:    v_readfirstlane_b32 s60, v26
+; CHECK-SDAG-NEXT:    v_readfirstlane_b32 s59, v25
+; CHECK-SDAG-NEXT:    v_readfirstlane_b32 s58, v24
+; CHECK-SDAG-NEXT:    v_readfirstlane_b32 s57, v23
+; CHECK-SDAG-NEXT:    v_readfirstlane_b32 s56, v22
 ; CHECK-SDAG-NEXT:    v_readfirstlane_b32 s47, v13
 ; CHECK-SDAG-NEXT:    v_readfirstlane_b32 s46, v12
 ; CHECK-SDAG-NEXT:    v_readfirstlane_b32 s45, v11
@@ -1384,10 +1308,6 @@ define void @test_readfirstlane_v32i32(ptr addrspace(1) %out, <32 x i32> %src) {
 ; CHECK-SDAG-NEXT:    v_readfirstlane_b32 s42, v8
 ; CHECK-SDAG-NEXT:    v_readfirstlane_b32 s41, v7
 ; CHECK-SDAG-NEXT:    v_readfirstlane_b32 s40, v6
-; CHECK-SDAG-NEXT:    v_readfirstlane_b32 s39, v5
-; CHECK-SDAG-NEXT:    v_readfirstlane_b32 s38, v4
-; CHECK-SDAG-NEXT:    v_readfirstlane_b32 s37, v3
-; CHECK-SDAG-NEXT:    v_readfirstlane_b32 s36, v2
 ; CHECK-SDAG-NEXT:    s_waitcnt vmcnt(2)
 ; CHECK-SDAG-NEXT:    v_readfirstlane_b32 s66, v0
 ; CHECK-SDAG-NEXT:    s_waitcnt vmcnt(1)
@@ -1397,34 +1317,18 @@ define void @test_readfirstlane_v32i32(ptr addrspace(1) %out, <32 x i32> %src) {
 ; CHECK-SDAG-NEXT:    ;;#ASMSTART
 ; CHECK-SDAG-NEXT:    ; use s[36:67]
 ; CHECK-SDAG-NEXT:    ;;#ASMEND
-; CHECK-SDAG-NEXT:    v_readlane_b32 s67, v31, 31
-; CHECK-SDAG-NEXT:    v_readlane_b32 s66, v31, 30
-; CHECK-SDAG-NEXT:    v_readlane_b32 s65, v31, 29
-; CHECK-SDAG-NEXT:    v_readlane_b32 s64, v31, 28
-; CHECK-SDAG-NEXT:    v_readlane_b32 s63, v31, 27
-; CHECK-SDAG-NEXT:    v_readlane_b32 s62, v31, 26
-; CHECK-SDAG-NEXT:    v_readlane_b32 s61, v31, 25
-; CHECK-SDAG-NEXT:    v_readlane_b32 s60, v31, 24
-; CHECK-SDAG-NEXT:    v_readlane_b32 s59, v31, 23
-; CHECK-SDAG-NEXT:    v_readlane_b32 s58, v31, 22
-; CHECK-SDAG-NEXT:    v_readlane_b32 s57, v31, 21
-; CHECK-SDAG-NEXT:    v_readlane_b32 s56, v31, 20
-; CHECK-SDAG-NEXT:    v_readlane_b32 s55, v31, 19
-; CHECK-SDAG-NEXT:    v_readlane_b32 s54, v31, 18
-; CHECK-SDAG-NEXT:    v_readlane_b32 s53, v31, 17
-; CHECK-SDAG-NEXT:    v_readlane_b32 s52, v31, 16
-; CHECK-SDAG-NEXT:    v_readlane_b32 s51, v31, 15
-; CHECK-SDAG-NEXT:    v_readlane_b32 s50, v31, 14
-; CHECK-SDAG-NEXT:    v_readlane_b32 s49, v31, 13
-; CHECK-SDAG-NEXT:    v_readlane_b32 s48, v31, 12
-; CHECK-SDAG-NEXT:    v_readlane_b32 s47, v31, 11
-; CHECK-SDAG-NEXT:    v_readlane_b32 s46, v31, 10
-; CHECK-SDAG-NEXT:    v_readlane_b32 s45, v31, 9
-; CHECK-SDAG-NEXT:    v_readlane_b32 s44, v31, 8
-; CHECK-SDAG-NEXT:    v_readlane_b32 s43, v31, 7
-; CHECK-SDAG-NEXT:    v_readlane_b32 s42, v31, 6
-; CHECK-SDAG-NEXT:    v_readlane_b32 s41, v31, 5
-; CHECK-SDAG-NEXT:    v_readlane_b32 s40, v31, 4
+; CHECK-SDAG-NEXT:    v_readlane_b32 s67, v31, 15
+; CHECK-SDAG-NEXT:    v_readlane_b32 s66, v31, 14
+; CHECK-SDAG-NEXT:    v_readlane_b32 s65, v31, 13
+; CHECK-SDAG-NEXT:    v_readlane_b32 s64, v31, 12
+; CHECK-SDAG-NEXT:    v_readlane_b32 s55, v31, 11
+; CHECK-SDAG-NEXT:    v_readlane_b32 s54, v31, 10
+; CHECK-SDAG-NEXT:    v_readlane_b32 s53, v31, 9
+; CHECK-SDAG-NEXT:    v_readlane_b32 s52, v31, 8
+; CHECK-SDAG-NEXT:    v_readlane_b32 s51, v31, 7
+; CHECK-SDAG-NEXT:    v_readlane_b32 s50, v31, 6
+; CHECK-SDAG-NEXT:    v_readlane_b32 s49, v31, 5
+; CHECK-SDAG-NEXT:    v_readlane_b32 s48, v31, 4
 ; CHECK-SDAG-NEXT:    v_readlane_b32 s39, v31, 3
 ; CHECK-SDAG-NEXT:    v_readlane_b32 s38, v31, 2
 ; CHECK-SDAG-NEXT:    v_readlane_b32 s37, v31, 1
@@ -1449,45 +1353,21 @@ define void @test_readfirstlane_v32i32(ptr addrspace(1) %out, <32 x i32> %src) {
 ; CHECK-GISEL-NEXT:    v_writelane_b32 v31, s37, 1
 ; CHECK-GISEL-NEXT:    v_writelane_b32 v31, s38, 2
 ; CHECK-GISEL-NEXT:    v_writelane_b32 v31, s39, 3
-; CHECK-GISEL-NEXT:    v_writelane_b32 v31, s40, 4
-; CHECK-GISEL-NEXT:    v_writelane_b32 v31, s41, 5
-; CHECK-GISEL-NEXT:    v_writelane_b32 v31, s42, 6
-; CHECK-GISEL-NEXT:    v_writelane_b32 v31, s43, 7
-; CHECK-GISEL-NEXT:    v_writelane_b32 v31, s44, 8
-; CHECK-GISEL-NEXT:    v_writelane_b32 v31, s45, 9
-; CHECK-GISEL-NEXT:    v_writelane_b32 v31, s46, 10
-; CHECK-GISEL-NEXT:    v_writelane_b32 v31, s47, 11
-; CHECK-GISEL-NEXT:    v_writelane_b32 v31, s48, 12
-; CHECK-GISEL-NEXT:    v_writelane_b32 v31, s49, 13
-; CHECK-GISEL-NEXT:    v_writelane_b32 v31, s50, 14
-; CHECK-GISEL-NEXT:    v_writelane_b32 v31, s51, 15
-; CHECK-GISEL-NEXT:    v_writelane_b32 v31, s52, 16
-; CHECK-GISEL-NEXT:    v_writelane_b32 v31, s53, 17
-; CHECK-GISEL-NEXT:    v_writelane_b32 v31, s54, 18
-; CHECK-GISEL-NEXT:    v_writelane_b32 v31, s55, 19
-; CHECK-GISEL-NEXT:    v_writelane_b32 v31, s56, 20
-; CHECK-GISEL-NEXT:    v_writelane_b32 v31, s57, 21
-; CHECK-GISEL-NEXT:    v_writelane_b32 v31, s58, 22
-; CHECK-GISEL-NEXT:    v_writelane_b32 v31, s59, 23
-; CHECK-GISEL-NEXT:    v_writelane_b32 v31, s60, 24
-; CHECK-GISEL-NEXT:    v_writelane_b32 v31, s61, 25
-; CHECK-GISEL-NEXT:    v_writelane_b32 v31, s62, 26
-; CHECK-GISEL-NEXT:    v_writelane_b32 v31, s63, 27
-; CHECK-GISEL-NEXT:    v_writelane_b32 v31, s64, 28
-; CHECK-GISEL-NEXT:    v_writelane_b32 v31, s65, 29
-; CHECK-GISEL-NEXT:    v_writelane_b32 v31, s66, 30
-; CHECK-GISEL-NEXT:    v_writelane_b32 v31, s67, 31
+; CHECK-GISEL-NEXT:    v_writelane_b32 v31, s48, 4
+; CHECK-GISEL-NEXT:    v_writelane_b32 v31, s49, 5
+; CHECK-GISEL-NEXT:    v_writelane_b32 v31, s50, 6
+; CHECK-GISEL-NEXT:    v_writelane_b32 v31, s51, 7
+; CHECK-GISEL-NEXT:    v_writelane_b32 v31, s52, 8
+; CHECK-GISEL-NEXT:    v_writelane_b32 v31, s53, 9
+; CHECK-GISEL-NEXT:    v_writelane_b32 v31, s54, 10
+; CHECK-GISEL-NEXT:    v_writelane_b32 v31, s55, 11
+; CHECK-GISEL-NEXT:    v_writelane_b32 v31, s64, 12
+; CHECK-GISEL-NEXT:    v_writelane_b32 v31, s65, 13
+; CHECK-GISEL-NEXT:    v_writelane_b32 v31, s66, 14
+; CHECK-GISEL-NEXT:    v_writelane_b32 v31, s67, 15
 ; CHECK-GISEL-NEXT:    v_readfirstlane_b32 s37, v3
 ; CHECK-GISEL-NEXT:    v_readfirstlane_b32 s38, v4
 ; CHECK-GISEL-NEXT:    v_readfirstlane_b32 s39, v5
-; CHECK-GISEL-NEXT:    v_readfirstlane_b32 s40, v6
-; CHECK-GISEL-NEXT:    v_readfirstlane_b32 s41, v7
-; CHECK-GISEL-NEXT:    v_readfirstlane_b32 s42, v8
-; CHECK-GISEL-NEXT:    v_readfirstlane_b32 s43, v9
-; CHECK-GISEL-NEXT:    v_readfirstlane_b32 s44, v10
-; CHECK-GISEL-NEXT:    v_readfirstlane_b32 s45, v11
-; CHECK-GISEL-NEXT:    v_readfirstlane_b32 s46, v12
-; CHECK-GISEL-NEXT:    v_readfirstlane_b32 s47, v13
 ; CHECK-GISEL-NEXT:    v_readfirstlane_b32 s48, v14
 ; CHECK-GISEL-NEXT:    v_readfirstlane_b32 s49, v15
 ; CHECK-GISEL-NEXT:    v_readfirstlane_b32 s50, v16
@@ -1496,6 +1376,15 @@ define void @test_readfirstlane_v32i32(ptr addrspace(1) %out, <32 x i32> %src) {
 ; CHECK-GISEL-NEXT:    v_readfirstlane_b32 s53, v19
 ; CHECK-GISEL-NEXT:    v_readfirstlane_b32 s54, v20
 ; CHECK-GISEL-NEXT:    v_readfirstlane_b32 s55, v21
+; CHECK-GISEL-NEXT:    v_readfirstlane_b32 s64, v30
+; CHECK-GISEL-NEXT:    v_readfirstlane_b32 s40, v6
+; CHECK-GISEL-NEXT:    v_readfirstlane_b32 s41, v7
+; CHECK-GISEL-NEXT:    v_readfirstlane_b32 s42, v8
+; CHECK-GISEL-NEXT:    v_readfirstlane_b32 s43, v9
+; CHECK-GISEL-NEXT:    v_readfirstlane_b32 s44, v10
+; CHECK-GISEL-NEXT:    v_readfirstlane_b32 s45, v11
+; CHECK-GISEL-NEXT:    v_readfirstlane_b32 s46, v12
+; CHECK-GISEL-NEXT:    v_readfirstlane_b32 s47, v13
 ; CHECK-GISEL-NEXT:    v_readfirstlane_b32 s56, v22
 ; CHECK-GISEL-NEXT:    v_readfirstlane_b32 s57, v23
 ; CHECK-GISEL-NEXT:    v_readfirstlane_b32 s58, v24
@@ -1504,7 +1393,6 @@ define void @test_readfirstlane_v32i32(ptr addrspace(1) %out, <32 x i32> %src) {
 ; CHECK-GISEL-NEXT:    v_readfirstlane_b32 s61, v27
 ; CHECK-GISEL-NEXT:    v_readfirstlane_b32 s62, v28
 ; CHECK-GISEL-NEXT:    v_readfirstlane_b32 s63, v29
-; CHECK-GISEL-NEXT:    v_readfirstlane_b32 s64, v30
 ; CHECK-GISEL-NEXT:    s_waitcnt vmcnt(2)
 ; CHECK-GISEL-NEXT:    v_readfirstlane_b32 s65, v0
 ; CHECK-GISEL-NEXT:    s_waitcnt vmcnt(1)
@@ -1514,34 +1402,18 @@ define void @test_readfirstlane_v32i32(ptr addrspace(1) %out, <32 x i32> %src) {
 ; CHECK-GISEL-NEXT:    ;;#ASMSTART
 ; CHECK-GISEL-NEXT:    ; use s[36:67]
 ; CHECK-GISEL-NEXT:    ;;#ASMEND
-; CHECK-GISEL-NEXT:    v_readlane_b32 s67, v31, 31
-; CHECK-GISEL-NEXT:    v_readlane_b32 s66, v31, 30
-; CHECK-GISEL-NEXT:    v_readlane_b32 s65, v31, 29
-; CHECK-GISEL-NEXT:    v_readlane_b32 s64, v31, 28
-; CHECK-GISEL-NEXT:    v_readlane_b32 s63, v31, 27
-; CHECK-GISEL-NEXT:    v_readlane_b32 s62, v31, 26
-; CHECK-GISEL-NEXT:    v_readlane_b32 s61, v31, 25
-; CHECK-GISEL-NEXT:    v_readlane_b32 s60, v31, 24
-; CHECK-GISEL-NEXT:    v_readlane_b32 s59, v31, 23
-; CHECK-GISEL-NEXT:    v_readlane_b32 s58, v31, 22
-; CHECK-GISEL-NEXT:    v_readlane_b32 s57, v31, 21
-; CHECK-GISEL-NEXT:    v_readlane_b32 s56, v31, 20
-; CHECK-GISEL-NEXT:    v_readlane_b32 s55, v31, 19
-; CHECK-GISEL-NEXT:    v_readlane_b32 s54, v31, 18
-; CHECK-GISEL-NEXT:    v_readlane_b32 s53, v31, 17
-; CHECK-GISEL-NEXT:    v_readlane_b32 s52, v31, 16
-; CHECK-GISEL-NEXT:    v_readlane_b32 s51, v31, 15
-; CHECK-GISEL-NEXT:    v_readlane_b32 s50, v31, 14
-; CHECK-GISEL-NEXT:    v_readlane_b32 s49, v31, 13
-; CHECK-GISEL-NEXT:    v_readlane_b32 s48, v31, 12
-; CHECK-GISEL-NEXT:    v_readlane_b32 s47, v31, 11
-; CHECK-GISEL-NEXT:    v_readlane_b32 s46, v31, 10
-; CHECK-GISEL-NEXT:    v_readlane_b32 s45, v31, 9
-; CHECK-GISEL-NEXT:    v_readlane_b32 s44, v31, 8
-; CHECK-GISEL-NEXT:    v_readlane_b32 s43, v31, 7
-; CHECK-GISEL-NEXT:    v_readlane_b32 s42, v31, 6
-; CHECK-GISEL-NEXT:    v_readlane_b32 s41, v31, 5
-; CHECK-GISEL-NEXT:    v_readlane_b32 s40, v31, 4
+; CHECK-GISEL-NEXT:    v_readlane_b32 s67, v31, 15
+; CHECK-GISEL-NEXT:    v_readlane_b32 s66, v31, 14
+; CHECK-GISEL-NEXT:    v_readlane_b32 s65, v31, 13
+; CHECK-GISEL-NEXT:    v_readlane_b32 s64, v31, 12
+; CHECK-GISEL-NEXT:    v_readlane_b32 s55, v31, 11
+; CHECK-GISEL-NEXT:    v_readlane_b32 s54, v31, 10
+; CHECK-GISEL-NEXT:    v_readlane_b32 s53, v31, 9
+; CHECK-GISEL-NEXT:    v_readlane_b32 s52, v31, 8
+; CHECK-GISEL-NEXT:    v_readlane_b32 s51, v31, 7
+; CHECK-GISEL-NEXT:    v_readlane_b32 s50, v31, 6
+; CHECK-GISEL-NEXT:    v_readlane_b32 s49, v31, 5
+; CHECK-GISEL-NEXT:    v_readlane_b32 s48, v31, 4
 ; CHECK-GISEL-NEXT:    v_readlane_b32 s39, v31, 3
 ; CHECK-GISEL-NEXT:    v_readlane_b32 s38, v31, 2
 ; CHECK-GISEL-NEXT:    v_readlane_b32 s37, v31, 1

diff  --git a/llvm/test/CodeGen/AMDGPU/llvm.maximum.f32.ll b/llvm/test/CodeGen/AMDGPU/llvm.maximum.f32.ll
index 584dd2700c419..8b1ba393c8de8 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.maximum.f32.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.maximum.f32.ll
@@ -1727,14 +1727,9 @@ define <16 x float> @v_maximum_v16f32(<16 x float> %src0, <16 x float> %src1) {
 ; GFX7-LABEL: v_maximum_v16f32:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    s_xor_saveexec_b64 s[4:5], -1
-; GFX7-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
-; GFX7-NEXT:    s_mov_b64 exec, s[4:5]
 ; GFX7-NEXT:    v_cmp_o_f32_e32 vcc, v1, v17
 ; GFX7-NEXT:    v_max_f32_e32 v1, v1, v17
 ; GFX7-NEXT:    buffer_load_dword v17, off, s[0:3], s32
-; GFX7-NEXT:    v_writelane_b32 v31, s30, 0
-; GFX7-NEXT:    v_writelane_b32 v31, s31, 1
 ; GFX7-NEXT:    v_cmp_o_f32_e64 s[4:5], v2, v18
 ; GFX7-NEXT:    v_max_f32_e32 v2, v2, v18
 ; GFX7-NEXT:    v_cmp_o_f32_e64 s[6:7], v3, v19
@@ -1743,7 +1738,7 @@ define <16 x float> @v_maximum_v16f32(<16 x float> %src0, <16 x float> %src1) {
 ; GFX7-NEXT:    v_max_f32_e32 v19, v0, v16
 ; GFX7-NEXT:    v_cmp_o_f32_e64 s[28:29], v0, v16
 ; GFX7-NEXT:    v_max_f32_e32 v16, v14, v30
-; GFX7-NEXT:    v_cmp_o_f32_e64 s[30:31], v14, v30
+; GFX7-NEXT:    v_cmp_o_f32_e64 s[40:41], v14, v30
 ; GFX7-NEXT:    v_cmp_o_f32_e64 s[8:9], v4, v20
 ; GFX7-NEXT:    v_max_f32_e32 v4, v4, v20
 ; GFX7-NEXT:    v_cmp_o_f32_e64 s[10:11], v5, v21
@@ -1765,7 +1760,7 @@ define <16 x float> @v_maximum_v16f32(<16 x float> %src0, <16 x float> %src1) {
 ; GFX7-NEXT:    v_cmp_o_f32_e64 s[26:27], v13, v29
 ; GFX7-NEXT:    v_max_f32_e32 v13, v13, v29
 ; GFX7-NEXT:    v_cndmask_b32_e32 v1, v18, v1, vcc
-; GFX7-NEXT:    v_cndmask_b32_e64 v14, v18, v16, s[30:31]
+; GFX7-NEXT:    v_cndmask_b32_e64 v14, v18, v16, s[40:41]
 ; GFX7-NEXT:    v_cndmask_b32_e64 v0, v18, v19, s[28:29]
 ; GFX7-NEXT:    v_cndmask_b32_e64 v2, v18, v2, s[4:5]
 ; GFX7-NEXT:    v_cndmask_b32_e64 v3, v18, v3, s[6:7]
@@ -1779,29 +1774,18 @@ define <16 x float> @v_maximum_v16f32(<16 x float> %src0, <16 x float> %src1) {
 ; GFX7-NEXT:    v_cndmask_b32_e64 v11, v18, v11, s[22:23]
 ; GFX7-NEXT:    v_cndmask_b32_e64 v12, v18, v12, s[24:25]
 ; GFX7-NEXT:    v_cndmask_b32_e64 v13, v18, v13, s[26:27]
-; GFX7-NEXT:    v_readlane_b32 s31, v31, 1
-; GFX7-NEXT:    v_readlane_b32 s30, v31, 0
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    v_max_f32_e32 v16, v15, v17
 ; GFX7-NEXT:    v_cmp_o_f32_e32 vcc, v15, v17
 ; GFX7-NEXT:    v_cndmask_b32_e32 v15, v18, v16, vcc
-; GFX7-NEXT:    s_xor_saveexec_b64 s[4:5], -1
-; GFX7-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
-; GFX7-NEXT:    s_mov_b64 exec, s[4:5]
-; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_maximum_v16f32:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    s_xor_saveexec_b64 s[4:5], -1
-; GFX8-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
-; GFX8-NEXT:    s_mov_b64 exec, s[4:5]
 ; GFX8-NEXT:    v_cmp_o_f32_e32 vcc, v1, v17
 ; GFX8-NEXT:    v_max_f32_e32 v1, v1, v17
 ; GFX8-NEXT:    buffer_load_dword v17, off, s[0:3], s32
-; GFX8-NEXT:    v_writelane_b32 v31, s30, 0
-; GFX8-NEXT:    v_writelane_b32 v31, s31, 1
 ; GFX8-NEXT:    v_cmp_o_f32_e64 s[4:5], v2, v18
 ; GFX8-NEXT:    v_max_f32_e32 v2, v2, v18
 ; GFX8-NEXT:    v_cmp_o_f32_e64 s[6:7], v3, v19
@@ -1810,7 +1794,7 @@ define <16 x float> @v_maximum_v16f32(<16 x float> %src0, <16 x float> %src1) {
 ; GFX8-NEXT:    v_max_f32_e32 v19, v0, v16
 ; GFX8-NEXT:    v_cmp_o_f32_e64 s[28:29], v0, v16
 ; GFX8-NEXT:    v_max_f32_e32 v16, v14, v30
-; GFX8-NEXT:    v_cmp_o_f32_e64 s[30:31], v14, v30
+; GFX8-NEXT:    v_cmp_o_f32_e64 s[40:41], v14, v30
 ; GFX8-NEXT:    v_cmp_o_f32_e64 s[8:9], v4, v20
 ; GFX8-NEXT:    v_max_f32_e32 v4, v4, v20
 ; GFX8-NEXT:    v_cmp_o_f32_e64 s[10:11], v5, v21
@@ -1832,7 +1816,7 @@ define <16 x float> @v_maximum_v16f32(<16 x float> %src0, <16 x float> %src1) {
 ; GFX8-NEXT:    v_cmp_o_f32_e64 s[26:27], v13, v29
 ; GFX8-NEXT:    v_max_f32_e32 v13, v13, v29
 ; GFX8-NEXT:    v_cndmask_b32_e32 v1, v18, v1, vcc
-; GFX8-NEXT:    v_cndmask_b32_e64 v14, v18, v16, s[30:31]
+; GFX8-NEXT:    v_cndmask_b32_e64 v14, v18, v16, s[40:41]
 ; GFX8-NEXT:    v_cndmask_b32_e64 v0, v18, v19, s[28:29]
 ; GFX8-NEXT:    v_cndmask_b32_e64 v2, v18, v2, s[4:5]
 ; GFX8-NEXT:    v_cndmask_b32_e64 v3, v18, v3, s[6:7]
@@ -1846,29 +1830,18 @@ define <16 x float> @v_maximum_v16f32(<16 x float> %src0, <16 x float> %src1) {
 ; GFX8-NEXT:    v_cndmask_b32_e64 v11, v18, v11, s[22:23]
 ; GFX8-NEXT:    v_cndmask_b32_e64 v12, v18, v12, s[24:25]
 ; GFX8-NEXT:    v_cndmask_b32_e64 v13, v18, v13, s[26:27]
-; GFX8-NEXT:    v_readlane_b32 s31, v31, 1
-; GFX8-NEXT:    v_readlane_b32 s30, v31, 0
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    v_max_f32_e32 v16, v15, v17
 ; GFX8-NEXT:    v_cmp_o_f32_e32 vcc, v15, v17
 ; GFX8-NEXT:    v_cndmask_b32_e32 v15, v18, v16, vcc
-; GFX8-NEXT:    s_xor_saveexec_b64 s[4:5], -1
-; GFX8-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
-; GFX8-NEXT:    s_mov_b64 exec, s[4:5]
-; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-LABEL: v_maximum_v16f32:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-NEXT:    s_xor_saveexec_b64 s[4:5], -1
-; GFX900-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
-; GFX900-NEXT:    s_mov_b64 exec, s[4:5]
 ; GFX900-NEXT:    v_cmp_o_f32_e32 vcc, v1, v17
 ; GFX900-NEXT:    v_max_f32_e32 v1, v1, v17
 ; GFX900-NEXT:    buffer_load_dword v17, off, s[0:3], s32
-; GFX900-NEXT:    v_writelane_b32 v31, s30, 0
-; GFX900-NEXT:    v_writelane_b32 v31, s31, 1
 ; GFX900-NEXT:    v_cmp_o_f32_e64 s[4:5], v2, v18
 ; GFX900-NEXT:    v_max_f32_e32 v2, v2, v18
 ; GFX900-NEXT:    v_cmp_o_f32_e64 s[6:7], v3, v19
@@ -1877,7 +1850,7 @@ define <16 x float> @v_maximum_v16f32(<16 x float> %src0, <16 x float> %src1) {
 ; GFX900-NEXT:    v_max_f32_e32 v19, v0, v16
 ; GFX900-NEXT:    v_cmp_o_f32_e64 s[28:29], v0, v16
 ; GFX900-NEXT:    v_max_f32_e32 v16, v14, v30
-; GFX900-NEXT:    v_cmp_o_f32_e64 s[30:31], v14, v30
+; GFX900-NEXT:    v_cmp_o_f32_e64 s[40:41], v14, v30
 ; GFX900-NEXT:    v_cmp_o_f32_e64 s[8:9], v4, v20
 ; GFX900-NEXT:    v_max_f32_e32 v4, v4, v20
 ; GFX900-NEXT:    v_cmp_o_f32_e64 s[10:11], v5, v21
@@ -1899,7 +1872,7 @@ define <16 x float> @v_maximum_v16f32(<16 x float> %src0, <16 x float> %src1) {
 ; GFX900-NEXT:    v_cmp_o_f32_e64 s[26:27], v13, v29
 ; GFX900-NEXT:    v_max_f32_e32 v13, v13, v29
 ; GFX900-NEXT:    v_cndmask_b32_e32 v1, v18, v1, vcc
-; GFX900-NEXT:    v_cndmask_b32_e64 v14, v18, v16, s[30:31]
+; GFX900-NEXT:    v_cndmask_b32_e64 v14, v18, v16, s[40:41]
 ; GFX900-NEXT:    v_cndmask_b32_e64 v0, v18, v19, s[28:29]
 ; GFX900-NEXT:    v_cndmask_b32_e64 v2, v18, v2, s[4:5]
 ; GFX900-NEXT:    v_cndmask_b32_e64 v3, v18, v3, s[6:7]
@@ -1913,16 +1886,10 @@ define <16 x float> @v_maximum_v16f32(<16 x float> %src0, <16 x float> %src1) {
 ; GFX900-NEXT:    v_cndmask_b32_e64 v11, v18, v11, s[22:23]
 ; GFX900-NEXT:    v_cndmask_b32_e64 v12, v18, v12, s[24:25]
 ; GFX900-NEXT:    v_cndmask_b32_e64 v13, v18, v13, s[26:27]
-; GFX900-NEXT:    v_readlane_b32 s31, v31, 1
-; GFX900-NEXT:    v_readlane_b32 s30, v31, 0
 ; GFX900-NEXT:    s_waitcnt vmcnt(0)
 ; GFX900-NEXT:    v_max_f32_e32 v16, v15, v17
 ; GFX900-NEXT:    v_cmp_o_f32_e32 vcc, v15, v17
 ; GFX900-NEXT:    v_cndmask_b32_e32 v15, v18, v16, vcc
-; GFX900-NEXT:    s_xor_saveexec_b64 s[4:5], -1
-; GFX900-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
-; GFX900-NEXT:    s_mov_b64 exec, s[4:5]
-; GFX900-NEXT:    s_waitcnt vmcnt(0)
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX950-LABEL: v_maximum_v16f32:

diff  --git a/llvm/test/CodeGen/AMDGPU/llvm.maximum.f64.ll b/llvm/test/CodeGen/AMDGPU/llvm.maximum.f64.ll
index e354ec6fb3dd7..3344c73f9eb6f 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.maximum.f64.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.maximum.f64.ll
@@ -2008,15 +2008,8 @@ define <16 x double> @v_maximum_v16f64(<16 x double> %src0, <16 x double> %src1)
 ; GFX7-LABEL: v_maximum_v16f64:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    s_xor_saveexec_b64 s[4:5], -1
-; GFX7-NEXT:    buffer_store_dword v34, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
-; GFX7-NEXT:    s_mov_b64 exec, s[4:5]
 ; GFX7-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:8
 ; GFX7-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:4
-; GFX7-NEXT:    v_writelane_b32 v34, s30, 0
-; GFX7-NEXT:    v_writelane_b32 v34, s31, 1
-; GFX7-NEXT:    v_writelane_b32 v34, s34, 2
-; GFX7-NEXT:    v_writelane_b32 v34, s35, 3
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    v_cmp_u_f64_e32 vcc, v[0:1], v[31:32]
 ; GFX7-NEXT:    v_max_f64 v[0:1], v[0:1], v[31:32]
@@ -2102,14 +2095,14 @@ define <16 x double> @v_maximum_v16f64(<16 x double> %src0, <16 x double> %src1)
 ; GFX7-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:116
 ; GFX7-NEXT:    v_cndmask_b32_e64 v26, v26, 0, s[28:29]
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_cmp_u_f64_e64 s[30:31], v[28:29], v[31:32]
+; GFX7-NEXT:    v_cmp_u_f64_e64 s[40:41], v[28:29], v[31:32]
 ; GFX7-NEXT:    v_max_f64 v[28:29], v[28:29], v[31:32]
 ; GFX7-NEXT:    buffer_load_dword v31, off, s[0:3], s32
 ; GFX7-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:128
 ; GFX7-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:124
-; GFX7-NEXT:    v_cndmask_b32_e64 v28, v28, 0, s[30:31]
+; GFX7-NEXT:    v_cndmask_b32_e64 v28, v28, 0, s[40:41]
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_cmp_u_f64_e64 s[34:35], v[30:31], v[32:33]
+; GFX7-NEXT:    v_cmp_u_f64_e64 s[42:43], v[30:31], v[32:33]
 ; GFX7-NEXT:    v_max_f64 v[30:31], v[30:31], v[32:33]
 ; GFX7-NEXT:    v_mov_b32_e32 v32, 0x7ff80000
 ; GFX7-NEXT:    v_cndmask_b32_e32 v1, v1, v32, vcc
@@ -2126,31 +2119,16 @@ define <16 x double> @v_maximum_v16f64(<16 x double> %src0, <16 x double> %src1)
 ; GFX7-NEXT:    v_cndmask_b32_e64 v23, v23, v32, s[24:25]
 ; GFX7-NEXT:    v_cndmask_b32_e64 v25, v25, v32, s[26:27]
 ; GFX7-NEXT:    v_cndmask_b32_e64 v27, v27, v32, s[28:29]
-; GFX7-NEXT:    v_cndmask_b32_e64 v29, v29, v32, s[30:31]
-; GFX7-NEXT:    v_cndmask_b32_e64 v31, v31, v32, s[34:35]
-; GFX7-NEXT:    v_cndmask_b32_e64 v30, v30, 0, s[34:35]
-; GFX7-NEXT:    v_readlane_b32 s35, v34, 3
-; GFX7-NEXT:    v_readlane_b32 s34, v34, 2
-; GFX7-NEXT:    v_readlane_b32 s31, v34, 1
-; GFX7-NEXT:    v_readlane_b32 s30, v34, 0
-; GFX7-NEXT:    s_xor_saveexec_b64 s[4:5], -1
-; GFX7-NEXT:    buffer_load_dword v34, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload
-; GFX7-NEXT:    s_mov_b64 exec, s[4:5]
-; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_cndmask_b32_e64 v29, v29, v32, s[40:41]
+; GFX7-NEXT:    v_cndmask_b32_e64 v31, v31, v32, s[42:43]
+; GFX7-NEXT:    v_cndmask_b32_e64 v30, v30, 0, s[42:43]
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_maximum_v16f64:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    s_xor_saveexec_b64 s[4:5], -1
-; GFX8-NEXT:    buffer_store_dword v34, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
-; GFX8-NEXT:    s_mov_b64 exec, s[4:5]
 ; GFX8-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:8
 ; GFX8-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:4
-; GFX8-NEXT:    v_writelane_b32 v34, s30, 0
-; GFX8-NEXT:    v_writelane_b32 v34, s31, 1
-; GFX8-NEXT:    v_writelane_b32 v34, s34, 2
-; GFX8-NEXT:    v_writelane_b32 v34, s35, 3
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    v_cmp_u_f64_e32 vcc, v[0:1], v[31:32]
 ; GFX8-NEXT:    v_max_f64 v[0:1], v[0:1], v[31:32]
@@ -2236,14 +2214,14 @@ define <16 x double> @v_maximum_v16f64(<16 x double> %src0, <16 x double> %src1)
 ; GFX8-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:116
 ; GFX8-NEXT:    v_cndmask_b32_e64 v26, v26, 0, s[28:29]
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    v_cmp_u_f64_e64 s[30:31], v[28:29], v[31:32]
+; GFX8-NEXT:    v_cmp_u_f64_e64 s[40:41], v[28:29], v[31:32]
 ; GFX8-NEXT:    v_max_f64 v[28:29], v[28:29], v[31:32]
 ; GFX8-NEXT:    buffer_load_dword v31, off, s[0:3], s32
 ; GFX8-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:128
 ; GFX8-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:124
-; GFX8-NEXT:    v_cndmask_b32_e64 v28, v28, 0, s[30:31]
+; GFX8-NEXT:    v_cndmask_b32_e64 v28, v28, 0, s[40:41]
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    v_cmp_u_f64_e64 s[34:35], v[30:31], v[32:33]
+; GFX8-NEXT:    v_cmp_u_f64_e64 s[42:43], v[30:31], v[32:33]
 ; GFX8-NEXT:    v_max_f64 v[30:31], v[30:31], v[32:33]
 ; GFX8-NEXT:    v_mov_b32_e32 v32, 0x7ff80000
 ; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v32, vcc
@@ -2260,31 +2238,16 @@ define <16 x double> @v_maximum_v16f64(<16 x double> %src0, <16 x double> %src1)
 ; GFX8-NEXT:    v_cndmask_b32_e64 v23, v23, v32, s[24:25]
 ; GFX8-NEXT:    v_cndmask_b32_e64 v25, v25, v32, s[26:27]
 ; GFX8-NEXT:    v_cndmask_b32_e64 v27, v27, v32, s[28:29]
-; GFX8-NEXT:    v_cndmask_b32_e64 v29, v29, v32, s[30:31]
-; GFX8-NEXT:    v_cndmask_b32_e64 v31, v31, v32, s[34:35]
-; GFX8-NEXT:    v_cndmask_b32_e64 v30, v30, 0, s[34:35]
-; GFX8-NEXT:    v_readlane_b32 s35, v34, 3
-; GFX8-NEXT:    v_readlane_b32 s34, v34, 2
-; GFX8-NEXT:    v_readlane_b32 s31, v34, 1
-; GFX8-NEXT:    v_readlane_b32 s30, v34, 0
-; GFX8-NEXT:    s_xor_saveexec_b64 s[4:5], -1
-; GFX8-NEXT:    buffer_load_dword v34, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload
-; GFX8-NEXT:    s_mov_b64 exec, s[4:5]
-; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    v_cndmask_b32_e64 v29, v29, v32, s[40:41]
+; GFX8-NEXT:    v_cndmask_b32_e64 v31, v31, v32, s[42:43]
+; GFX8-NEXT:    v_cndmask_b32_e64 v30, v30, 0, s[42:43]
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-LABEL: v_maximum_v16f64:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-NEXT:    s_xor_saveexec_b64 s[4:5], -1
-; GFX900-NEXT:    buffer_store_dword v34, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
-; GFX900-NEXT:    s_mov_b64 exec, s[4:5]
 ; GFX900-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:8
 ; GFX900-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:4
-; GFX900-NEXT:    v_writelane_b32 v34, s30, 0
-; GFX900-NEXT:    v_writelane_b32 v34, s31, 1
-; GFX900-NEXT:    v_writelane_b32 v34, s34, 2
-; GFX900-NEXT:    v_writelane_b32 v34, s35, 3
 ; GFX900-NEXT:    s_waitcnt vmcnt(0)
 ; GFX900-NEXT:    v_cmp_u_f64_e32 vcc, v[0:1], v[31:32]
 ; GFX900-NEXT:    v_max_f64 v[0:1], v[0:1], v[31:32]
@@ -2370,14 +2333,14 @@ define <16 x double> @v_maximum_v16f64(<16 x double> %src0, <16 x double> %src1)
 ; GFX900-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:116
 ; GFX900-NEXT:    v_cndmask_b32_e64 v26, v26, 0, s[28:29]
 ; GFX900-NEXT:    s_waitcnt vmcnt(0)
-; GFX900-NEXT:    v_cmp_u_f64_e64 s[30:31], v[28:29], v[31:32]
+; GFX900-NEXT:    v_cmp_u_f64_e64 s[40:41], v[28:29], v[31:32]
 ; GFX900-NEXT:    v_max_f64 v[28:29], v[28:29], v[31:32]
 ; GFX900-NEXT:    buffer_load_dword v31, off, s[0:3], s32
 ; GFX900-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:128
 ; GFX900-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:124
-; GFX900-NEXT:    v_cndmask_b32_e64 v28, v28, 0, s[30:31]
+; GFX900-NEXT:    v_cndmask_b32_e64 v28, v28, 0, s[40:41]
 ; GFX900-NEXT:    s_waitcnt vmcnt(0)
-; GFX900-NEXT:    v_cmp_u_f64_e64 s[34:35], v[30:31], v[32:33]
+; GFX900-NEXT:    v_cmp_u_f64_e64 s[42:43], v[30:31], v[32:33]
 ; GFX900-NEXT:    v_max_f64 v[30:31], v[30:31], v[32:33]
 ; GFX900-NEXT:    v_mov_b32_e32 v32, 0x7ff80000
 ; GFX900-NEXT:    v_cndmask_b32_e32 v1, v1, v32, vcc
@@ -2394,17 +2357,9 @@ define <16 x double> @v_maximum_v16f64(<16 x double> %src0, <16 x double> %src1)
 ; GFX900-NEXT:    v_cndmask_b32_e64 v23, v23, v32, s[24:25]
 ; GFX900-NEXT:    v_cndmask_b32_e64 v25, v25, v32, s[26:27]
 ; GFX900-NEXT:    v_cndmask_b32_e64 v27, v27, v32, s[28:29]
-; GFX900-NEXT:    v_cndmask_b32_e64 v29, v29, v32, s[30:31]
-; GFX900-NEXT:    v_cndmask_b32_e64 v31, v31, v32, s[34:35]
-; GFX900-NEXT:    v_cndmask_b32_e64 v30, v30, 0, s[34:35]
-; GFX900-NEXT:    v_readlane_b32 s35, v34, 3
-; GFX900-NEXT:    v_readlane_b32 s34, v34, 2
-; GFX900-NEXT:    v_readlane_b32 s31, v34, 1
-; GFX900-NEXT:    v_readlane_b32 s30, v34, 0
-; GFX900-NEXT:    s_xor_saveexec_b64 s[4:5], -1
-; GFX900-NEXT:    buffer_load_dword v34, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload
-; GFX900-NEXT:    s_mov_b64 exec, s[4:5]
-; GFX900-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-NEXT:    v_cndmask_b32_e64 v29, v29, v32, s[40:41]
+; GFX900-NEXT:    v_cndmask_b32_e64 v31, v31, v32, s[42:43]
+; GFX900-NEXT:    v_cndmask_b32_e64 v30, v30, 0, s[42:43]
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX950-LABEL: v_maximum_v16f64:

diff  --git a/llvm/test/CodeGen/AMDGPU/llvm.minimum.f32.ll b/llvm/test/CodeGen/AMDGPU/llvm.minimum.f32.ll
index 9962433134073..7b2998cbd242f 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.minimum.f32.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.minimum.f32.ll
@@ -1727,14 +1727,9 @@ define <16 x float> @v_minimum_v16f32(<16 x float> %src0, <16 x float> %src1) {
 ; GFX7-LABEL: v_minimum_v16f32:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    s_xor_saveexec_b64 s[4:5], -1
-; GFX7-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
-; GFX7-NEXT:    s_mov_b64 exec, s[4:5]
 ; GFX7-NEXT:    v_cmp_o_f32_e32 vcc, v1, v17
 ; GFX7-NEXT:    v_min_f32_e32 v1, v1, v17
 ; GFX7-NEXT:    buffer_load_dword v17, off, s[0:3], s32
-; GFX7-NEXT:    v_writelane_b32 v31, s30, 0
-; GFX7-NEXT:    v_writelane_b32 v31, s31, 1
 ; GFX7-NEXT:    v_cmp_o_f32_e64 s[4:5], v2, v18
 ; GFX7-NEXT:    v_min_f32_e32 v2, v2, v18
 ; GFX7-NEXT:    v_cmp_o_f32_e64 s[6:7], v3, v19
@@ -1743,7 +1738,7 @@ define <16 x float> @v_minimum_v16f32(<16 x float> %src0, <16 x float> %src1) {
 ; GFX7-NEXT:    v_min_f32_e32 v19, v0, v16
 ; GFX7-NEXT:    v_cmp_o_f32_e64 s[28:29], v0, v16
 ; GFX7-NEXT:    v_min_f32_e32 v16, v14, v30
-; GFX7-NEXT:    v_cmp_o_f32_e64 s[30:31], v14, v30
+; GFX7-NEXT:    v_cmp_o_f32_e64 s[40:41], v14, v30
 ; GFX7-NEXT:    v_cmp_o_f32_e64 s[8:9], v4, v20
 ; GFX7-NEXT:    v_min_f32_e32 v4, v4, v20
 ; GFX7-NEXT:    v_cmp_o_f32_e64 s[10:11], v5, v21
@@ -1765,7 +1760,7 @@ define <16 x float> @v_minimum_v16f32(<16 x float> %src0, <16 x float> %src1) {
 ; GFX7-NEXT:    v_cmp_o_f32_e64 s[26:27], v13, v29
 ; GFX7-NEXT:    v_min_f32_e32 v13, v13, v29
 ; GFX7-NEXT:    v_cndmask_b32_e32 v1, v18, v1, vcc
-; GFX7-NEXT:    v_cndmask_b32_e64 v14, v18, v16, s[30:31]
+; GFX7-NEXT:    v_cndmask_b32_e64 v14, v18, v16, s[40:41]
 ; GFX7-NEXT:    v_cndmask_b32_e64 v0, v18, v19, s[28:29]
 ; GFX7-NEXT:    v_cndmask_b32_e64 v2, v18, v2, s[4:5]
 ; GFX7-NEXT:    v_cndmask_b32_e64 v3, v18, v3, s[6:7]
@@ -1779,29 +1774,18 @@ define <16 x float> @v_minimum_v16f32(<16 x float> %src0, <16 x float> %src1) {
 ; GFX7-NEXT:    v_cndmask_b32_e64 v11, v18, v11, s[22:23]
 ; GFX7-NEXT:    v_cndmask_b32_e64 v12, v18, v12, s[24:25]
 ; GFX7-NEXT:    v_cndmask_b32_e64 v13, v18, v13, s[26:27]
-; GFX7-NEXT:    v_readlane_b32 s31, v31, 1
-; GFX7-NEXT:    v_readlane_b32 s30, v31, 0
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    v_min_f32_e32 v16, v15, v17
 ; GFX7-NEXT:    v_cmp_o_f32_e32 vcc, v15, v17
 ; GFX7-NEXT:    v_cndmask_b32_e32 v15, v18, v16, vcc
-; GFX7-NEXT:    s_xor_saveexec_b64 s[4:5], -1
-; GFX7-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
-; GFX7-NEXT:    s_mov_b64 exec, s[4:5]
-; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_minimum_v16f32:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    s_xor_saveexec_b64 s[4:5], -1
-; GFX8-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
-; GFX8-NEXT:    s_mov_b64 exec, s[4:5]
 ; GFX8-NEXT:    v_cmp_o_f32_e32 vcc, v1, v17
 ; GFX8-NEXT:    v_min_f32_e32 v1, v1, v17
 ; GFX8-NEXT:    buffer_load_dword v17, off, s[0:3], s32
-; GFX8-NEXT:    v_writelane_b32 v31, s30, 0
-; GFX8-NEXT:    v_writelane_b32 v31, s31, 1
 ; GFX8-NEXT:    v_cmp_o_f32_e64 s[4:5], v2, v18
 ; GFX8-NEXT:    v_min_f32_e32 v2, v2, v18
 ; GFX8-NEXT:    v_cmp_o_f32_e64 s[6:7], v3, v19
@@ -1810,7 +1794,7 @@ define <16 x float> @v_minimum_v16f32(<16 x float> %src0, <16 x float> %src1) {
 ; GFX8-NEXT:    v_min_f32_e32 v19, v0, v16
 ; GFX8-NEXT:    v_cmp_o_f32_e64 s[28:29], v0, v16
 ; GFX8-NEXT:    v_min_f32_e32 v16, v14, v30
-; GFX8-NEXT:    v_cmp_o_f32_e64 s[30:31], v14, v30
+; GFX8-NEXT:    v_cmp_o_f32_e64 s[40:41], v14, v30
 ; GFX8-NEXT:    v_cmp_o_f32_e64 s[8:9], v4, v20
 ; GFX8-NEXT:    v_min_f32_e32 v4, v4, v20
 ; GFX8-NEXT:    v_cmp_o_f32_e64 s[10:11], v5, v21
@@ -1832,7 +1816,7 @@ define <16 x float> @v_minimum_v16f32(<16 x float> %src0, <16 x float> %src1) {
 ; GFX8-NEXT:    v_cmp_o_f32_e64 s[26:27], v13, v29
 ; GFX8-NEXT:    v_min_f32_e32 v13, v13, v29
 ; GFX8-NEXT:    v_cndmask_b32_e32 v1, v18, v1, vcc
-; GFX8-NEXT:    v_cndmask_b32_e64 v14, v18, v16, s[30:31]
+; GFX8-NEXT:    v_cndmask_b32_e64 v14, v18, v16, s[40:41]
 ; GFX8-NEXT:    v_cndmask_b32_e64 v0, v18, v19, s[28:29]
 ; GFX8-NEXT:    v_cndmask_b32_e64 v2, v18, v2, s[4:5]
 ; GFX8-NEXT:    v_cndmask_b32_e64 v3, v18, v3, s[6:7]
@@ -1846,29 +1830,18 @@ define <16 x float> @v_minimum_v16f32(<16 x float> %src0, <16 x float> %src1) {
 ; GFX8-NEXT:    v_cndmask_b32_e64 v11, v18, v11, s[22:23]
 ; GFX8-NEXT:    v_cndmask_b32_e64 v12, v18, v12, s[24:25]
 ; GFX8-NEXT:    v_cndmask_b32_e64 v13, v18, v13, s[26:27]
-; GFX8-NEXT:    v_readlane_b32 s31, v31, 1
-; GFX8-NEXT:    v_readlane_b32 s30, v31, 0
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    v_min_f32_e32 v16, v15, v17
 ; GFX8-NEXT:    v_cmp_o_f32_e32 vcc, v15, v17
 ; GFX8-NEXT:    v_cndmask_b32_e32 v15, v18, v16, vcc
-; GFX8-NEXT:    s_xor_saveexec_b64 s[4:5], -1
-; GFX8-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
-; GFX8-NEXT:    s_mov_b64 exec, s[4:5]
-; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-LABEL: v_minimum_v16f32:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-NEXT:    s_xor_saveexec_b64 s[4:5], -1
-; GFX900-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
-; GFX900-NEXT:    s_mov_b64 exec, s[4:5]
 ; GFX900-NEXT:    v_cmp_o_f32_e32 vcc, v1, v17
 ; GFX900-NEXT:    v_min_f32_e32 v1, v1, v17
 ; GFX900-NEXT:    buffer_load_dword v17, off, s[0:3], s32
-; GFX900-NEXT:    v_writelane_b32 v31, s30, 0
-; GFX900-NEXT:    v_writelane_b32 v31, s31, 1
 ; GFX900-NEXT:    v_cmp_o_f32_e64 s[4:5], v2, v18
 ; GFX900-NEXT:    v_min_f32_e32 v2, v2, v18
 ; GFX900-NEXT:    v_cmp_o_f32_e64 s[6:7], v3, v19
@@ -1877,7 +1850,7 @@ define <16 x float> @v_minimum_v16f32(<16 x float> %src0, <16 x float> %src1) {
 ; GFX900-NEXT:    v_min_f32_e32 v19, v0, v16
 ; GFX900-NEXT:    v_cmp_o_f32_e64 s[28:29], v0, v16
 ; GFX900-NEXT:    v_min_f32_e32 v16, v14, v30
-; GFX900-NEXT:    v_cmp_o_f32_e64 s[30:31], v14, v30
+; GFX900-NEXT:    v_cmp_o_f32_e64 s[40:41], v14, v30
 ; GFX900-NEXT:    v_cmp_o_f32_e64 s[8:9], v4, v20
 ; GFX900-NEXT:    v_min_f32_e32 v4, v4, v20
 ; GFX900-NEXT:    v_cmp_o_f32_e64 s[10:11], v5, v21
@@ -1899,7 +1872,7 @@ define <16 x float> @v_minimum_v16f32(<16 x float> %src0, <16 x float> %src1) {
 ; GFX900-NEXT:    v_cmp_o_f32_e64 s[26:27], v13, v29
 ; GFX900-NEXT:    v_min_f32_e32 v13, v13, v29
 ; GFX900-NEXT:    v_cndmask_b32_e32 v1, v18, v1, vcc
-; GFX900-NEXT:    v_cndmask_b32_e64 v14, v18, v16, s[30:31]
+; GFX900-NEXT:    v_cndmask_b32_e64 v14, v18, v16, s[40:41]
 ; GFX900-NEXT:    v_cndmask_b32_e64 v0, v18, v19, s[28:29]
 ; GFX900-NEXT:    v_cndmask_b32_e64 v2, v18, v2, s[4:5]
 ; GFX900-NEXT:    v_cndmask_b32_e64 v3, v18, v3, s[6:7]
@@ -1913,16 +1886,10 @@ define <16 x float> @v_minimum_v16f32(<16 x float> %src0, <16 x float> %src1) {
 ; GFX900-NEXT:    v_cndmask_b32_e64 v11, v18, v11, s[22:23]
 ; GFX900-NEXT:    v_cndmask_b32_e64 v12, v18, v12, s[24:25]
 ; GFX900-NEXT:    v_cndmask_b32_e64 v13, v18, v13, s[26:27]
-; GFX900-NEXT:    v_readlane_b32 s31, v31, 1
-; GFX900-NEXT:    v_readlane_b32 s30, v31, 0
 ; GFX900-NEXT:    s_waitcnt vmcnt(0)
 ; GFX900-NEXT:    v_min_f32_e32 v16, v15, v17
 ; GFX900-NEXT:    v_cmp_o_f32_e32 vcc, v15, v17
 ; GFX900-NEXT:    v_cndmask_b32_e32 v15, v18, v16, vcc
-; GFX900-NEXT:    s_xor_saveexec_b64 s[4:5], -1
-; GFX900-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
-; GFX900-NEXT:    s_mov_b64 exec, s[4:5]
-; GFX900-NEXT:    s_waitcnt vmcnt(0)
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX950-LABEL: v_minimum_v16f32:

diff  --git a/llvm/test/CodeGen/AMDGPU/llvm.minimum.f64.ll b/llvm/test/CodeGen/AMDGPU/llvm.minimum.f64.ll
index 71fdd691a1512..1d1673315f6ff 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.minimum.f64.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.minimum.f64.ll
@@ -2008,15 +2008,8 @@ define <16 x double> @v_minimum_v16f64(<16 x double> %src0, <16 x double> %src1)
 ; GFX7-LABEL: v_minimum_v16f64:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    s_xor_saveexec_b64 s[4:5], -1
-; GFX7-NEXT:    buffer_store_dword v34, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
-; GFX7-NEXT:    s_mov_b64 exec, s[4:5]
 ; GFX7-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:8
 ; GFX7-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:4
-; GFX7-NEXT:    v_writelane_b32 v34, s30, 0
-; GFX7-NEXT:    v_writelane_b32 v34, s31, 1
-; GFX7-NEXT:    v_writelane_b32 v34, s34, 2
-; GFX7-NEXT:    v_writelane_b32 v34, s35, 3
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    v_cmp_u_f64_e32 vcc, v[0:1], v[31:32]
 ; GFX7-NEXT:    v_min_f64 v[0:1], v[0:1], v[31:32]
@@ -2102,14 +2095,14 @@ define <16 x double> @v_minimum_v16f64(<16 x double> %src0, <16 x double> %src1)
 ; GFX7-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:116
 ; GFX7-NEXT:    v_cndmask_b32_e64 v26, v26, 0, s[28:29]
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_cmp_u_f64_e64 s[30:31], v[28:29], v[31:32]
+; GFX7-NEXT:    v_cmp_u_f64_e64 s[40:41], v[28:29], v[31:32]
 ; GFX7-NEXT:    v_min_f64 v[28:29], v[28:29], v[31:32]
 ; GFX7-NEXT:    buffer_load_dword v31, off, s[0:3], s32
 ; GFX7-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:128
 ; GFX7-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:124
-; GFX7-NEXT:    v_cndmask_b32_e64 v28, v28, 0, s[30:31]
+; GFX7-NEXT:    v_cndmask_b32_e64 v28, v28, 0, s[40:41]
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_cmp_u_f64_e64 s[34:35], v[30:31], v[32:33]
+; GFX7-NEXT:    v_cmp_u_f64_e64 s[42:43], v[30:31], v[32:33]
 ; GFX7-NEXT:    v_min_f64 v[30:31], v[30:31], v[32:33]
 ; GFX7-NEXT:    v_mov_b32_e32 v32, 0x7ff80000
 ; GFX7-NEXT:    v_cndmask_b32_e32 v1, v1, v32, vcc
@@ -2126,31 +2119,16 @@ define <16 x double> @v_minimum_v16f64(<16 x double> %src0, <16 x double> %src1)
 ; GFX7-NEXT:    v_cndmask_b32_e64 v23, v23, v32, s[24:25]
 ; GFX7-NEXT:    v_cndmask_b32_e64 v25, v25, v32, s[26:27]
 ; GFX7-NEXT:    v_cndmask_b32_e64 v27, v27, v32, s[28:29]
-; GFX7-NEXT:    v_cndmask_b32_e64 v29, v29, v32, s[30:31]
-; GFX7-NEXT:    v_cndmask_b32_e64 v31, v31, v32, s[34:35]
-; GFX7-NEXT:    v_cndmask_b32_e64 v30, v30, 0, s[34:35]
-; GFX7-NEXT:    v_readlane_b32 s35, v34, 3
-; GFX7-NEXT:    v_readlane_b32 s34, v34, 2
-; GFX7-NEXT:    v_readlane_b32 s31, v34, 1
-; GFX7-NEXT:    v_readlane_b32 s30, v34, 0
-; GFX7-NEXT:    s_xor_saveexec_b64 s[4:5], -1
-; GFX7-NEXT:    buffer_load_dword v34, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload
-; GFX7-NEXT:    s_mov_b64 exec, s[4:5]
-; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_cndmask_b32_e64 v29, v29, v32, s[40:41]
+; GFX7-NEXT:    v_cndmask_b32_e64 v31, v31, v32, s[42:43]
+; GFX7-NEXT:    v_cndmask_b32_e64 v30, v30, 0, s[42:43]
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_minimum_v16f64:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    s_xor_saveexec_b64 s[4:5], -1
-; GFX8-NEXT:    buffer_store_dword v34, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
-; GFX8-NEXT:    s_mov_b64 exec, s[4:5]
 ; GFX8-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:8
 ; GFX8-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:4
-; GFX8-NEXT:    v_writelane_b32 v34, s30, 0
-; GFX8-NEXT:    v_writelane_b32 v34, s31, 1
-; GFX8-NEXT:    v_writelane_b32 v34, s34, 2
-; GFX8-NEXT:    v_writelane_b32 v34, s35, 3
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    v_cmp_u_f64_e32 vcc, v[0:1], v[31:32]
 ; GFX8-NEXT:    v_min_f64 v[0:1], v[0:1], v[31:32]
@@ -2236,14 +2214,14 @@ define <16 x double> @v_minimum_v16f64(<16 x double> %src0, <16 x double> %src1)
 ; GFX8-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:116
 ; GFX8-NEXT:    v_cndmask_b32_e64 v26, v26, 0, s[28:29]
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    v_cmp_u_f64_e64 s[30:31], v[28:29], v[31:32]
+; GFX8-NEXT:    v_cmp_u_f64_e64 s[40:41], v[28:29], v[31:32]
 ; GFX8-NEXT:    v_min_f64 v[28:29], v[28:29], v[31:32]
 ; GFX8-NEXT:    buffer_load_dword v31, off, s[0:3], s32
 ; GFX8-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:128
 ; GFX8-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:124
-; GFX8-NEXT:    v_cndmask_b32_e64 v28, v28, 0, s[30:31]
+; GFX8-NEXT:    v_cndmask_b32_e64 v28, v28, 0, s[40:41]
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    v_cmp_u_f64_e64 s[34:35], v[30:31], v[32:33]
+; GFX8-NEXT:    v_cmp_u_f64_e64 s[42:43], v[30:31], v[32:33]
 ; GFX8-NEXT:    v_min_f64 v[30:31], v[30:31], v[32:33]
 ; GFX8-NEXT:    v_mov_b32_e32 v32, 0x7ff80000
 ; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v32, vcc
@@ -2260,31 +2238,16 @@ define <16 x double> @v_minimum_v16f64(<16 x double> %src0, <16 x double> %src1)
 ; GFX8-NEXT:    v_cndmask_b32_e64 v23, v23, v32, s[24:25]
 ; GFX8-NEXT:    v_cndmask_b32_e64 v25, v25, v32, s[26:27]
 ; GFX8-NEXT:    v_cndmask_b32_e64 v27, v27, v32, s[28:29]
-; GFX8-NEXT:    v_cndmask_b32_e64 v29, v29, v32, s[30:31]
-; GFX8-NEXT:    v_cndmask_b32_e64 v31, v31, v32, s[34:35]
-; GFX8-NEXT:    v_cndmask_b32_e64 v30, v30, 0, s[34:35]
-; GFX8-NEXT:    v_readlane_b32 s35, v34, 3
-; GFX8-NEXT:    v_readlane_b32 s34, v34, 2
-; GFX8-NEXT:    v_readlane_b32 s31, v34, 1
-; GFX8-NEXT:    v_readlane_b32 s30, v34, 0
-; GFX8-NEXT:    s_xor_saveexec_b64 s[4:5], -1
-; GFX8-NEXT:    buffer_load_dword v34, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload
-; GFX8-NEXT:    s_mov_b64 exec, s[4:5]
-; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    v_cndmask_b32_e64 v29, v29, v32, s[40:41]
+; GFX8-NEXT:    v_cndmask_b32_e64 v31, v31, v32, s[42:43]
+; GFX8-NEXT:    v_cndmask_b32_e64 v30, v30, 0, s[42:43]
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-LABEL: v_minimum_v16f64:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-NEXT:    s_xor_saveexec_b64 s[4:5], -1
-; GFX900-NEXT:    buffer_store_dword v34, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
-; GFX900-NEXT:    s_mov_b64 exec, s[4:5]
 ; GFX900-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:8
 ; GFX900-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:4
-; GFX900-NEXT:    v_writelane_b32 v34, s30, 0
-; GFX900-NEXT:    v_writelane_b32 v34, s31, 1
-; GFX900-NEXT:    v_writelane_b32 v34, s34, 2
-; GFX900-NEXT:    v_writelane_b32 v34, s35, 3
 ; GFX900-NEXT:    s_waitcnt vmcnt(0)
 ; GFX900-NEXT:    v_cmp_u_f64_e32 vcc, v[0:1], v[31:32]
 ; GFX900-NEXT:    v_min_f64 v[0:1], v[0:1], v[31:32]
@@ -2370,14 +2333,14 @@ define <16 x double> @v_minimum_v16f64(<16 x double> %src0, <16 x double> %src1)
 ; GFX900-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:116
 ; GFX900-NEXT:    v_cndmask_b32_e64 v26, v26, 0, s[28:29]
 ; GFX900-NEXT:    s_waitcnt vmcnt(0)
-; GFX900-NEXT:    v_cmp_u_f64_e64 s[30:31], v[28:29], v[31:32]
+; GFX900-NEXT:    v_cmp_u_f64_e64 s[40:41], v[28:29], v[31:32]
 ; GFX900-NEXT:    v_min_f64 v[28:29], v[28:29], v[31:32]
 ; GFX900-NEXT:    buffer_load_dword v31, off, s[0:3], s32
 ; GFX900-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:128
 ; GFX900-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:124
-; GFX900-NEXT:    v_cndmask_b32_e64 v28, v28, 0, s[30:31]
+; GFX900-NEXT:    v_cndmask_b32_e64 v28, v28, 0, s[40:41]
 ; GFX900-NEXT:    s_waitcnt vmcnt(0)
-; GFX900-NEXT:    v_cmp_u_f64_e64 s[34:35], v[30:31], v[32:33]
+; GFX900-NEXT:    v_cmp_u_f64_e64 s[42:43], v[30:31], v[32:33]
 ; GFX900-NEXT:    v_min_f64 v[30:31], v[30:31], v[32:33]
 ; GFX900-NEXT:    v_mov_b32_e32 v32, 0x7ff80000
 ; GFX900-NEXT:    v_cndmask_b32_e32 v1, v1, v32, vcc
@@ -2394,17 +2357,9 @@ define <16 x double> @v_minimum_v16f64(<16 x double> %src0, <16 x double> %src1)
 ; GFX900-NEXT:    v_cndmask_b32_e64 v23, v23, v32, s[24:25]
 ; GFX900-NEXT:    v_cndmask_b32_e64 v25, v25, v32, s[26:27]
 ; GFX900-NEXT:    v_cndmask_b32_e64 v27, v27, v32, s[28:29]
-; GFX900-NEXT:    v_cndmask_b32_e64 v29, v29, v32, s[30:31]
-; GFX900-NEXT:    v_cndmask_b32_e64 v31, v31, v32, s[34:35]
-; GFX900-NEXT:    v_cndmask_b32_e64 v30, v30, 0, s[34:35]
-; GFX900-NEXT:    v_readlane_b32 s35, v34, 3
-; GFX900-NEXT:    v_readlane_b32 s34, v34, 2
-; GFX900-NEXT:    v_readlane_b32 s31, v34, 1
-; GFX900-NEXT:    v_readlane_b32 s30, v34, 0
-; GFX900-NEXT:    s_xor_saveexec_b64 s[4:5], -1
-; GFX900-NEXT:    buffer_load_dword v34, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload
-; GFX900-NEXT:    s_mov_b64 exec, s[4:5]
-; GFX900-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-NEXT:    v_cndmask_b32_e64 v29, v29, v32, s[40:41]
+; GFX900-NEXT:    v_cndmask_b32_e64 v31, v31, v32, s[42:43]
+; GFX900-NEXT:    v_cndmask_b32_e64 v30, v30, 0, s[42:43]
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX950-LABEL: v_minimum_v16f64:

diff  --git a/llvm/test/CodeGen/AMDGPU/machine-sink-temporal-divergence-swdev407790.ll b/llvm/test/CodeGen/AMDGPU/machine-sink-temporal-divergence-swdev407790.ll
index acb706cee04d0..4fb6a0114b499 100644
--- a/llvm/test/CodeGen/AMDGPU/machine-sink-temporal-divergence-swdev407790.ll
+++ b/llvm/test/CodeGen/AMDGPU/machine-sink-temporal-divergence-swdev407790.ll
@@ -43,28 +43,28 @@ define protected amdgpu_kernel void @kernel_round1(ptr addrspace(1) nocapture no
 ; CHECK-NEXT:    s_addc_u32 s13, s13, 0
 ; CHECK-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
 ; CHECK-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
-; CHECK-NEXT:    s_load_dwordx8 s[48:55], s[8:9], 0x0
+; CHECK-NEXT:    s_load_dwordx8 s[64:71], s[8:9], 0x0
 ; CHECK-NEXT:    s_add_u32 s0, s0, s17
 ; CHECK-NEXT:    s_mov_b64 s[34:35], s[8:9]
 ; CHECK-NEXT:    s_addc_u32 s1, s1, 0
 ; CHECK-NEXT:    v_mov_b32_e32 v40, v0
-; CHECK-NEXT:    s_add_u32 s44, s34, 40
+; CHECK-NEXT:    s_add_u32 s52, s34, 40
 ; CHECK-NEXT:    v_mov_b32_e32 v31, v0
 ; CHECK-NEXT:    v_mov_b32_e32 v0, 0
 ; CHECK-NEXT:    s_mov_b32 s33, s16
-; CHECK-NEXT:    s_addc_u32 s45, s35, 0
-; CHECK-NEXT:    s_mov_b32 s43, s14
+; CHECK-NEXT:    s_addc_u32 s53, s35, 0
+; CHECK-NEXT:    s_mov_b32 s51, s14
 ; CHECK-NEXT:    s_getpc_b64 s[16:17]
 ; CHECK-NEXT:    s_add_u32 s16, s16, _Z13get_global_idj at rel32@lo+4
 ; CHECK-NEXT:    s_addc_u32 s17, s17, _Z13get_global_idj at rel32@hi+12
-; CHECK-NEXT:    s_mov_b64 s[8:9], s[44:45]
+; CHECK-NEXT:    s_mov_b64 s[8:9], s[52:53]
 ; CHECK-NEXT:    s_mov_b32 s12, s14
 ; CHECK-NEXT:    s_mov_b32 s13, s15
 ; CHECK-NEXT:    s_mov_b32 s14, s33
-; CHECK-NEXT:    s_mov_b32 s42, s15
+; CHECK-NEXT:    s_mov_b32 s50, s15
 ; CHECK-NEXT:    s_mov_b64 s[36:37], s[10:11]
 ; CHECK-NEXT:    s_mov_b64 s[38:39], s[6:7]
-; CHECK-NEXT:    s_mov_b64 s[40:41], s[4:5]
+; CHECK-NEXT:    s_mov_b64 s[48:49], s[4:5]
 ; CHECK-NEXT:    v_mov_b32_e32 v45, 0
 ; CHECK-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; CHECK-NEXT:    v_mov_b32_e32 v43, v0
@@ -73,12 +73,12 @@ define protected amdgpu_kernel void @kernel_round1(ptr addrspace(1) nocapture no
 ; CHECK-NEXT:    s_getpc_b64 s[16:17]
 ; CHECK-NEXT:    s_add_u32 s16, s16, _Z12get_local_idj at rel32@lo+4
 ; CHECK-NEXT:    s_addc_u32 s17, s17, _Z12get_local_idj at rel32@hi+12
-; CHECK-NEXT:    s_mov_b64 s[4:5], s[40:41]
+; CHECK-NEXT:    s_mov_b64 s[4:5], s[48:49]
 ; CHECK-NEXT:    s_mov_b64 s[6:7], s[38:39]
-; CHECK-NEXT:    s_mov_b64 s[8:9], s[44:45]
+; CHECK-NEXT:    s_mov_b64 s[8:9], s[52:53]
 ; CHECK-NEXT:    s_mov_b64 s[10:11], s[36:37]
-; CHECK-NEXT:    s_mov_b32 s12, s43
-; CHECK-NEXT:    s_mov_b32 s13, s42
+; CHECK-NEXT:    s_mov_b32 s12, s51
+; CHECK-NEXT:    s_mov_b32 s13, s50
 ; CHECK-NEXT:    s_mov_b32 s14, s33
 ; CHECK-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; CHECK-NEXT:    v_mov_b32_e32 v41, v0
@@ -87,12 +87,12 @@ define protected amdgpu_kernel void @kernel_round1(ptr addrspace(1) nocapture no
 ; CHECK-NEXT:    s_getpc_b64 s[16:17]
 ; CHECK-NEXT:    s_add_u32 s16, s16, _Z7barrierj at rel32@lo+4
 ; CHECK-NEXT:    s_addc_u32 s17, s17, _Z7barrierj at rel32@hi+12
-; CHECK-NEXT:    s_mov_b64 s[4:5], s[40:41]
+; CHECK-NEXT:    s_mov_b64 s[4:5], s[48:49]
 ; CHECK-NEXT:    s_mov_b64 s[6:7], s[38:39]
-; CHECK-NEXT:    s_mov_b64 s[8:9], s[44:45]
+; CHECK-NEXT:    s_mov_b64 s[8:9], s[52:53]
 ; CHECK-NEXT:    s_mov_b64 s[10:11], s[36:37]
-; CHECK-NEXT:    s_mov_b32 s12, s43
-; CHECK-NEXT:    s_mov_b32 s13, s42
+; CHECK-NEXT:    s_mov_b32 s12, s51
+; CHECK-NEXT:    s_mov_b32 s13, s50
 ; CHECK-NEXT:    s_mov_b32 s14, s33
 ; CHECK-NEXT:    ds_write_b32 v45, v45 offset:15360
 ; CHECK-NEXT:    s_swappc_b64 s[30:31], s[16:17]
@@ -102,22 +102,22 @@ define protected amdgpu_kernel void @kernel_round1(ptr addrspace(1) nocapture no
 ; CHECK-NEXT:    s_getpc_b64 s[16:17]
 ; CHECK-NEXT:    s_add_u32 s16, s16, _Z3minjj at rel32@lo+4
 ; CHECK-NEXT:    s_addc_u32 s17, s17, _Z3minjj at rel32@hi+12
-; CHECK-NEXT:    s_mov_b64 s[4:5], s[40:41]
+; CHECK-NEXT:    s_mov_b64 s[4:5], s[48:49]
 ; CHECK-NEXT:    v_and_b32_e32 v0, 0x7ffffffc, v0
 ; CHECK-NEXT:    v_and_b32_e32 v1, 28, v1
 ; CHECK-NEXT:    s_mov_b64 s[6:7], s[38:39]
-; CHECK-NEXT:    s_mov_b64 s[8:9], s[44:45]
+; CHECK-NEXT:    s_mov_b64 s[8:9], s[52:53]
 ; CHECK-NEXT:    s_mov_b64 s[10:11], s[36:37]
-; CHECK-NEXT:    global_load_dword v0, v0, s[52:53]
-; CHECK-NEXT:    s_mov_b32 s12, s43
-; CHECK-NEXT:    s_mov_b32 s13, s42
+; CHECK-NEXT:    global_load_dword v0, v0, s[68:69]
+; CHECK-NEXT:    s_mov_b32 s12, s51
+; CHECK-NEXT:    s_mov_b32 s13, s50
 ; CHECK-NEXT:    s_mov_b32 s14, s33
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
 ; CHECK-NEXT:    v_bfe_u32 v0, v0, v1, 4
 ; CHECK-NEXT:    v_mov_b32_e32 v1, 12
 ; CHECK-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; CHECK-NEXT:    v_mov_b32_e32 v42, v0
-; CHECK-NEXT:    s_mov_b32 s44, exec_lo
+; CHECK-NEXT:    s_mov_b32 s52, exec_lo
 ; CHECK-NEXT:    v_cmpx_ne_u32_e32 0, v42
 ; CHECK-NEXT:    s_cbranch_execz .LBB0_25
 ; CHECK-NEXT:  ; %bb.1: ; %.preheader5
@@ -136,7 +136,7 @@ define protected amdgpu_kernel void @kernel_round1(ptr addrspace(1) nocapture no
 ; CHECK-NEXT:  ; %bb.3:
 ; CHECK-NEXT:    s_or_b32 exec_lo, exec_lo, s4
 ; CHECK-NEXT:    v_add_nc_u32_e32 v45, -1, v42
-; CHECK-NEXT:    s_mov_b32 s45, 0
+; CHECK-NEXT:    s_mov_b32 s53, 0
 ; CHECK-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v45
 ; CHECK-NEXT:    s_and_b32 exec_lo, exec_lo, vcc_lo
 ; CHECK-NEXT:    s_cbranch_execz .LBB0_25
@@ -144,46 +144,46 @@ define protected amdgpu_kernel void @kernel_round1(ptr addrspace(1) nocapture no
 ; CHECK-NEXT:    v_lshlrev_b32_e32 v43, 10, v43
 ; CHECK-NEXT:    v_add_nc_u32_e32 v46, 0x3c05, v0
 ; CHECK-NEXT:    v_mov_b32_e32 v47, 0
-; CHECK-NEXT:    s_mov_b32 s47, 0
+; CHECK-NEXT:    s_mov_b32 s55, 0
 ; CHECK-NEXT:  .LBB0_5: ; =>This Loop Header: Depth=1
 ; CHECK-NEXT:    ; Child Loop BB0_8 Depth 2
 ; CHECK-NEXT:    ; Child Loop BB0_20 Depth 2
-; CHECK-NEXT:    v_add_nc_u32_e32 v0, s47, v44
-; CHECK-NEXT:    s_lshl_b32 s4, s47, 5
-; CHECK-NEXT:    s_add_i32 s46, s47, 1
-; CHECK-NEXT:    s_add_i32 s5, s47, 5
-; CHECK-NEXT:    v_or3_b32 v57, s4, v43, s46
+; CHECK-NEXT:    v_add_nc_u32_e32 v0, s55, v44
+; CHECK-NEXT:    s_lshl_b32 s4, s55, 5
+; CHECK-NEXT:    s_add_i32 s54, s55, 1
+; CHECK-NEXT:    s_add_i32 s5, s55, 5
+; CHECK-NEXT:    v_or3_b32 v57, s4, v43, s54
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    ds_read_u8 v56, v0
-; CHECK-NEXT:    v_mov_b32_e32 v58, s46
-; CHECK-NEXT:    s_mov_b32 s52, exec_lo
+; CHECK-NEXT:    v_mov_b32_e32 v58, s54
+; CHECK-NEXT:    s_mov_b32 s68, exec_lo
 ; CHECK-NEXT:    v_cmpx_lt_u32_e64 s5, v42
 ; CHECK-NEXT:    s_cbranch_execz .LBB0_17
 ; CHECK-NEXT:  ; %bb.6: ; %.preheader2
 ; CHECK-NEXT:    ; in Loop: Header=BB0_5 Depth=1
-; CHECK-NEXT:    s_mov_b32 s53, 0
-; CHECK-NEXT:    s_mov_b32 s56, 0
+; CHECK-NEXT:    s_mov_b32 s69, 0
+; CHECK-NEXT:    s_mov_b32 s80, 0
 ; CHECK-NEXT:    s_branch .LBB0_8
 ; CHECK-NEXT:  .LBB0_7: ; in Loop: Header=BB0_8 Depth=2
-; CHECK-NEXT:    s_or_b32 exec_lo, exec_lo, s57
-; CHECK-NEXT:    s_add_i32 s56, s56, 4
-; CHECK-NEXT:    s_add_i32 s4, s47, s56
-; CHECK-NEXT:    v_add_nc_u32_e32 v0, s56, v57
+; CHECK-NEXT:    s_or_b32 exec_lo, exec_lo, s81
+; CHECK-NEXT:    s_add_i32 s80, s80, 4
+; CHECK-NEXT:    s_add_i32 s4, s55, s80
+; CHECK-NEXT:    v_add_nc_u32_e32 v0, s80, v57
 ; CHECK-NEXT:    s_add_i32 s5, s4, 5
 ; CHECK-NEXT:    s_add_i32 s4, s4, 1
 ; CHECK-NEXT:    v_cmp_ge_u32_e32 vcc_lo, s5, v42
 ; CHECK-NEXT:    v_mov_b32_e32 v58, s4
-; CHECK-NEXT:    s_or_b32 s53, vcc_lo, s53
-; CHECK-NEXT:    s_andn2_b32 exec_lo, exec_lo, s53
+; CHECK-NEXT:    s_or_b32 s69, vcc_lo, s69
+; CHECK-NEXT:    s_andn2_b32 exec_lo, exec_lo, s69
 ; CHECK-NEXT:    s_cbranch_execz .LBB0_16
 ; CHECK-NEXT:  .LBB0_8: ; Parent Loop BB0_5 Depth=1
 ; CHECK-NEXT:    ; => This Inner Loop Header: Depth=2
-; CHECK-NEXT:    v_add_nc_u32_e32 v59, s56, v46
-; CHECK-NEXT:    v_add_nc_u32_e32 v58, s56, v57
+; CHECK-NEXT:    v_add_nc_u32_e32 v59, s80, v46
+; CHECK-NEXT:    v_add_nc_u32_e32 v58, s80, v57
 ; CHECK-NEXT:    ds_read_u8 v0, v59
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    v_cmp_eq_u16_sdwa s4, v56, v0 src0_sel:BYTE_0 src1_sel:DWORD
-; CHECK-NEXT:    s_and_saveexec_b32 s57, s4
+; CHECK-NEXT:    s_and_saveexec_b32 s81, s4
 ; CHECK-NEXT:    s_cbranch_execz .LBB0_10
 ; CHECK-NEXT:  ; %bb.9: ; in Loop: Header=BB0_8 Depth=2
 ; CHECK-NEXT:    v_mov_b32_e32 v31, v40
@@ -193,22 +193,22 @@ define protected amdgpu_kernel void @kernel_round1(ptr addrspace(1) nocapture no
 ; CHECK-NEXT:    s_getpc_b64 s[16:17]
 ; CHECK-NEXT:    s_add_u32 s16, s16, _Z10atomic_incPU3AS3Vj at rel32@lo+4
 ; CHECK-NEXT:    s_addc_u32 s17, s17, _Z10atomic_incPU3AS3Vj at rel32@hi+12
-; CHECK-NEXT:    s_mov_b64 s[4:5], s[40:41]
+; CHECK-NEXT:    s_mov_b64 s[4:5], s[48:49]
 ; CHECK-NEXT:    s_mov_b64 s[6:7], s[38:39]
 ; CHECK-NEXT:    s_mov_b64 s[10:11], s[36:37]
-; CHECK-NEXT:    s_mov_b32 s12, s43
-; CHECK-NEXT:    s_mov_b32 s13, s42
+; CHECK-NEXT:    s_mov_b32 s12, s51
+; CHECK-NEXT:    s_mov_b32 s13, s50
 ; CHECK-NEXT:    s_mov_b32 s14, s33
 ; CHECK-NEXT:    v_add_nc_u32_e32 v47, 1, v47
 ; CHECK-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; CHECK-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; CHECK-NEXT:    ds_write_b32 v0, v58
 ; CHECK-NEXT:  .LBB0_10: ; in Loop: Header=BB0_8 Depth=2
-; CHECK-NEXT:    s_or_b32 exec_lo, exec_lo, s57
+; CHECK-NEXT:    s_or_b32 exec_lo, exec_lo, s81
 ; CHECK-NEXT:    ds_read_u8 v0, v59 offset:1
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    v_cmp_eq_u16_sdwa s4, v56, v0 src0_sel:BYTE_0 src1_sel:DWORD
-; CHECK-NEXT:    s_and_saveexec_b32 s57, s4
+; CHECK-NEXT:    s_and_saveexec_b32 s81, s4
 ; CHECK-NEXT:    s_cbranch_execz .LBB0_12
 ; CHECK-NEXT:  ; %bb.11: ; in Loop: Header=BB0_8 Depth=2
 ; CHECK-NEXT:    v_mov_b32_e32 v31, v40
@@ -218,11 +218,11 @@ define protected amdgpu_kernel void @kernel_round1(ptr addrspace(1) nocapture no
 ; CHECK-NEXT:    s_getpc_b64 s[16:17]
 ; CHECK-NEXT:    s_add_u32 s16, s16, _Z10atomic_incPU3AS3Vj at rel32@lo+4
 ; CHECK-NEXT:    s_addc_u32 s17, s17, _Z10atomic_incPU3AS3Vj at rel32@hi+12
-; CHECK-NEXT:    s_mov_b64 s[4:5], s[40:41]
+; CHECK-NEXT:    s_mov_b64 s[4:5], s[48:49]
 ; CHECK-NEXT:    s_mov_b64 s[6:7], s[38:39]
 ; CHECK-NEXT:    s_mov_b64 s[10:11], s[36:37]
-; CHECK-NEXT:    s_mov_b32 s12, s43
-; CHECK-NEXT:    s_mov_b32 s13, s42
+; CHECK-NEXT:    s_mov_b32 s12, s51
+; CHECK-NEXT:    s_mov_b32 s13, s50
 ; CHECK-NEXT:    s_mov_b32 s14, s33
 ; CHECK-NEXT:    v_add_nc_u32_e32 v60, 1, v58
 ; CHECK-NEXT:    v_add_nc_u32_e32 v47, 1, v47
@@ -230,11 +230,11 @@ define protected amdgpu_kernel void @kernel_round1(ptr addrspace(1) nocapture no
 ; CHECK-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; CHECK-NEXT:    ds_write_b32 v0, v60
 ; CHECK-NEXT:  .LBB0_12: ; in Loop: Header=BB0_8 Depth=2
-; CHECK-NEXT:    s_or_b32 exec_lo, exec_lo, s57
+; CHECK-NEXT:    s_or_b32 exec_lo, exec_lo, s81
 ; CHECK-NEXT:    ds_read_u8 v0, v59 offset:2
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    v_cmp_eq_u16_sdwa s4, v56, v0 src0_sel:BYTE_0 src1_sel:DWORD
-; CHECK-NEXT:    s_and_saveexec_b32 s57, s4
+; CHECK-NEXT:    s_and_saveexec_b32 s81, s4
 ; CHECK-NEXT:    s_cbranch_execz .LBB0_14
 ; CHECK-NEXT:  ; %bb.13: ; in Loop: Header=BB0_8 Depth=2
 ; CHECK-NEXT:    v_mov_b32_e32 v31, v40
@@ -244,11 +244,11 @@ define protected amdgpu_kernel void @kernel_round1(ptr addrspace(1) nocapture no
 ; CHECK-NEXT:    s_getpc_b64 s[16:17]
 ; CHECK-NEXT:    s_add_u32 s16, s16, _Z10atomic_incPU3AS3Vj at rel32@lo+4
 ; CHECK-NEXT:    s_addc_u32 s17, s17, _Z10atomic_incPU3AS3Vj at rel32@hi+12
-; CHECK-NEXT:    s_mov_b64 s[4:5], s[40:41]
+; CHECK-NEXT:    s_mov_b64 s[4:5], s[48:49]
 ; CHECK-NEXT:    s_mov_b64 s[6:7], s[38:39]
 ; CHECK-NEXT:    s_mov_b64 s[10:11], s[36:37]
-; CHECK-NEXT:    s_mov_b32 s12, s43
-; CHECK-NEXT:    s_mov_b32 s13, s42
+; CHECK-NEXT:    s_mov_b32 s12, s51
+; CHECK-NEXT:    s_mov_b32 s13, s50
 ; CHECK-NEXT:    s_mov_b32 s14, s33
 ; CHECK-NEXT:    v_add_nc_u32_e32 v60, 2, v58
 ; CHECK-NEXT:    v_add_nc_u32_e32 v47, 1, v47
@@ -256,11 +256,11 @@ define protected amdgpu_kernel void @kernel_round1(ptr addrspace(1) nocapture no
 ; CHECK-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; CHECK-NEXT:    ds_write_b32 v0, v60
 ; CHECK-NEXT:  .LBB0_14: ; in Loop: Header=BB0_8 Depth=2
-; CHECK-NEXT:    s_or_b32 exec_lo, exec_lo, s57
+; CHECK-NEXT:    s_or_b32 exec_lo, exec_lo, s81
 ; CHECK-NEXT:    ds_read_u8 v0, v59 offset:3
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    v_cmp_eq_u16_sdwa s4, v56, v0 src0_sel:BYTE_0 src1_sel:DWORD
-; CHECK-NEXT:    s_and_saveexec_b32 s57, s4
+; CHECK-NEXT:    s_and_saveexec_b32 s81, s4
 ; CHECK-NEXT:    s_cbranch_execz .LBB0_7
 ; CHECK-NEXT:  ; %bb.15: ; in Loop: Header=BB0_8 Depth=2
 ; CHECK-NEXT:    v_mov_b32_e32 v31, v40
@@ -270,11 +270,11 @@ define protected amdgpu_kernel void @kernel_round1(ptr addrspace(1) nocapture no
 ; CHECK-NEXT:    s_getpc_b64 s[16:17]
 ; CHECK-NEXT:    s_add_u32 s16, s16, _Z10atomic_incPU3AS3Vj at rel32@lo+4
 ; CHECK-NEXT:    s_addc_u32 s17, s17, _Z10atomic_incPU3AS3Vj at rel32@hi+12
-; CHECK-NEXT:    s_mov_b64 s[4:5], s[40:41]
+; CHECK-NEXT:    s_mov_b64 s[4:5], s[48:49]
 ; CHECK-NEXT:    s_mov_b64 s[6:7], s[38:39]
 ; CHECK-NEXT:    s_mov_b64 s[10:11], s[36:37]
-; CHECK-NEXT:    s_mov_b32 s12, s43
-; CHECK-NEXT:    s_mov_b32 s13, s42
+; CHECK-NEXT:    s_mov_b32 s12, s51
+; CHECK-NEXT:    s_mov_b32 s13, s50
 ; CHECK-NEXT:    s_mov_b32 s14, s33
 ; CHECK-NEXT:    v_add_nc_u32_e32 v58, 3, v58
 ; CHECK-NEXT:    v_add_nc_u32_e32 v47, 1, v47
@@ -284,27 +284,27 @@ define protected amdgpu_kernel void @kernel_round1(ptr addrspace(1) nocapture no
 ; CHECK-NEXT:    s_branch .LBB0_7
 ; CHECK-NEXT:  .LBB0_16: ; %Flow45
 ; CHECK-NEXT:    ; in Loop: Header=BB0_5 Depth=1
-; CHECK-NEXT:    s_or_b32 exec_lo, exec_lo, s53
+; CHECK-NEXT:    s_or_b32 exec_lo, exec_lo, s69
 ; CHECK-NEXT:    v_mov_b32_e32 v57, v0
 ; CHECK-NEXT:  .LBB0_17: ; %Flow46
 ; CHECK-NEXT:    ; in Loop: Header=BB0_5 Depth=1
-; CHECK-NEXT:    s_or_b32 exec_lo, exec_lo, s52
-; CHECK-NEXT:    s_mov_b32 s47, exec_lo
+; CHECK-NEXT:    s_or_b32 exec_lo, exec_lo, s68
+; CHECK-NEXT:    s_mov_b32 s55, exec_lo
 ; CHECK-NEXT:    v_cmpx_lt_u32_e64 v58, v42
 ; CHECK-NEXT:    s_cbranch_execz .LBB0_23
 ; CHECK-NEXT:  ; %bb.18: ; %.preheader
 ; CHECK-NEXT:    ; in Loop: Header=BB0_5 Depth=1
-; CHECK-NEXT:    s_mov_b32 s52, 0
+; CHECK-NEXT:    s_mov_b32 s68, 0
 ; CHECK-NEXT:    s_inst_prefetch 0x1
 ; CHECK-NEXT:    s_branch .LBB0_20
 ; CHECK-NEXT:    .p2align 6
 ; CHECK-NEXT:  .LBB0_19: ; in Loop: Header=BB0_20 Depth=2
-; CHECK-NEXT:    s_or_b32 exec_lo, exec_lo, s53
+; CHECK-NEXT:    s_or_b32 exec_lo, exec_lo, s69
 ; CHECK-NEXT:    v_add_nc_u32_e32 v58, 1, v58
 ; CHECK-NEXT:    v_add_nc_u32_e32 v57, 1, v57
 ; CHECK-NEXT:    v_cmp_ge_u32_e32 vcc_lo, v58, v42
-; CHECK-NEXT:    s_or_b32 s52, vcc_lo, s52
-; CHECK-NEXT:    s_andn2_b32 exec_lo, exec_lo, s52
+; CHECK-NEXT:    s_or_b32 s68, vcc_lo, s68
+; CHECK-NEXT:    s_andn2_b32 exec_lo, exec_lo, s68
 ; CHECK-NEXT:    s_cbranch_execz .LBB0_22
 ; CHECK-NEXT:  .LBB0_20: ; Parent Loop BB0_5 Depth=1
 ; CHECK-NEXT:    ; => This Inner Loop Header: Depth=2
@@ -312,7 +312,7 @@ define protected amdgpu_kernel void @kernel_round1(ptr addrspace(1) nocapture no
 ; CHECK-NEXT:    ds_read_u8 v0, v0
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    v_cmp_eq_u16_sdwa s4, v56, v0 src0_sel:BYTE_0 src1_sel:DWORD
-; CHECK-NEXT:    s_and_saveexec_b32 s53, s4
+; CHECK-NEXT:    s_and_saveexec_b32 s69, s4
 ; CHECK-NEXT:    s_cbranch_execz .LBB0_19
 ; CHECK-NEXT:  ; %bb.21: ; in Loop: Header=BB0_20 Depth=2
 ; CHECK-NEXT:    v_mov_b32_e32 v31, v40
@@ -322,11 +322,11 @@ define protected amdgpu_kernel void @kernel_round1(ptr addrspace(1) nocapture no
 ; CHECK-NEXT:    s_getpc_b64 s[16:17]
 ; CHECK-NEXT:    s_add_u32 s16, s16, _Z10atomic_incPU3AS3Vj at rel32@lo+4
 ; CHECK-NEXT:    s_addc_u32 s17, s17, _Z10atomic_incPU3AS3Vj at rel32@hi+12
-; CHECK-NEXT:    s_mov_b64 s[4:5], s[40:41]
+; CHECK-NEXT:    s_mov_b64 s[4:5], s[48:49]
 ; CHECK-NEXT:    s_mov_b64 s[6:7], s[38:39]
 ; CHECK-NEXT:    s_mov_b64 s[10:11], s[36:37]
-; CHECK-NEXT:    s_mov_b32 s12, s43
-; CHECK-NEXT:    s_mov_b32 s13, s42
+; CHECK-NEXT:    s_mov_b32 s12, s51
+; CHECK-NEXT:    s_mov_b32 s13, s50
 ; CHECK-NEXT:    s_mov_b32 s14, s33
 ; CHECK-NEXT:    v_add_nc_u32_e32 v47, 1, v47
 ; CHECK-NEXT:    s_swappc_b64 s[30:31], s[16:17]
@@ -336,22 +336,22 @@ define protected amdgpu_kernel void @kernel_round1(ptr addrspace(1) nocapture no
 ; CHECK-NEXT:  .LBB0_22: ; %Flow43
 ; CHECK-NEXT:    ; in Loop: Header=BB0_5 Depth=1
 ; CHECK-NEXT:    s_inst_prefetch 0x2
-; CHECK-NEXT:    s_or_b32 exec_lo, exec_lo, s52
+; CHECK-NEXT:    s_or_b32 exec_lo, exec_lo, s68
 ; CHECK-NEXT:  .LBB0_23: ; %Flow44
 ; CHECK-NEXT:    ; in Loop: Header=BB0_5 Depth=1
-; CHECK-NEXT:    s_or_b32 exec_lo, exec_lo, s47
+; CHECK-NEXT:    s_or_b32 exec_lo, exec_lo, s55
 ; CHECK-NEXT:  ; %bb.24: ; in Loop: Header=BB0_5 Depth=1
-; CHECK-NEXT:    v_cmp_ge_u32_e32 vcc_lo, s46, v45
+; CHECK-NEXT:    v_cmp_ge_u32_e32 vcc_lo, s54, v45
 ; CHECK-NEXT:    v_cmp_lt_u32_e64 s4, 59, v47
 ; CHECK-NEXT:    v_add_nc_u32_e32 v46, 1, v46
-; CHECK-NEXT:    s_mov_b32 s47, s46
+; CHECK-NEXT:    s_mov_b32 s55, s54
 ; CHECK-NEXT:    s_or_b32 s4, vcc_lo, s4
 ; CHECK-NEXT:    s_and_b32 s4, exec_lo, s4
-; CHECK-NEXT:    s_or_b32 s45, s4, s45
-; CHECK-NEXT:    s_andn2_b32 exec_lo, exec_lo, s45
+; CHECK-NEXT:    s_or_b32 s53, s4, s53
+; CHECK-NEXT:    s_andn2_b32 exec_lo, exec_lo, s53
 ; CHECK-NEXT:    s_cbranch_execnz .LBB0_5
 ; CHECK-NEXT:  .LBB0_25: ; %Flow51
-; CHECK-NEXT:    s_or_b32 exec_lo, exec_lo, s44
+; CHECK-NEXT:    s_or_b32 exec_lo, exec_lo, s52
 ; CHECK-NEXT:    v_mov_b32_e32 v31, v40
 ; CHECK-NEXT:    v_mov_b32_e32 v0, 1
 ; CHECK-NEXT:    s_add_u32 s8, s34, 40
@@ -359,11 +359,11 @@ define protected amdgpu_kernel void @kernel_round1(ptr addrspace(1) nocapture no
 ; CHECK-NEXT:    s_getpc_b64 s[16:17]
 ; CHECK-NEXT:    s_add_u32 s16, s16, _Z7barrierj at rel32@lo+4
 ; CHECK-NEXT:    s_addc_u32 s17, s17, _Z7barrierj at rel32@hi+12
-; CHECK-NEXT:    s_mov_b64 s[4:5], s[40:41]
+; CHECK-NEXT:    s_mov_b64 s[4:5], s[48:49]
 ; CHECK-NEXT:    s_mov_b64 s[6:7], s[38:39]
 ; CHECK-NEXT:    s_mov_b64 s[10:11], s[36:37]
-; CHECK-NEXT:    s_mov_b32 s12, s43
-; CHECK-NEXT:    s_mov_b32 s13, s42
+; CHECK-NEXT:    s_mov_b32 s12, s51
+; CHECK-NEXT:    s_mov_b32 s13, s50
 ; CHECK-NEXT:    s_mov_b32 s14, s33
 ; CHECK-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; CHECK-NEXT:    v_mov_b32_e32 v0, 0
@@ -373,10 +373,10 @@ define protected amdgpu_kernel void @kernel_round1(ptr addrspace(1) nocapture no
 ; CHECK-NEXT:    v_cmpx_gt_u32_e64 v47, v41
 ; CHECK-NEXT:    s_cbranch_execz .LBB0_33
 ; CHECK-NEXT:  ; %bb.26:
-; CHECK-NEXT:    s_mov_b32 s44, 0
+; CHECK-NEXT:    s_mov_b32 s52, 0
 ; CHECK-NEXT:    s_branch .LBB0_28
 ; CHECK-NEXT:  .LBB0_27: ; in Loop: Header=BB0_28 Depth=1
-; CHECK-NEXT:    s_or_b32 exec_lo, exec_lo, s45
+; CHECK-NEXT:    s_or_b32 exec_lo, exec_lo, s53
 ; CHECK-NEXT:    v_mov_b32_e32 v31, v40
 ; CHECK-NEXT:    v_mov_b32_e32 v0, 0
 ; CHECK-NEXT:    s_add_u32 s8, s34, 40
@@ -384,21 +384,21 @@ define protected amdgpu_kernel void @kernel_round1(ptr addrspace(1) nocapture no
 ; CHECK-NEXT:    s_getpc_b64 s[16:17]
 ; CHECK-NEXT:    s_add_u32 s16, s16, _Z14get_local_sizej at rel32@lo+4
 ; CHECK-NEXT:    s_addc_u32 s17, s17, _Z14get_local_sizej at rel32@hi+12
-; CHECK-NEXT:    s_mov_b64 s[4:5], s[40:41]
+; CHECK-NEXT:    s_mov_b64 s[4:5], s[48:49]
 ; CHECK-NEXT:    s_mov_b64 s[6:7], s[38:39]
 ; CHECK-NEXT:    s_mov_b64 s[10:11], s[36:37]
-; CHECK-NEXT:    s_mov_b32 s12, s43
-; CHECK-NEXT:    s_mov_b32 s13, s42
+; CHECK-NEXT:    s_mov_b32 s12, s51
+; CHECK-NEXT:    s_mov_b32 s13, s50
 ; CHECK-NEXT:    s_mov_b32 s14, s33
 ; CHECK-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; CHECK-NEXT:    v_add_co_u32 v41, vcc_lo, v0, v41
 ; CHECK-NEXT:    v_cmp_le_u32_e32 vcc_lo, v47, v41
-; CHECK-NEXT:    s_or_b32 s44, vcc_lo, s44
-; CHECK-NEXT:    s_andn2_b32 exec_lo, exec_lo, s44
+; CHECK-NEXT:    s_or_b32 s52, vcc_lo, s52
+; CHECK-NEXT:    s_andn2_b32 exec_lo, exec_lo, s52
 ; CHECK-NEXT:    s_cbranch_execz .LBB0_33
 ; CHECK-NEXT:  .LBB0_28: ; =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    v_lshlrev_b32_e32 v0, 2, v41
-; CHECK-NEXT:    s_mov_b32 s45, exec_lo
+; CHECK-NEXT:    s_mov_b32 s53, exec_lo
 ; CHECK-NEXT:    ds_read_b32 v0, v0
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    v_lshrrev_b32_e32 v63, 10, v0
@@ -407,8 +407,8 @@ define protected amdgpu_kernel void @kernel_round1(ptr addrspace(1) nocapture no
 ; CHECK-NEXT:    v_mul_u32_u24_e32 v1, 0x180, v63
 ; CHECK-NEXT:    v_lshlrev_b32_e32 v0, 5, v62
 ; CHECK-NEXT:    v_lshlrev_b32_e32 v4, 5, v72
-; CHECK-NEXT:    v_add_co_u32 v2, s4, s48, v1
-; CHECK-NEXT:    v_add_co_ci_u32_e64 v3, null, s49, 0, s4
+; CHECK-NEXT:    v_add_co_u32 v2, s4, s64, v1
+; CHECK-NEXT:    v_add_co_ci_u32_e64 v3, null, s65, 0, s4
 ; CHECK-NEXT:    v_add_co_u32 v0, vcc_lo, v2, v0
 ; CHECK-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v3, vcc_lo
 ; CHECK-NEXT:    v_add_co_u32 v2, vcc_lo, v2, v4
@@ -445,8 +445,8 @@ define protected amdgpu_kernel void @kernel_round1(ptr addrspace(1) nocapture no
 ; CHECK-NEXT:    v_or3_b32 v2, v3, v2, v4
 ; CHECK-NEXT:    s_mov_b64 s[6:7], s[38:39]
 ; CHECK-NEXT:    s_mov_b64 s[10:11], s[36:37]
-; CHECK-NEXT:    s_mov_b32 s12, s43
-; CHECK-NEXT:    s_mov_b32 s13, s42
+; CHECK-NEXT:    s_mov_b32 s12, s51
+; CHECK-NEXT:    s_mov_b32 s13, s50
 ; CHECK-NEXT:    v_or3_b32 v73, v2, v0, v1
 ; CHECK-NEXT:    s_mov_b32 s14, s33
 ; CHECK-NEXT:    v_lshrrev_b32_e32 v0, 1, v73
@@ -454,11 +454,11 @@ define protected amdgpu_kernel void @kernel_round1(ptr addrspace(1) nocapture no
 ; CHECK-NEXT:    v_and_b32_e32 v0, 0x7fffc, v0
 ; CHECK-NEXT:    v_lshlrev_b32_e64 v44, v1, 1
 ; CHECK-NEXT:    v_and_b32_e32 v74, 28, v1
-; CHECK-NEXT:    v_add_co_u32 v42, s4, s54, v0
-; CHECK-NEXT:    v_add_co_ci_u32_e64 v43, null, s55, 0, s4
+; CHECK-NEXT:    v_add_co_u32 v42, s4, s70, v0
+; CHECK-NEXT:    v_add_co_ci_u32_e64 v43, null, s71, 0, s4
 ; CHECK-NEXT:    v_mov_b32_e32 v2, v44
 ; CHECK-NEXT:    v_mov_b32_e32 v0, v42
-; CHECK-NEXT:    s_mov_b64 s[4:5], s[40:41]
+; CHECK-NEXT:    s_mov_b64 s[4:5], s[48:49]
 ; CHECK-NEXT:    v_mov_b32_e32 v1, v43
 ; CHECK-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; CHECK-NEXT:    v_bfe_u32 v0, v0, v74, 4
@@ -469,7 +469,7 @@ define protected amdgpu_kernel void @kernel_round1(ptr addrspace(1) nocapture no
 ; CHECK-NEXT:  ; %bb.30: ; in Loop: Header=BB0_28 Depth=1
 ; CHECK-NEXT:    v_xor_b32_e32 v4, v60, v58
 ; CHECK-NEXT:    v_lshrrev_b64 v[2:3], 16, v[56:57]
-; CHECK-NEXT:    v_mad_u64_u32 v[6:7], null, 0x180, v73, s[50:51]
+; CHECK-NEXT:    v_mad_u64_u32 v[6:7], null, 0x180, v73, s[66:67]
 ; CHECK-NEXT:    v_lshlrev_b32_e32 v10, 5, v0
 ; CHECK-NEXT:    v_lshlrev_b32_e32 v1, 16, v4
 ; CHECK-NEXT:    v_lshlrev_b32_e32 v8, 6, v72
@@ -503,11 +503,11 @@ define protected amdgpu_kernel void @kernel_round1(ptr addrspace(1) nocapture no
 ; CHECK-NEXT:    s_getpc_b64 s[16:17]
 ; CHECK-NEXT:    s_add_u32 s16, s16, _Z10atomic_subPU3AS1Vjj at rel32@lo+4
 ; CHECK-NEXT:    s_addc_u32 s17, s17, _Z10atomic_subPU3AS1Vjj at rel32@hi+12
-; CHECK-NEXT:    s_mov_b64 s[4:5], s[40:41]
+; CHECK-NEXT:    s_mov_b64 s[4:5], s[48:49]
 ; CHECK-NEXT:    s_mov_b64 s[6:7], s[38:39]
 ; CHECK-NEXT:    s_mov_b64 s[10:11], s[36:37]
-; CHECK-NEXT:    s_mov_b32 s12, s43
-; CHECK-NEXT:    s_mov_b32 s13, s42
+; CHECK-NEXT:    s_mov_b32 s12, s51
+; CHECK-NEXT:    s_mov_b32 s13, s50
 ; CHECK-NEXT:    s_mov_b32 s14, s33
 ; CHECK-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; CHECK-NEXT:    s_branch .LBB0_27
@@ -792,28 +792,28 @@ define protected amdgpu_kernel void @kernel_round1_short(ptr addrspace(1) nocapt
 ; CHECK-NEXT:    s_addc_u32 s13, s13, 0
 ; CHECK-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
 ; CHECK-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
-; CHECK-NEXT:    s_load_dwordx2 s[46:47], s[8:9], 0x10
+; CHECK-NEXT:    s_load_dwordx2 s[54:55], s[8:9], 0x10
 ; CHECK-NEXT:    s_add_u32 s0, s0, s17
 ; CHECK-NEXT:    s_mov_b64 s[38:39], s[8:9]
 ; CHECK-NEXT:    s_addc_u32 s1, s1, 0
 ; CHECK-NEXT:    v_mov_b32_e32 v40, v0
-; CHECK-NEXT:    s_add_u32 s44, s38, 40
+; CHECK-NEXT:    s_add_u32 s52, s38, 40
 ; CHECK-NEXT:    v_mov_b32_e32 v31, v0
 ; CHECK-NEXT:    v_mov_b32_e32 v0, 0
 ; CHECK-NEXT:    s_mov_b32 s33, s16
-; CHECK-NEXT:    s_addc_u32 s45, s39, 0
-; CHECK-NEXT:    s_mov_b32 s43, s14
+; CHECK-NEXT:    s_addc_u32 s53, s39, 0
+; CHECK-NEXT:    s_mov_b32 s51, s14
 ; CHECK-NEXT:    s_getpc_b64 s[16:17]
 ; CHECK-NEXT:    s_add_u32 s16, s16, _Z13get_global_idj at rel32@lo+4
 ; CHECK-NEXT:    s_addc_u32 s17, s17, _Z13get_global_idj at rel32@hi+12
-; CHECK-NEXT:    s_mov_b64 s[8:9], s[44:45]
+; CHECK-NEXT:    s_mov_b64 s[8:9], s[52:53]
 ; CHECK-NEXT:    s_mov_b32 s12, s14
 ; CHECK-NEXT:    s_mov_b32 s13, s15
 ; CHECK-NEXT:    s_mov_b32 s14, s33
-; CHECK-NEXT:    s_mov_b32 s42, s15
+; CHECK-NEXT:    s_mov_b32 s50, s15
 ; CHECK-NEXT:    s_mov_b64 s[34:35], s[10:11]
 ; CHECK-NEXT:    s_mov_b64 s[36:37], s[6:7]
-; CHECK-NEXT:    s_mov_b64 s[40:41], s[4:5]
+; CHECK-NEXT:    s_mov_b64 s[48:49], s[4:5]
 ; CHECK-NEXT:    v_mov_b32_e32 v43, 0
 ; CHECK-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; CHECK-NEXT:    v_mov_b32_e32 v42, v0
@@ -822,12 +822,12 @@ define protected amdgpu_kernel void @kernel_round1_short(ptr addrspace(1) nocapt
 ; CHECK-NEXT:    s_getpc_b64 s[16:17]
 ; CHECK-NEXT:    s_add_u32 s16, s16, _Z12get_local_idj at rel32@lo+4
 ; CHECK-NEXT:    s_addc_u32 s17, s17, _Z12get_local_idj at rel32@hi+12
-; CHECK-NEXT:    s_mov_b64 s[4:5], s[40:41]
+; CHECK-NEXT:    s_mov_b64 s[4:5], s[48:49]
 ; CHECK-NEXT:    s_mov_b64 s[6:7], s[36:37]
-; CHECK-NEXT:    s_mov_b64 s[8:9], s[44:45]
+; CHECK-NEXT:    s_mov_b64 s[8:9], s[52:53]
 ; CHECK-NEXT:    s_mov_b64 s[10:11], s[34:35]
-; CHECK-NEXT:    s_mov_b32 s12, s43
-; CHECK-NEXT:    s_mov_b32 s13, s42
+; CHECK-NEXT:    s_mov_b32 s12, s51
+; CHECK-NEXT:    s_mov_b32 s13, s50
 ; CHECK-NEXT:    s_mov_b32 s14, s33
 ; CHECK-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; CHECK-NEXT:    v_mul_lo_u32 v46, v0, 14
@@ -836,12 +836,12 @@ define protected amdgpu_kernel void @kernel_round1_short(ptr addrspace(1) nocapt
 ; CHECK-NEXT:    s_getpc_b64 s[16:17]
 ; CHECK-NEXT:    s_add_u32 s16, s16, _Z7barrierj at rel32@lo+4
 ; CHECK-NEXT:    s_addc_u32 s17, s17, _Z7barrierj at rel32@hi+12
-; CHECK-NEXT:    s_mov_b64 s[4:5], s[40:41]
+; CHECK-NEXT:    s_mov_b64 s[4:5], s[48:49]
 ; CHECK-NEXT:    s_mov_b64 s[6:7], s[36:37]
-; CHECK-NEXT:    s_mov_b64 s[8:9], s[44:45]
+; CHECK-NEXT:    s_mov_b64 s[8:9], s[52:53]
 ; CHECK-NEXT:    s_mov_b64 s[10:11], s[34:35]
-; CHECK-NEXT:    s_mov_b32 s12, s43
-; CHECK-NEXT:    s_mov_b32 s13, s42
+; CHECK-NEXT:    s_mov_b32 s12, s51
+; CHECK-NEXT:    s_mov_b32 s13, s50
 ; CHECK-NEXT:    s_mov_b32 s14, s33
 ; CHECK-NEXT:    ds_write_b32 v43, v43 offset:15360
 ; CHECK-NEXT:    v_add_nc_u32_e32 v44, 0x3c04, v46
@@ -852,15 +852,15 @@ define protected amdgpu_kernel void @kernel_round1_short(ptr addrspace(1) nocapt
 ; CHECK-NEXT:    s_getpc_b64 s[16:17]
 ; CHECK-NEXT:    s_add_u32 s16, s16, _Z3minjj at rel32@lo+4
 ; CHECK-NEXT:    s_addc_u32 s17, s17, _Z3minjj at rel32@hi+12
-; CHECK-NEXT:    s_mov_b64 s[4:5], s[40:41]
+; CHECK-NEXT:    s_mov_b64 s[4:5], s[48:49]
 ; CHECK-NEXT:    v_and_b32_e32 v0, 0x7ffffffc, v0
 ; CHECK-NEXT:    v_and_b32_e32 v1, 28, v1
 ; CHECK-NEXT:    s_mov_b64 s[6:7], s[36:37]
-; CHECK-NEXT:    s_mov_b64 s[8:9], s[44:45]
+; CHECK-NEXT:    s_mov_b64 s[8:9], s[52:53]
 ; CHECK-NEXT:    s_mov_b64 s[10:11], s[34:35]
-; CHECK-NEXT:    global_load_dword v0, v0, s[46:47]
-; CHECK-NEXT:    s_mov_b32 s12, s43
-; CHECK-NEXT:    s_mov_b32 s13, s42
+; CHECK-NEXT:    global_load_dword v0, v0, s[54:55]
+; CHECK-NEXT:    s_mov_b32 s12, s51
+; CHECK-NEXT:    s_mov_b32 s13, s50
 ; CHECK-NEXT:    s_mov_b32 s14, s33
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
 ; CHECK-NEXT:    v_bfe_u32 v0, v0, v1, 4
@@ -868,7 +868,7 @@ define protected amdgpu_kernel void @kernel_round1_short(ptr addrspace(1) nocapt
 ; CHECK-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; CHECK-NEXT:    v_mov_b32_e32 v41, v0
 ; CHECK-NEXT:    v_lshlrev_b32_e32 v42, 10, v42
-; CHECK-NEXT:    s_mov_b32 s44, 0
+; CHECK-NEXT:    s_mov_b32 s52, 0
 ; CHECK-NEXT:    s_mov_b32 s4, 0
 ; CHECK-NEXT:    ds_write_b8 v46, v43 offset:15364
 ; CHECK-NEXT:    v_add_nc_u32_e32 v45, -1, v41
@@ -878,12 +878,12 @@ define protected amdgpu_kernel void @kernel_round1_short(ptr addrspace(1) nocapt
 ; CHECK-NEXT:    ; Child Loop BB1_8 Depth 2
 ; CHECK-NEXT:    v_add_nc_u32_e32 v0, s4, v44
 ; CHECK-NEXT:    s_lshl_b32 s5, s4, 5
-; CHECK-NEXT:    s_add_i32 s45, s4, 1
+; CHECK-NEXT:    s_add_i32 s53, s4, 1
 ; CHECK-NEXT:    s_add_i32 s6, s4, 5
-; CHECK-NEXT:    v_or3_b32 v47, s5, v42, s45
+; CHECK-NEXT:    v_or3_b32 v47, s5, v42, s53
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    ds_read_u8 v46, v0
-; CHECK-NEXT:    v_mov_b32_e32 v56, s45
+; CHECK-NEXT:    v_mov_b32_e32 v56, s53
 ; CHECK-NEXT:    s_mov_b32 s5, exec_lo
 ; CHECK-NEXT:    v_cmpx_lt_u32_e64 s6, v41
 ; CHECK-NEXT:    s_cbranch_execz .LBB1_5
@@ -912,23 +912,23 @@ define protected amdgpu_kernel void @kernel_round1_short(ptr addrspace(1) nocapt
 ; CHECK-NEXT:  .LBB1_5: ; %Flow4
 ; CHECK-NEXT:    ; in Loop: Header=BB1_1 Depth=1
 ; CHECK-NEXT:    s_or_b32 exec_lo, exec_lo, s5
-; CHECK-NEXT:    s_mov_b32 s46, exec_lo
+; CHECK-NEXT:    s_mov_b32 s54, exec_lo
 ; CHECK-NEXT:    v_cmpx_lt_u32_e64 v56, v41
 ; CHECK-NEXT:    s_cbranch_execz .LBB1_11
 ; CHECK-NEXT:  ; %bb.6: ; %.103.preheader
 ; CHECK-NEXT:    ; in Loop: Header=BB1_1 Depth=1
-; CHECK-NEXT:    s_mov_b32 s47, 0
+; CHECK-NEXT:    s_mov_b32 s55, 0
 ; CHECK-NEXT:    s_inst_prefetch 0x1
 ; CHECK-NEXT:    s_branch .LBB1_8
 ; CHECK-NEXT:    .p2align 6
 ; CHECK-NEXT:  .LBB1_7: ; %.114
 ; CHECK-NEXT:    ; in Loop: Header=BB1_8 Depth=2
-; CHECK-NEXT:    s_or_b32 exec_lo, exec_lo, s48
+; CHECK-NEXT:    s_or_b32 exec_lo, exec_lo, s64
 ; CHECK-NEXT:    v_add_nc_u32_e32 v56, 1, v56
 ; CHECK-NEXT:    v_add_nc_u32_e32 v47, 1, v47
 ; CHECK-NEXT:    v_cmp_ge_u32_e32 vcc_lo, v56, v41
-; CHECK-NEXT:    s_or_b32 s47, vcc_lo, s47
-; CHECK-NEXT:    s_andn2_b32 exec_lo, exec_lo, s47
+; CHECK-NEXT:    s_or_b32 s55, vcc_lo, s55
+; CHECK-NEXT:    s_andn2_b32 exec_lo, exec_lo, s55
 ; CHECK-NEXT:    s_cbranch_execz .LBB1_10
 ; CHECK-NEXT:  .LBB1_8: ; %.103
 ; CHECK-NEXT:    ; Parent Loop BB1_1 Depth=1
@@ -937,7 +937,7 @@ define protected amdgpu_kernel void @kernel_round1_short(ptr addrspace(1) nocapt
 ; CHECK-NEXT:    ds_read_u8 v0, v0
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    v_cmp_eq_u16_sdwa s4, v46, v0 src0_sel:BYTE_0 src1_sel:DWORD
-; CHECK-NEXT:    s_and_saveexec_b32 s48, s4
+; CHECK-NEXT:    s_and_saveexec_b32 s64, s4
 ; CHECK-NEXT:    s_cbranch_execz .LBB1_7
 ; CHECK-NEXT:  ; %bb.9: ; %.110
 ; CHECK-NEXT:    ; in Loop: Header=BB1_8 Depth=2
@@ -948,11 +948,11 @@ define protected amdgpu_kernel void @kernel_round1_short(ptr addrspace(1) nocapt
 ; CHECK-NEXT:    s_getpc_b64 s[16:17]
 ; CHECK-NEXT:    s_add_u32 s16, s16, _Z10atomic_incPU3AS3Vj at rel32@lo+4
 ; CHECK-NEXT:    s_addc_u32 s17, s17, _Z10atomic_incPU3AS3Vj at rel32@hi+12
-; CHECK-NEXT:    s_mov_b64 s[4:5], s[40:41]
+; CHECK-NEXT:    s_mov_b64 s[4:5], s[48:49]
 ; CHECK-NEXT:    s_mov_b64 s[6:7], s[36:37]
 ; CHECK-NEXT:    s_mov_b64 s[10:11], s[34:35]
-; CHECK-NEXT:    s_mov_b32 s12, s43
-; CHECK-NEXT:    s_mov_b32 s13, s42
+; CHECK-NEXT:    s_mov_b32 s12, s51
+; CHECK-NEXT:    s_mov_b32 s13, s50
 ; CHECK-NEXT:    s_mov_b32 s14, s33
 ; CHECK-NEXT:    v_add_nc_u32_e32 v43, 1, v43
 ; CHECK-NEXT:    s_swappc_b64 s[30:31], s[16:17]
@@ -962,22 +962,22 @@ define protected amdgpu_kernel void @kernel_round1_short(ptr addrspace(1) nocapt
 ; CHECK-NEXT:  .LBB1_10: ; %Flow
 ; CHECK-NEXT:    ; in Loop: Header=BB1_1 Depth=1
 ; CHECK-NEXT:    s_inst_prefetch 0x2
-; CHECK-NEXT:    s_or_b32 exec_lo, exec_lo, s47
+; CHECK-NEXT:    s_or_b32 exec_lo, exec_lo, s55
 ; CHECK-NEXT:  .LBB1_11: ; %Flow2
 ; CHECK-NEXT:    ; in Loop: Header=BB1_1 Depth=1
-; CHECK-NEXT:    s_or_b32 exec_lo, exec_lo, s46
+; CHECK-NEXT:    s_or_b32 exec_lo, exec_lo, s54
 ; CHECK-NEXT:  ; %bb.12: ; %.32
 ; CHECK-NEXT:    ; in Loop: Header=BB1_1 Depth=1
-; CHECK-NEXT:    v_cmp_ge_u32_e32 vcc_lo, s45, v45
+; CHECK-NEXT:    v_cmp_ge_u32_e32 vcc_lo, s53, v45
 ; CHECK-NEXT:    v_cmp_lt_u32_e64 s4, 59, v43
 ; CHECK-NEXT:    s_or_b32 s4, vcc_lo, s4
 ; CHECK-NEXT:    s_and_b32 s4, exec_lo, s4
-; CHECK-NEXT:    s_or_b32 s44, s4, s44
-; CHECK-NEXT:    s_mov_b32 s4, s45
-; CHECK-NEXT:    s_andn2_b32 exec_lo, exec_lo, s44
+; CHECK-NEXT:    s_or_b32 s52, s4, s52
+; CHECK-NEXT:    s_mov_b32 s4, s53
+; CHECK-NEXT:    s_andn2_b32 exec_lo, exec_lo, s52
 ; CHECK-NEXT:    s_cbranch_execnz .LBB1_1
 ; CHECK-NEXT:  ; %bb.13: ; %.119
-; CHECK-NEXT:    s_or_b32 exec_lo, exec_lo, s44
+; CHECK-NEXT:    s_or_b32 exec_lo, exec_lo, s52
 ; CHECK-NEXT:    v_mov_b32_e32 v31, v40
 ; CHECK-NEXT:    v_mov_b32_e32 v0, 1
 ; CHECK-NEXT:    s_add_u32 s8, s38, 40
@@ -985,11 +985,11 @@ define protected amdgpu_kernel void @kernel_round1_short(ptr addrspace(1) nocapt
 ; CHECK-NEXT:    s_getpc_b64 s[16:17]
 ; CHECK-NEXT:    s_add_u32 s16, s16, _Z7barrierj at rel32@lo+4
 ; CHECK-NEXT:    s_addc_u32 s17, s17, _Z7barrierj at rel32@hi+12
-; CHECK-NEXT:    s_mov_b64 s[4:5], s[40:41]
+; CHECK-NEXT:    s_mov_b64 s[4:5], s[48:49]
 ; CHECK-NEXT:    s_mov_b64 s[6:7], s[36:37]
 ; CHECK-NEXT:    s_mov_b64 s[10:11], s[34:35]
-; CHECK-NEXT:    s_mov_b32 s12, s43
-; CHECK-NEXT:    s_mov_b32 s13, s42
+; CHECK-NEXT:    s_mov_b32 s12, s51
+; CHECK-NEXT:    s_mov_b32 s13, s50
 ; CHECK-NEXT:    s_mov_b32 s14, s33
 ; CHECK-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; CHECK-NEXT:    s_endpgm

diff  --git a/llvm/test/CodeGen/AMDGPU/materialize-frame-index-sgpr.gfx10.ll b/llvm/test/CodeGen/AMDGPU/materialize-frame-index-sgpr.gfx10.ll
index b4682dfb8a26d..4ca00f2daf97a 100644
--- a/llvm/test/CodeGen/AMDGPU/materialize-frame-index-sgpr.gfx10.ll
+++ b/llvm/test/CodeGen/AMDGPU/materialize-frame-index-sgpr.gfx10.ll
@@ -12,13 +12,7 @@ define void @scalar_mov_materializes_frame_index_unavailable_scc() #0 {
 ; GFX10_1-LABEL: scalar_mov_materializes_frame_index_unavailable_scc:
 ; GFX10_1:       ; %bb.0:
 ; GFX10_1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10_1-NEXT:    s_xor_saveexec_b32 s4, -1
-; GFX10_1-NEXT:    s_add_i32 s5, s32, 0x80880
-; GFX10_1-NEXT:    buffer_store_dword v1, off, s[0:3], s5 ; 4-byte Folded Spill
-; GFX10_1-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX10_1-NEXT:    s_mov_b32 exec_lo, s4
 ; GFX10_1-NEXT:    v_lshrrev_b32_e64 v0, 5, s32
-; GFX10_1-NEXT:    v_writelane_b32 v1, s59, 0
 ; GFX10_1-NEXT:    s_and_b32 s4, 0, exec_lo
 ; GFX10_1-NEXT:    v_add_nc_u32_e32 v0, 64, v0
 ; GFX10_1-NEXT:    ;;#ASMSTART
@@ -30,24 +24,12 @@ define void @scalar_mov_materializes_frame_index_unavailable_scc() #0 {
 ; GFX10_1-NEXT:    ;;#ASMSTART
 ; GFX10_1-NEXT:    ; use s59, scc
 ; GFX10_1-NEXT:    ;;#ASMEND
-; GFX10_1-NEXT:    v_readlane_b32 s59, v1, 0
-; GFX10_1-NEXT:    s_xor_saveexec_b32 s4, -1
-; GFX10_1-NEXT:    s_add_i32 s5, s32, 0x80880
-; GFX10_1-NEXT:    buffer_load_dword v1, off, s[0:3], s5 ; 4-byte Folded Reload
-; GFX10_1-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX10_1-NEXT:    s_mov_b32 exec_lo, s4
-; GFX10_1-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10_1-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10_3-LABEL: scalar_mov_materializes_frame_index_unavailable_scc:
 ; GFX10_3:       ; %bb.0:
 ; GFX10_3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10_3-NEXT:    s_xor_saveexec_b32 s4, -1
-; GFX10_3-NEXT:    s_add_i32 s5, s32, 0x80880
-; GFX10_3-NEXT:    buffer_store_dword v1, off, s[0:3], s5 ; 4-byte Folded Spill
-; GFX10_3-NEXT:    s_mov_b32 exec_lo, s4
 ; GFX10_3-NEXT:    v_lshrrev_b32_e64 v0, 5, s32
-; GFX10_3-NEXT:    v_writelane_b32 v1, s59, 0
 ; GFX10_3-NEXT:    s_and_b32 s4, 0, exec_lo
 ; GFX10_3-NEXT:    v_add_nc_u32_e32 v0, 64, v0
 ; GFX10_3-NEXT:    ;;#ASMSTART
@@ -59,23 +41,13 @@ define void @scalar_mov_materializes_frame_index_unavailable_scc() #0 {
 ; GFX10_3-NEXT:    ;;#ASMSTART
 ; GFX10_3-NEXT:    ; use s59, scc
 ; GFX10_3-NEXT:    ;;#ASMEND
-; GFX10_3-NEXT:    v_readlane_b32 s59, v1, 0
-; GFX10_3-NEXT:    s_xor_saveexec_b32 s4, -1
-; GFX10_3-NEXT:    s_add_i32 s5, s32, 0x80880
-; GFX10_3-NEXT:    buffer_load_dword v1, off, s[0:3], s5 ; 4-byte Folded Reload
-; GFX10_3-NEXT:    s_mov_b32 exec_lo, s4
-; GFX10_3-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10_3-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: scalar_mov_materializes_frame_index_unavailable_scc:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    s_xor_saveexec_b32 s0, -1
-; GFX11-NEXT:    s_add_i32 s1, s32, 0x4044
-; GFX11-NEXT:    scratch_store_b32 off, v1, s1 ; 4-byte Folded Spill
-; GFX11-NEXT:    s_mov_b32 exec_lo, s0
 ; GFX11-NEXT:    s_add_i32 s0, s32, 64
-; GFX11-NEXT:    v_writelane_b32 v1, s59, 0
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-NEXT:    s_and_b32 s0, 0, exec_lo
 ; GFX11-NEXT:    s_addc_u32 s0, s32, 0x4040
@@ -89,12 +61,6 @@ define void @scalar_mov_materializes_frame_index_unavailable_scc() #0 {
 ; GFX11-NEXT:    ;;#ASMSTART
 ; GFX11-NEXT:    ; use s59, scc
 ; GFX11-NEXT:    ;;#ASMEND
-; GFX11-NEXT:    v_readlane_b32 s59, v1, 0
-; GFX11-NEXT:    s_xor_saveexec_b32 s0, -1
-; GFX11-NEXT:    s_add_i32 s1, s32, 0x4044
-; GFX11-NEXT:    scratch_load_b32 v1, off, s1 ; 4-byte Folded Reload
-; GFX11-NEXT:    s_mov_b32 exec_lo, s0
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX12-LABEL: scalar_mov_materializes_frame_index_unavailable_scc:
@@ -104,13 +70,7 @@ define void @scalar_mov_materializes_frame_index_unavailable_scc() #0 {
 ; GFX12-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-NEXT:    s_wait_bvhcnt 0x0
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    s_xor_saveexec_b32 s0, -1
-; GFX12-NEXT:    scratch_store_b32 off, v1, s32 offset:16388 ; 4-byte Folded Spill
-; GFX12-NEXT:    s_wait_alu 0xfffe
-; GFX12-NEXT:    s_mov_b32 exec_lo, s0
-; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_and_b32 s0, 0, exec_lo
-; GFX12-NEXT:    v_writelane_b32 v1, s59, 0
 ; GFX12-NEXT:    s_add_co_ci_u32 s0, s32, 0x4000
 ; GFX12-NEXT:    v_mov_b32_e32 v0, s32
 ; GFX12-NEXT:    s_wait_alu 0xfffe
@@ -124,50 +84,30 @@ define void @scalar_mov_materializes_frame_index_unavailable_scc() #0 {
 ; GFX12-NEXT:    ;;#ASMSTART
 ; GFX12-NEXT:    ; use s59, scc
 ; GFX12-NEXT:    ;;#ASMEND
-; GFX12-NEXT:    v_readlane_b32 s59, v1, 0
-; GFX12-NEXT:    s_xor_saveexec_b32 s0, -1
-; GFX12-NEXT:    scratch_load_b32 v1, off, s32 offset:16388 ; 4-byte Folded Reload
 ; GFX12-NEXT:    s_wait_alu 0xfffe
-; GFX12-NEXT:    s_mov_b32 exec_lo, s0
-; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: scalar_mov_materializes_frame_index_unavailable_scc:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    s_xor_saveexec_b64 s[4:5], -1
-; GFX8-NEXT:    s_add_i32 s6, s32, 0x101100
-; GFX8-NEXT:    buffer_store_dword v1, off, s[0:3], s6 ; 4-byte Folded Spill
-; GFX8-NEXT:    s_mov_b64 exec, s[4:5]
 ; GFX8-NEXT:    v_lshrrev_b32_e64 v0, 6, s32
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 64, v0
-; GFX8-NEXT:    v_writelane_b32 v1, s59, 0
 ; GFX8-NEXT:    ;;#ASMSTART
 ; GFX8-NEXT:    ; use alloca0 v0
 ; GFX8-NEXT:    ;;#ASMEND
 ; GFX8-NEXT:    v_lshrrev_b32_e64 v0, 6, s32
 ; GFX8-NEXT:    s_movk_i32 s59, 0x4040
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s59, v0
-; GFX8-NEXT:    v_readfirstlane_b32 s59, v0
 ; GFX8-NEXT:    s_and_b64 s[4:5], 0, exec
+; GFX8-NEXT:    v_readfirstlane_b32 s59, v0
 ; GFX8-NEXT:    ;;#ASMSTART
 ; GFX8-NEXT:    ; use s59, scc
 ; GFX8-NEXT:    ;;#ASMEND
-; GFX8-NEXT:    v_readlane_b32 s59, v1, 0
-; GFX8-NEXT:    s_xor_saveexec_b64 s[4:5], -1
-; GFX8-NEXT:    s_add_i32 s6, s32, 0x101100
-; GFX8-NEXT:    buffer_load_dword v1, off, s[0:3], s6 ; 4-byte Folded Reload
-; GFX8-NEXT:    s_mov_b64 exec, s[4:5]
-; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-LABEL: scalar_mov_materializes_frame_index_unavailable_scc:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-NEXT:    s_xor_saveexec_b64 s[4:5], -1
-; GFX900-NEXT:    s_add_i32 s6, s32, 0x101100
-; GFX900-NEXT:    buffer_store_dword v1, off, s[0:3], s6 ; 4-byte Folded Spill
-; GFX900-NEXT:    s_mov_b64 exec, s[4:5]
 ; GFX900-NEXT:    v_lshrrev_b32_e64 v0, 6, s32
 ; GFX900-NEXT:    v_add_u32_e32 v0, 64, v0
 ; GFX900-NEXT:    ;;#ASMSTART
@@ -175,47 +115,29 @@ define void @scalar_mov_materializes_frame_index_unavailable_scc() #0 {
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    v_lshrrev_b32_e64 v0, 6, s32
 ; GFX900-NEXT:    v_add_u32_e32 v0, 0x4040, v0
-; GFX900-NEXT:    v_writelane_b32 v1, s59, 0
-; GFX900-NEXT:    v_readfirstlane_b32 s59, v0
 ; GFX900-NEXT:    s_and_b64 s[4:5], 0, exec
+; GFX900-NEXT:    v_readfirstlane_b32 s59, v0
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s59, scc
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    v_readlane_b32 s59, v1, 0
-; GFX900-NEXT:    s_xor_saveexec_b64 s[4:5], -1
-; GFX900-NEXT:    s_add_i32 s6, s32, 0x101100
-; GFX900-NEXT:    buffer_load_dword v1, off, s[0:3], s6 ; 4-byte Folded Reload
-; GFX900-NEXT:    s_mov_b64 exec, s[4:5]
-; GFX900-NEXT:    s_waitcnt vmcnt(0)
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX942-LABEL: scalar_mov_materializes_frame_index_unavailable_scc:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT:    s_xor_saveexec_b64 s[0:1], -1
-; GFX942-NEXT:    s_add_i32 s2, s32, 0x4044
-; GFX942-NEXT:    scratch_store_dword off, v1, s2 ; 4-byte Folded Spill
-; GFX942-NEXT:    s_mov_b64 exec, s[0:1]
 ; GFX942-NEXT:    s_add_i32 s0, s32, 64
 ; GFX942-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX942-NEXT:    s_and_b64 s[0:1], 0, exec
 ; GFX942-NEXT:    s_addc_u32 s0, s32, 0x4040
 ; GFX942-NEXT:    s_bitcmp1_b32 s0, 0
 ; GFX942-NEXT:    s_bitset0_b32 s0, 0
-; GFX942-NEXT:    v_writelane_b32 v1, s59, 0
-; GFX942-NEXT:    s_mov_b32 s59, s0
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use alloca0 v0
 ; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b32 s59, s0
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s59, scc
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    v_readlane_b32 s59, v1, 0
-; GFX942-NEXT:    s_xor_saveexec_b64 s[0:1], -1
-; GFX942-NEXT:    s_add_i32 s2, s32, 0x4044
-; GFX942-NEXT:    scratch_load_dword v1, off, s2 ; 4-byte Folded Reload
-; GFX942-NEXT:    s_mov_b64 exec, s[0:1]
-; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %alloca0 = alloca [4096 x i32], align 64, addrspace(5)
   %alloca1 = alloca i32, align 4, addrspace(5)
@@ -230,12 +152,6 @@ define void @scalar_mov_materializes_frame_index_dead_scc() #0 {
 ; GFX10_1-LABEL: scalar_mov_materializes_frame_index_dead_scc:
 ; GFX10_1:       ; %bb.0:
 ; GFX10_1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10_1-NEXT:    s_xor_saveexec_b32 s4, -1
-; GFX10_1-NEXT:    s_add_i32 s5, s32, 0x80880
-; GFX10_1-NEXT:    buffer_store_dword v1, off, s[0:3], s5 ; 4-byte Folded Spill
-; GFX10_1-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX10_1-NEXT:    s_mov_b32 exec_lo, s4
-; GFX10_1-NEXT:    v_writelane_b32 v1, s59, 0
 ; GFX10_1-NEXT:    v_lshrrev_b32_e64 v0, 5, s32
 ; GFX10_1-NEXT:    s_lshr_b32 s59, s32, 5
 ; GFX10_1-NEXT:    s_addk_i32 s59, 0x4040
@@ -246,23 +162,11 @@ define void @scalar_mov_materializes_frame_index_dead_scc() #0 {
 ; GFX10_1-NEXT:    ;;#ASMSTART
 ; GFX10_1-NEXT:    ; use s59
 ; GFX10_1-NEXT:    ;;#ASMEND
-; GFX10_1-NEXT:    v_readlane_b32 s59, v1, 0
-; GFX10_1-NEXT:    s_xor_saveexec_b32 s4, -1
-; GFX10_1-NEXT:    s_add_i32 s5, s32, 0x80880
-; GFX10_1-NEXT:    buffer_load_dword v1, off, s[0:3], s5 ; 4-byte Folded Reload
-; GFX10_1-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX10_1-NEXT:    s_mov_b32 exec_lo, s4
-; GFX10_1-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10_1-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10_3-LABEL: scalar_mov_materializes_frame_index_dead_scc:
 ; GFX10_3:       ; %bb.0:
 ; GFX10_3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10_3-NEXT:    s_xor_saveexec_b32 s4, -1
-; GFX10_3-NEXT:    s_add_i32 s5, s32, 0x80880
-; GFX10_3-NEXT:    buffer_store_dword v1, off, s[0:3], s5 ; 4-byte Folded Spill
-; GFX10_3-NEXT:    s_mov_b32 exec_lo, s4
-; GFX10_3-NEXT:    v_writelane_b32 v1, s59, 0
 ; GFX10_3-NEXT:    v_lshrrev_b32_e64 v0, 5, s32
 ; GFX10_3-NEXT:    s_lshr_b32 s59, s32, 5
 ; GFX10_3-NEXT:    s_addk_i32 s59, 0x4040
@@ -273,22 +177,11 @@ define void @scalar_mov_materializes_frame_index_dead_scc() #0 {
 ; GFX10_3-NEXT:    ;;#ASMSTART
 ; GFX10_3-NEXT:    ; use s59
 ; GFX10_3-NEXT:    ;;#ASMEND
-; GFX10_3-NEXT:    v_readlane_b32 s59, v1, 0
-; GFX10_3-NEXT:    s_xor_saveexec_b32 s4, -1
-; GFX10_3-NEXT:    s_add_i32 s5, s32, 0x80880
-; GFX10_3-NEXT:    buffer_load_dword v1, off, s[0:3], s5 ; 4-byte Folded Reload
-; GFX10_3-NEXT:    s_mov_b32 exec_lo, s4
-; GFX10_3-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10_3-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: scalar_mov_materializes_frame_index_dead_scc:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    s_xor_saveexec_b32 s0, -1
-; GFX11-NEXT:    s_add_i32 s1, s32, 0x4044
-; GFX11-NEXT:    scratch_store_b32 off, v1, s1 ; 4-byte Folded Spill
-; GFX11-NEXT:    s_mov_b32 exec_lo, s0
-; GFX11-NEXT:    v_writelane_b32 v1, s59, 0
 ; GFX11-NEXT:    s_add_i32 s0, s32, 64
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    v_mov_b32_e32 v0, s0
@@ -300,12 +193,6 @@ define void @scalar_mov_materializes_frame_index_dead_scc() #0 {
 ; GFX11-NEXT:    ;;#ASMSTART
 ; GFX11-NEXT:    ; use s59
 ; GFX11-NEXT:    ;;#ASMEND
-; GFX11-NEXT:    v_readlane_b32 s59, v1, 0
-; GFX11-NEXT:    s_xor_saveexec_b32 s0, -1
-; GFX11-NEXT:    s_add_i32 s1, s32, 0x4044
-; GFX11-NEXT:    scratch_load_b32 v1, off, s1 ; 4-byte Folded Reload
-; GFX11-NEXT:    s_mov_b32 exec_lo, s0
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX12-LABEL: scalar_mov_materializes_frame_index_dead_scc:
@@ -315,105 +202,62 @@ define void @scalar_mov_materializes_frame_index_dead_scc() #0 {
 ; GFX12-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-NEXT:    s_wait_bvhcnt 0x0
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    s_xor_saveexec_b32 s0, -1
-; GFX12-NEXT:    scratch_store_b32 off, v1, s32 offset:16388 ; 4-byte Folded Spill
-; GFX12-NEXT:    s_wait_alu 0xfffe
-; GFX12-NEXT:    s_mov_b32 exec_lo, s0
-; GFX12-NEXT:    v_writelane_b32 v1, s59, 0
 ; GFX12-NEXT:    s_add_co_i32 s0, s32, 0x4000
 ; GFX12-NEXT:    v_mov_b32_e32 v0, s32
-; GFX12-NEXT:    s_wait_alu 0xfffe
-; GFX12-NEXT:    s_mov_b32 s59, s0
 ; GFX12-NEXT:    ;;#ASMSTART
 ; GFX12-NEXT:    ; use alloca0 v0
 ; GFX12-NEXT:    ;;#ASMEND
+; GFX12-NEXT:    s_wait_alu 0xfffe
+; GFX12-NEXT:    s_mov_b32 s59, s0
 ; GFX12-NEXT:    ;;#ASMSTART
 ; GFX12-NEXT:    ; use s59
 ; GFX12-NEXT:    ;;#ASMEND
-; GFX12-NEXT:    v_readlane_b32 s59, v1, 0
-; GFX12-NEXT:    s_xor_saveexec_b32 s0, -1
-; GFX12-NEXT:    scratch_load_b32 v1, off, s32 offset:16388 ; 4-byte Folded Reload
 ; GFX12-NEXT:    s_wait_alu 0xfffe
-; GFX12-NEXT:    s_mov_b32 exec_lo, s0
-; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: scalar_mov_materializes_frame_index_dead_scc:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    s_xor_saveexec_b64 s[4:5], -1
-; GFX8-NEXT:    s_add_i32 s6, s32, 0x101100
-; GFX8-NEXT:    buffer_store_dword v1, off, s[0:3], s6 ; 4-byte Folded Spill
-; GFX8-NEXT:    s_mov_b64 exec, s[4:5]
-; GFX8-NEXT:    v_writelane_b32 v1, s59, 0
-; GFX8-NEXT:    s_lshr_b32 s59, s32, 6
 ; GFX8-NEXT:    v_lshrrev_b32_e64 v0, 6, s32
-; GFX8-NEXT:    s_addk_i32 s59, 0x4040
+; GFX8-NEXT:    s_lshr_b32 s59, s32, 6
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 64, v0
 ; GFX8-NEXT:    ;;#ASMSTART
 ; GFX8-NEXT:    ; use alloca0 v0
 ; GFX8-NEXT:    ;;#ASMEND
+; GFX8-NEXT:    s_addk_i32 s59, 0x4040
 ; GFX8-NEXT:    ;;#ASMSTART
 ; GFX8-NEXT:    ; use s59
 ; GFX8-NEXT:    ;;#ASMEND
-; GFX8-NEXT:    v_readlane_b32 s59, v1, 0
-; GFX8-NEXT:    s_xor_saveexec_b64 s[4:5], -1
-; GFX8-NEXT:    s_add_i32 s6, s32, 0x101100
-; GFX8-NEXT:    buffer_load_dword v1, off, s[0:3], s6 ; 4-byte Folded Reload
-; GFX8-NEXT:    s_mov_b64 exec, s[4:5]
-; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-LABEL: scalar_mov_materializes_frame_index_dead_scc:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-NEXT:    s_xor_saveexec_b64 s[4:5], -1
-; GFX900-NEXT:    s_add_i32 s6, s32, 0x101100
-; GFX900-NEXT:    buffer_store_dword v1, off, s[0:3], s6 ; 4-byte Folded Spill
-; GFX900-NEXT:    s_mov_b64 exec, s[4:5]
-; GFX900-NEXT:    v_writelane_b32 v1, s59, 0
-; GFX900-NEXT:    s_lshr_b32 s59, s32, 6
 ; GFX900-NEXT:    v_lshrrev_b32_e64 v0, 6, s32
-; GFX900-NEXT:    s_addk_i32 s59, 0x4040
+; GFX900-NEXT:    s_lshr_b32 s59, s32, 6
 ; GFX900-NEXT:    v_add_u32_e32 v0, 64, v0
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use alloca0 v0
 ; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_addk_i32 s59, 0x4040
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s59
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    v_readlane_b32 s59, v1, 0
-; GFX900-NEXT:    s_xor_saveexec_b64 s[4:5], -1
-; GFX900-NEXT:    s_add_i32 s6, s32, 0x101100
-; GFX900-NEXT:    buffer_load_dword v1, off, s[0:3], s6 ; 4-byte Folded Reload
-; GFX900-NEXT:    s_mov_b64 exec, s[4:5]
-; GFX900-NEXT:    s_waitcnt vmcnt(0)
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX942-LABEL: scalar_mov_materializes_frame_index_dead_scc:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT:    s_xor_saveexec_b64 s[0:1], -1
-; GFX942-NEXT:    s_add_i32 s2, s32, 0x4044
-; GFX942-NEXT:    scratch_store_dword off, v1, s2 ; 4-byte Folded Spill
-; GFX942-NEXT:    s_mov_b64 exec, s[0:1]
 ; GFX942-NEXT:    s_add_i32 s0, s32, 64
 ; GFX942-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX942-NEXT:    s_add_i32 s0, s32, 0x4040
-; GFX942-NEXT:    v_writelane_b32 v1, s59, 0
-; GFX942-NEXT:    s_mov_b32 s59, s0
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use alloca0 v0
 ; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b32 s59, s0
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s59
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    v_readlane_b32 s59, v1, 0
-; GFX942-NEXT:    s_xor_saveexec_b64 s[0:1], -1
-; GFX942-NEXT:    s_add_i32 s2, s32, 0x4044
-; GFX942-NEXT:    scratch_load_dword v1, off, s2 ; 4-byte Folded Reload
-; GFX942-NEXT:    s_mov_b64 exec, s[0:1]
-; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %alloca0 = alloca [4096 x i32], align 64, addrspace(5)
   %alloca1 = alloca i32, align 4, addrspace(5)
@@ -428,14 +272,8 @@ define void @scalar_mov_materializes_frame_index_unavailable_scc_fp() #1 {
 ; GFX10_1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10_1-NEXT:    s_mov_b32 s5, s33
 ; GFX10_1-NEXT:    s_mov_b32 s33, s32
-; GFX10_1-NEXT:    s_xor_saveexec_b32 s4, -1
-; GFX10_1-NEXT:    s_add_i32 s6, s33, 0x80880
-; GFX10_1-NEXT:    buffer_store_dword v1, off, s[0:3], s6 ; 4-byte Folded Spill
-; GFX10_1-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX10_1-NEXT:    s_mov_b32 exec_lo, s4
-; GFX10_1-NEXT:    v_lshrrev_b32_e64 v0, 5, s33
-; GFX10_1-NEXT:    v_writelane_b32 v1, s59, 0
 ; GFX10_1-NEXT:    s_add_i32 s32, s32, 0x81000
+; GFX10_1-NEXT:    v_lshrrev_b32_e64 v0, 5, s33
 ; GFX10_1-NEXT:    s_and_b32 s4, 0, exec_lo
 ; GFX10_1-NEXT:    s_mov_b32 s32, s33
 ; GFX10_1-NEXT:    v_add_nc_u32_e32 v0, 64, v0
@@ -443,19 +281,12 @@ define void @scalar_mov_materializes_frame_index_unavailable_scc_fp() #1 {
 ; GFX10_1-NEXT:    ; use alloca0 v0
 ; GFX10_1-NEXT:    ;;#ASMEND
 ; GFX10_1-NEXT:    v_lshrrev_b32_e64 v0, 5, s33
+; GFX10_1-NEXT:    s_mov_b32 s33, s5
 ; GFX10_1-NEXT:    v_add_nc_u32_e32 v0, 0x4040, v0
 ; GFX10_1-NEXT:    v_readfirstlane_b32 s59, v0
 ; GFX10_1-NEXT:    ;;#ASMSTART
 ; GFX10_1-NEXT:    ; use s59, scc
 ; GFX10_1-NEXT:    ;;#ASMEND
-; GFX10_1-NEXT:    v_readlane_b32 s59, v1, 0
-; GFX10_1-NEXT:    s_xor_saveexec_b32 s4, -1
-; GFX10_1-NEXT:    s_add_i32 s6, s33, 0x80880
-; GFX10_1-NEXT:    buffer_load_dword v1, off, s[0:3], s6 ; 4-byte Folded Reload
-; GFX10_1-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX10_1-NEXT:    s_mov_b32 exec_lo, s4
-; GFX10_1-NEXT:    s_mov_b32 s33, s5
-; GFX10_1-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10_1-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10_3-LABEL: scalar_mov_materializes_frame_index_unavailable_scc_fp:
@@ -463,13 +294,8 @@ define void @scalar_mov_materializes_frame_index_unavailable_scc_fp() #1 {
 ; GFX10_3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10_3-NEXT:    s_mov_b32 s5, s33
 ; GFX10_3-NEXT:    s_mov_b32 s33, s32
-; GFX10_3-NEXT:    s_xor_saveexec_b32 s4, -1
-; GFX10_3-NEXT:    s_add_i32 s6, s33, 0x80880
-; GFX10_3-NEXT:    buffer_store_dword v1, off, s[0:3], s6 ; 4-byte Folded Spill
-; GFX10_3-NEXT:    s_mov_b32 exec_lo, s4
-; GFX10_3-NEXT:    v_lshrrev_b32_e64 v0, 5, s33
-; GFX10_3-NEXT:    v_writelane_b32 v1, s59, 0
 ; GFX10_3-NEXT:    s_add_i32 s32, s32, 0x81000
+; GFX10_3-NEXT:    v_lshrrev_b32_e64 v0, 5, s33
 ; GFX10_3-NEXT:    s_and_b32 s4, 0, exec_lo
 ; GFX10_3-NEXT:    s_mov_b32 s32, s33
 ; GFX10_3-NEXT:    v_add_nc_u32_e32 v0, 64, v0
@@ -477,18 +303,12 @@ define void @scalar_mov_materializes_frame_index_unavailable_scc_fp() #1 {
 ; GFX10_3-NEXT:    ; use alloca0 v0
 ; GFX10_3-NEXT:    ;;#ASMEND
 ; GFX10_3-NEXT:    v_lshrrev_b32_e64 v0, 5, s33
+; GFX10_3-NEXT:    s_mov_b32 s33, s5
 ; GFX10_3-NEXT:    v_add_nc_u32_e32 v0, 0x4040, v0
 ; GFX10_3-NEXT:    v_readfirstlane_b32 s59, v0
 ; GFX10_3-NEXT:    ;;#ASMSTART
 ; GFX10_3-NEXT:    ; use s59, scc
 ; GFX10_3-NEXT:    ;;#ASMEND
-; GFX10_3-NEXT:    v_readlane_b32 s59, v1, 0
-; GFX10_3-NEXT:    s_xor_saveexec_b32 s4, -1
-; GFX10_3-NEXT:    s_add_i32 s6, s33, 0x80880
-; GFX10_3-NEXT:    buffer_load_dword v1, off, s[0:3], s6 ; 4-byte Folded Reload
-; GFX10_3-NEXT:    s_mov_b32 exec_lo, s4
-; GFX10_3-NEXT:    s_mov_b32 s33, s5
-; GFX10_3-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10_3-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: scalar_mov_materializes_frame_index_unavailable_scc_fp:
@@ -496,13 +316,9 @@ define void @scalar_mov_materializes_frame_index_unavailable_scc_fp() #1 {
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_mov_b32 s1, s33
 ; GFX11-NEXT:    s_mov_b32 s33, s32
-; GFX11-NEXT:    s_xor_saveexec_b32 s0, -1
-; GFX11-NEXT:    s_add_i32 s2, s33, 0x4044
-; GFX11-NEXT:    scratch_store_b32 off, v1, s2 ; 4-byte Folded Spill
-; GFX11-NEXT:    s_mov_b32 exec_lo, s0
 ; GFX11-NEXT:    s_addk_i32 s32, 0x4080
 ; GFX11-NEXT:    s_add_i32 s0, s33, 64
-; GFX11-NEXT:    v_writelane_b32 v1, s59, 0
+; GFX11-NEXT:    s_mov_b32 s32, s33
 ; GFX11-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-NEXT:    s_and_b32 s0, 0, exec_lo
 ; GFX11-NEXT:    s_addc_u32 s0, s33, 0x4040
@@ -511,18 +327,11 @@ define void @scalar_mov_materializes_frame_index_unavailable_scc_fp() #1 {
 ; GFX11-NEXT:    ;;#ASMEND
 ; GFX11-NEXT:    s_bitcmp1_b32 s0, 0
 ; GFX11-NEXT:    s_bitset0_b32 s0, 0
-; GFX11-NEXT:    s_mov_b32 s32, s33
+; GFX11-NEXT:    s_mov_b32 s33, s1
 ; GFX11-NEXT:    s_mov_b32 s59, s0
 ; GFX11-NEXT:    ;;#ASMSTART
 ; GFX11-NEXT:    ; use s59, scc
 ; GFX11-NEXT:    ;;#ASMEND
-; GFX11-NEXT:    v_readlane_b32 s59, v1, 0
-; GFX11-NEXT:    s_xor_saveexec_b32 s0, -1
-; GFX11-NEXT:    s_add_i32 s2, s33, 0x4044
-; GFX11-NEXT:    scratch_load_b32 v1, off, s2 ; 4-byte Folded Reload
-; GFX11-NEXT:    s_mov_b32 exec_lo, s0
-; GFX11-NEXT:    s_mov_b32 s33, s1
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX12-LABEL: scalar_mov_materializes_frame_index_unavailable_scc_fp:
@@ -534,13 +343,9 @@ define void @scalar_mov_materializes_frame_index_unavailable_scc_fp() #1 {
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    s_mov_b32 s1, s33
 ; GFX12-NEXT:    s_mov_b32 s33, s32
-; GFX12-NEXT:    s_xor_saveexec_b32 s0, -1
-; GFX12-NEXT:    scratch_store_b32 off, v1, s33 offset:16388 ; 4-byte Folded Spill
-; GFX12-NEXT:    s_wait_alu 0xfffe
-; GFX12-NEXT:    s_mov_b32 exec_lo, s0
 ; GFX12-NEXT:    s_addk_co_i32 s32, 0x4040
 ; GFX12-NEXT:    s_and_b32 s0, 0, exec_lo
-; GFX12-NEXT:    v_writelane_b32 v1, s59, 0
+; GFX12-NEXT:    s_wait_alu 0xfffe
 ; GFX12-NEXT:    s_add_co_ci_u32 s0, s33, 0x4000
 ; GFX12-NEXT:    v_mov_b32_e32 v0, s33
 ; GFX12-NEXT:    s_wait_alu 0xfffe
@@ -554,14 +359,8 @@ define void @scalar_mov_materializes_frame_index_unavailable_scc_fp() #1 {
 ; GFX12-NEXT:    ;;#ASMSTART
 ; GFX12-NEXT:    ; use s59, scc
 ; GFX12-NEXT:    ;;#ASMEND
-; GFX12-NEXT:    v_readlane_b32 s59, v1, 0
 ; GFX12-NEXT:    s_mov_b32 s32, s33
-; GFX12-NEXT:    s_xor_saveexec_b32 s0, -1
-; GFX12-NEXT:    scratch_load_b32 v1, off, s33 offset:16388 ; 4-byte Folded Reload
-; GFX12-NEXT:    s_wait_alu 0xfffe
-; GFX12-NEXT:    s_mov_b32 exec_lo, s0
 ; GFX12-NEXT:    s_mov_b32 s33, s1
-; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    s_wait_alu 0xfffe
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -570,33 +369,22 @@ define void @scalar_mov_materializes_frame_index_unavailable_scc_fp() #1 {
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_mov_b32 s6, s33
 ; GFX8-NEXT:    s_mov_b32 s33, s32
-; GFX8-NEXT:    s_xor_saveexec_b64 s[4:5], -1
-; GFX8-NEXT:    s_add_i32 s7, s33, 0x101100
-; GFX8-NEXT:    buffer_store_dword v1, off, s[0:3], s7 ; 4-byte Folded Spill
-; GFX8-NEXT:    s_mov_b64 exec, s[4:5]
 ; GFX8-NEXT:    v_lshrrev_b32_e64 v0, 6, s33
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 64, v0
-; GFX8-NEXT:    v_writelane_b32 v1, s59, 0
 ; GFX8-NEXT:    ;;#ASMSTART
 ; GFX8-NEXT:    ; use alloca0 v0
 ; GFX8-NEXT:    ;;#ASMEND
 ; GFX8-NEXT:    v_lshrrev_b32_e64 v0, 6, s33
 ; GFX8-NEXT:    s_movk_i32 s59, 0x4040
-; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s59, v0
 ; GFX8-NEXT:    s_add_i32 s32, s32, 0x102000
-; GFX8-NEXT:    v_readfirstlane_b32 s59, v0
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s59, v0
 ; GFX8-NEXT:    s_and_b64 s[4:5], 0, exec
+; GFX8-NEXT:    v_readfirstlane_b32 s59, v0
 ; GFX8-NEXT:    ;;#ASMSTART
 ; GFX8-NEXT:    ; use s59, scc
 ; GFX8-NEXT:    ;;#ASMEND
-; GFX8-NEXT:    v_readlane_b32 s59, v1, 0
 ; GFX8-NEXT:    s_mov_b32 s32, s33
-; GFX8-NEXT:    s_xor_saveexec_b64 s[4:5], -1
-; GFX8-NEXT:    s_add_i32 s7, s33, 0x101100
-; GFX8-NEXT:    buffer_load_dword v1, off, s[0:3], s7 ; 4-byte Folded Reload
-; GFX8-NEXT:    s_mov_b64 exec, s[4:5]
 ; GFX8-NEXT:    s_mov_b32 s33, s6
-; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-LABEL: scalar_mov_materializes_frame_index_unavailable_scc_fp:
@@ -604,32 +392,21 @@ define void @scalar_mov_materializes_frame_index_unavailable_scc_fp() #1 {
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    s_mov_b32 s6, s33
 ; GFX900-NEXT:    s_mov_b32 s33, s32
-; GFX900-NEXT:    s_xor_saveexec_b64 s[4:5], -1
-; GFX900-NEXT:    s_add_i32 s7, s33, 0x101100
-; GFX900-NEXT:    buffer_store_dword v1, off, s[0:3], s7 ; 4-byte Folded Spill
-; GFX900-NEXT:    s_mov_b64 exec, s[4:5]
 ; GFX900-NEXT:    v_lshrrev_b32_e64 v0, 6, s33
 ; GFX900-NEXT:    v_add_u32_e32 v0, 64, v0
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use alloca0 v0
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    v_lshrrev_b32_e64 v0, 6, s33
-; GFX900-NEXT:    v_add_u32_e32 v0, 0x4040, v0
 ; GFX900-NEXT:    s_add_i32 s32, s32, 0x102000
-; GFX900-NEXT:    v_writelane_b32 v1, s59, 0
-; GFX900-NEXT:    v_readfirstlane_b32 s59, v0
+; GFX900-NEXT:    v_add_u32_e32 v0, 0x4040, v0
 ; GFX900-NEXT:    s_and_b64 s[4:5], 0, exec
+; GFX900-NEXT:    v_readfirstlane_b32 s59, v0
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s59, scc
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    v_readlane_b32 s59, v1, 0
 ; GFX900-NEXT:    s_mov_b32 s32, s33
-; GFX900-NEXT:    s_xor_saveexec_b64 s[4:5], -1
-; GFX900-NEXT:    s_add_i32 s7, s33, 0x101100
-; GFX900-NEXT:    buffer_load_dword v1, off, s[0:3], s7 ; 4-byte Folded Reload
-; GFX900-NEXT:    s_mov_b64 exec, s[4:5]
 ; GFX900-NEXT:    s_mov_b32 s33, s6
-; GFX900-NEXT:    s_waitcnt vmcnt(0)
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX942-LABEL: scalar_mov_materializes_frame_index_unavailable_scc_fp:
@@ -637,10 +414,6 @@ define void @scalar_mov_materializes_frame_index_unavailable_scc_fp() #1 {
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    s_mov_b32 s2, s33
 ; GFX942-NEXT:    s_mov_b32 s33, s32
-; GFX942-NEXT:    s_xor_saveexec_b64 s[0:1], -1
-; GFX942-NEXT:    s_add_i32 s3, s33, 0x4044
-; GFX942-NEXT:    scratch_store_dword off, v1, s3 ; 4-byte Folded Spill
-; GFX942-NEXT:    s_mov_b64 exec, s[0:1]
 ; GFX942-NEXT:    s_addk_i32 s32, 0x4080
 ; GFX942-NEXT:    s_add_i32 s0, s33, 64
 ; GFX942-NEXT:    v_mov_b32_e32 v0, s0
@@ -648,22 +421,15 @@ define void @scalar_mov_materializes_frame_index_unavailable_scc_fp() #1 {
 ; GFX942-NEXT:    s_addc_u32 s0, s33, 0x4040
 ; GFX942-NEXT:    s_bitcmp1_b32 s0, 0
 ; GFX942-NEXT:    s_bitset0_b32 s0, 0
-; GFX942-NEXT:    v_writelane_b32 v1, s59, 0
-; GFX942-NEXT:    s_mov_b32 s59, s0
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use alloca0 v0
 ; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b32 s59, s0
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s59, scc
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    v_readlane_b32 s59, v1, 0
 ; GFX942-NEXT:    s_mov_b32 s32, s33
-; GFX942-NEXT:    s_xor_saveexec_b64 s[0:1], -1
-; GFX942-NEXT:    s_add_i32 s3, s33, 0x4044
-; GFX942-NEXT:    scratch_load_dword v1, off, s3 ; 4-byte Folded Reload
-; GFX942-NEXT:    s_mov_b64 exec, s[0:1]
 ; GFX942-NEXT:    s_mov_b32 s33, s2
-; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %alloca0 = alloca [4096 x i32], align 64, addrspace(5)
   %alloca1 = alloca i32, align 4, addrspace(5)
@@ -676,75 +442,39 @@ define void @scalar_mov_materializes_frame_index_unavailable_scc_small_offset()
 ; GFX10_1-LABEL: scalar_mov_materializes_frame_index_unavailable_scc_small_offset:
 ; GFX10_1:       ; %bb.0:
 ; GFX10_1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10_1-NEXT:    s_xor_saveexec_b32 s4, -1
-; GFX10_1-NEXT:    s_add_i32 s5, s32, 0x80800
-; GFX10_1-NEXT:    buffer_store_dword v0, off, s[0:3], s5 ; 4-byte Folded Spill
-; GFX10_1-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX10_1-NEXT:    s_mov_b32 exec_lo, s4
-; GFX10_1-NEXT:    v_lshrrev_b32_e64 v1, 5, s32
-; GFX10_1-NEXT:    v_writelane_b32 v0, s59, 0
+; GFX10_1-NEXT:    v_lshrrev_b32_e64 v0, 5, s32
 ; GFX10_1-NEXT:    s_and_b32 s4, 0, exec_lo
-; GFX10_1-NEXT:    v_add_nc_u32_e32 v1, 64, v1
-; GFX10_1-NEXT:    v_readfirstlane_b32 s59, v1
+; GFX10_1-NEXT:    v_add_nc_u32_e32 v0, 64, v0
+; GFX10_1-NEXT:    v_readfirstlane_b32 s59, v0
 ; GFX10_1-NEXT:    ;;#ASMSTART
 ; GFX10_1-NEXT:    ; use s59, scc
 ; GFX10_1-NEXT:    ;;#ASMEND
-; GFX10_1-NEXT:    v_readlane_b32 s59, v0, 0
-; GFX10_1-NEXT:    s_xor_saveexec_b32 s4, -1
-; GFX10_1-NEXT:    s_add_i32 s5, s32, 0x80800
-; GFX10_1-NEXT:    buffer_load_dword v0, off, s[0:3], s5 ; 4-byte Folded Reload
-; GFX10_1-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX10_1-NEXT:    s_mov_b32 exec_lo, s4
-; GFX10_1-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10_1-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10_3-LABEL: scalar_mov_materializes_frame_index_unavailable_scc_small_offset:
 ; GFX10_3:       ; %bb.0:
 ; GFX10_3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10_3-NEXT:    s_xor_saveexec_b32 s4, -1
-; GFX10_3-NEXT:    s_add_i32 s5, s32, 0x80800
-; GFX10_3-NEXT:    buffer_store_dword v0, off, s[0:3], s5 ; 4-byte Folded Spill
-; GFX10_3-NEXT:    s_mov_b32 exec_lo, s4
-; GFX10_3-NEXT:    v_lshrrev_b32_e64 v1, 5, s32
-; GFX10_3-NEXT:    v_writelane_b32 v0, s59, 0
+; GFX10_3-NEXT:    v_lshrrev_b32_e64 v0, 5, s32
 ; GFX10_3-NEXT:    s_and_b32 s4, 0, exec_lo
-; GFX10_3-NEXT:    v_add_nc_u32_e32 v1, 64, v1
-; GFX10_3-NEXT:    v_readfirstlane_b32 s59, v1
+; GFX10_3-NEXT:    v_add_nc_u32_e32 v0, 64, v0
+; GFX10_3-NEXT:    v_readfirstlane_b32 s59, v0
 ; GFX10_3-NEXT:    ;;#ASMSTART
 ; GFX10_3-NEXT:    ; use s59, scc
 ; GFX10_3-NEXT:    ;;#ASMEND
-; GFX10_3-NEXT:    v_readlane_b32 s59, v0, 0
-; GFX10_3-NEXT:    s_xor_saveexec_b32 s4, -1
-; GFX10_3-NEXT:    s_add_i32 s5, s32, 0x80800
-; GFX10_3-NEXT:    buffer_load_dword v0, off, s[0:3], s5 ; 4-byte Folded Reload
-; GFX10_3-NEXT:    s_mov_b32 exec_lo, s4
-; GFX10_3-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10_3-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: scalar_mov_materializes_frame_index_unavailable_scc_small_offset:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    s_xor_saveexec_b32 s0, -1
-; GFX11-NEXT:    s_add_i32 s1, s32, 0x4040
-; GFX11-NEXT:    scratch_store_b32 off, v0, s1 ; 4-byte Folded Spill
-; GFX11-NEXT:    s_mov_b32 exec_lo, s0
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_and_b32 s0, 0, exec_lo
-; GFX11-NEXT:    v_writelane_b32 v0, s59, 0
 ; GFX11-NEXT:    s_addc_u32 s0, s32, 64
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_bitcmp1_b32 s0, 0
 ; GFX11-NEXT:    s_bitset0_b32 s0, 0
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_mov_b32 s59, s0
 ; GFX11-NEXT:    ;;#ASMSTART
 ; GFX11-NEXT:    ; use s59, scc
 ; GFX11-NEXT:    ;;#ASMEND
-; GFX11-NEXT:    v_readlane_b32 s59, v0, 0
-; GFX11-NEXT:    s_xor_saveexec_b32 s0, -1
-; GFX11-NEXT:    s_add_i32 s1, s32, 0x4040
-; GFX11-NEXT:    scratch_load_b32 v0, off, s1 ; 4-byte Folded Reload
-; GFX11-NEXT:    s_mov_b32 exec_lo, s0
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX12-LABEL: scalar_mov_materializes_frame_index_unavailable_scc_small_offset:
@@ -754,94 +484,50 @@ define void @scalar_mov_materializes_frame_index_unavailable_scc_small_offset()
 ; GFX12-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-NEXT:    s_wait_bvhcnt 0x0
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    s_xor_saveexec_b32 s0, -1
-; GFX12-NEXT:    scratch_store_b32 off, v0, s32 offset:16384 ; 4-byte Folded Spill
-; GFX12-NEXT:    s_wait_alu 0xfffe
-; GFX12-NEXT:    s_mov_b32 exec_lo, s0
-; GFX12-NEXT:    v_writelane_b32 v0, s59, 0
-; GFX12-NEXT:    s_mov_b32 s59, s32
 ; GFX12-NEXT:    s_and_b32 s0, 0, exec_lo
+; GFX12-NEXT:    s_mov_b32 s59, s32
 ; GFX12-NEXT:    ;;#ASMSTART
 ; GFX12-NEXT:    ; use s59, scc
 ; GFX12-NEXT:    ;;#ASMEND
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT:    v_readlane_b32 s59, v0, 0
-; GFX12-NEXT:    s_xor_saveexec_b32 s0, -1
-; GFX12-NEXT:    scratch_load_b32 v0, off, s32 offset:16384 ; 4-byte Folded Reload
 ; GFX12-NEXT:    s_wait_alu 0xfffe
-; GFX12-NEXT:    s_mov_b32 exec_lo, s0
-; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: scalar_mov_materializes_frame_index_unavailable_scc_small_offset:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    s_xor_saveexec_b64 s[4:5], -1
-; GFX8-NEXT:    s_add_i32 s6, s32, 0x101000
-; GFX8-NEXT:    buffer_store_dword v0, off, s[0:3], s6 ; 4-byte Folded Spill
-; GFX8-NEXT:    s_mov_b64 exec, s[4:5]
-; GFX8-NEXT:    v_writelane_b32 v0, s59, 0
-; GFX8-NEXT:    v_lshrrev_b32_e64 v1, 6, s32
+; GFX8-NEXT:    v_lshrrev_b32_e64 v0, 6, s32
 ; GFX8-NEXT:    s_mov_b32 s59, 64
-; GFX8-NEXT:    v_add_u32_e32 v1, vcc, s59, v1
-; GFX8-NEXT:    v_readfirstlane_b32 s59, v1
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s59, v0
 ; GFX8-NEXT:    s_and_b64 s[4:5], 0, exec
+; GFX8-NEXT:    v_readfirstlane_b32 s59, v0
 ; GFX8-NEXT:    ;;#ASMSTART
 ; GFX8-NEXT:    ; use s59, scc
 ; GFX8-NEXT:    ;;#ASMEND
-; GFX8-NEXT:    v_readlane_b32 s59, v0, 0
-; GFX8-NEXT:    s_xor_saveexec_b64 s[4:5], -1
-; GFX8-NEXT:    s_add_i32 s6, s32, 0x101000
-; GFX8-NEXT:    buffer_load_dword v0, off, s[0:3], s6 ; 4-byte Folded Reload
-; GFX8-NEXT:    s_mov_b64 exec, s[4:5]
-; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-LABEL: scalar_mov_materializes_frame_index_unavailable_scc_small_offset:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-NEXT:    s_xor_saveexec_b64 s[4:5], -1
-; GFX900-NEXT:    s_add_i32 s6, s32, 0x101000
-; GFX900-NEXT:    buffer_store_dword v0, off, s[0:3], s6 ; 4-byte Folded Spill
-; GFX900-NEXT:    s_mov_b64 exec, s[4:5]
-; GFX900-NEXT:    v_lshrrev_b32_e64 v1, 6, s32
-; GFX900-NEXT:    v_add_u32_e32 v1, 64, v1
-; GFX900-NEXT:    v_writelane_b32 v0, s59, 0
-; GFX900-NEXT:    v_readfirstlane_b32 s59, v1
+; GFX900-NEXT:    v_lshrrev_b32_e64 v0, 6, s32
+; GFX900-NEXT:    v_add_u32_e32 v0, 64, v0
 ; GFX900-NEXT:    s_and_b64 s[4:5], 0, exec
+; GFX900-NEXT:    v_readfirstlane_b32 s59, v0
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s59, scc
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    v_readlane_b32 s59, v0, 0
-; GFX900-NEXT:    s_xor_saveexec_b64 s[4:5], -1
-; GFX900-NEXT:    s_add_i32 s6, s32, 0x101000
-; GFX900-NEXT:    buffer_load_dword v0, off, s[0:3], s6 ; 4-byte Folded Reload
-; GFX900-NEXT:    s_mov_b64 exec, s[4:5]
-; GFX900-NEXT:    s_waitcnt vmcnt(0)
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX942-LABEL: scalar_mov_materializes_frame_index_unavailable_scc_small_offset:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT:    s_xor_saveexec_b64 s[0:1], -1
-; GFX942-NEXT:    s_add_i32 s2, s32, 0x4040
-; GFX942-NEXT:    scratch_store_dword off, v0, s2 ; 4-byte Folded Spill
-; GFX942-NEXT:    s_mov_b64 exec, s[0:1]
 ; GFX942-NEXT:    s_and_b64 s[0:1], 0, exec
 ; GFX942-NEXT:    s_addc_u32 s0, s32, 64
 ; GFX942-NEXT:    s_bitcmp1_b32 s0, 0
 ; GFX942-NEXT:    s_bitset0_b32 s0, 0
-; GFX942-NEXT:    v_writelane_b32 v0, s59, 0
 ; GFX942-NEXT:    s_mov_b32 s59, s0
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s59, scc
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    v_readlane_b32 s59, v0, 0
-; GFX942-NEXT:    s_xor_saveexec_b64 s[0:1], -1
-; GFX942-NEXT:    s_add_i32 s2, s32, 0x4040
-; GFX942-NEXT:    scratch_load_dword v0, off, s2 ; 4-byte Folded Reload
-; GFX942-NEXT:    s_mov_b64 exec, s[0:1]
-; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %alloca0 = alloca [4096 x i32], align 64, addrspace(5)
   call void asm sideeffect "; use $0, $1", "{s59},{scc}"(ptr addrspace(5) %alloca0, i32 0)
@@ -852,67 +538,32 @@ define void @scalar_mov_materializes_frame_index_available_scc_small_offset() #0
 ; GFX10_1-LABEL: scalar_mov_materializes_frame_index_available_scc_small_offset:
 ; GFX10_1:       ; %bb.0:
 ; GFX10_1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10_1-NEXT:    s_xor_saveexec_b32 s4, -1
-; GFX10_1-NEXT:    s_add_i32 s5, s32, 0x80800
-; GFX10_1-NEXT:    buffer_store_dword v0, off, s[0:3], s5 ; 4-byte Folded Spill
-; GFX10_1-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX10_1-NEXT:    s_mov_b32 exec_lo, s4
-; GFX10_1-NEXT:    v_writelane_b32 v0, s59, 0
 ; GFX10_1-NEXT:    s_lshr_b32 s59, s32, 5
 ; GFX10_1-NEXT:    s_add_i32 s59, s59, 64
 ; GFX10_1-NEXT:    ;;#ASMSTART
 ; GFX10_1-NEXT:    ; use s59
 ; GFX10_1-NEXT:    ;;#ASMEND
-; GFX10_1-NEXT:    v_readlane_b32 s59, v0, 0
-; GFX10_1-NEXT:    s_xor_saveexec_b32 s4, -1
-; GFX10_1-NEXT:    s_add_i32 s5, s32, 0x80800
-; GFX10_1-NEXT:    buffer_load_dword v0, off, s[0:3], s5 ; 4-byte Folded Reload
-; GFX10_1-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX10_1-NEXT:    s_mov_b32 exec_lo, s4
-; GFX10_1-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10_1-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10_3-LABEL: scalar_mov_materializes_frame_index_available_scc_small_offset:
 ; GFX10_3:       ; %bb.0:
 ; GFX10_3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10_3-NEXT:    s_xor_saveexec_b32 s4, -1
-; GFX10_3-NEXT:    s_add_i32 s5, s32, 0x80800
-; GFX10_3-NEXT:    buffer_store_dword v0, off, s[0:3], s5 ; 4-byte Folded Spill
-; GFX10_3-NEXT:    s_mov_b32 exec_lo, s4
-; GFX10_3-NEXT:    v_writelane_b32 v0, s59, 0
 ; GFX10_3-NEXT:    s_lshr_b32 s59, s32, 5
 ; GFX10_3-NEXT:    s_add_i32 s59, s59, 64
 ; GFX10_3-NEXT:    ;;#ASMSTART
 ; GFX10_3-NEXT:    ; use s59
 ; GFX10_3-NEXT:    ;;#ASMEND
-; GFX10_3-NEXT:    v_readlane_b32 s59, v0, 0
-; GFX10_3-NEXT:    s_xor_saveexec_b32 s4, -1
-; GFX10_3-NEXT:    s_add_i32 s5, s32, 0x80800
-; GFX10_3-NEXT:    buffer_load_dword v0, off, s[0:3], s5 ; 4-byte Folded Reload
-; GFX10_3-NEXT:    s_mov_b32 exec_lo, s4
-; GFX10_3-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10_3-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: scalar_mov_materializes_frame_index_available_scc_small_offset:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    s_xor_saveexec_b32 s0, -1
-; GFX11-NEXT:    s_add_i32 s1, s32, 0x4040
-; GFX11-NEXT:    scratch_store_b32 off, v0, s1 ; 4-byte Folded Spill
-; GFX11-NEXT:    s_mov_b32 exec_lo, s0
-; GFX11-NEXT:    v_writelane_b32 v0, s59, 0
 ; GFX11-NEXT:    s_add_i32 s0, s32, 64
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_mov_b32 s59, s0
 ; GFX11-NEXT:    ;;#ASMSTART
 ; GFX11-NEXT:    ; use s59
 ; GFX11-NEXT:    ;;#ASMEND
-; GFX11-NEXT:    v_readlane_b32 s59, v0, 0
-; GFX11-NEXT:    s_xor_saveexec_b32 s0, -1
-; GFX11-NEXT:    s_add_i32 s1, s32, 0x4040
-; GFX11-NEXT:    scratch_load_b32 v0, off, s1 ; 4-byte Folded Reload
-; GFX11-NEXT:    s_mov_b32 exec_lo, s0
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX12-LABEL: scalar_mov_materializes_frame_index_available_scc_small_offset:
@@ -922,85 +573,41 @@ define void @scalar_mov_materializes_frame_index_available_scc_small_offset() #0
 ; GFX12-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-NEXT:    s_wait_bvhcnt 0x0
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    s_xor_saveexec_b32 s0, -1
-; GFX12-NEXT:    scratch_store_b32 off, v0, s32 offset:16384 ; 4-byte Folded Spill
-; GFX12-NEXT:    s_wait_alu 0xfffe
-; GFX12-NEXT:    s_mov_b32 exec_lo, s0
-; GFX12-NEXT:    v_writelane_b32 v0, s59, 0
 ; GFX12-NEXT:    s_mov_b32 s59, s32
 ; GFX12-NEXT:    ;;#ASMSTART
 ; GFX12-NEXT:    ; use s59
 ; GFX12-NEXT:    ;;#ASMEND
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT:    v_readlane_b32 s59, v0, 0
-; GFX12-NEXT:    s_xor_saveexec_b32 s0, -1
-; GFX12-NEXT:    scratch_load_b32 v0, off, s32 offset:16384 ; 4-byte Folded Reload
 ; GFX12-NEXT:    s_wait_alu 0xfffe
-; GFX12-NEXT:    s_mov_b32 exec_lo, s0
-; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: scalar_mov_materializes_frame_index_available_scc_small_offset:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    s_xor_saveexec_b64 s[4:5], -1
-; GFX8-NEXT:    s_add_i32 s6, s32, 0x101000
-; GFX8-NEXT:    buffer_store_dword v0, off, s[0:3], s6 ; 4-byte Folded Spill
-; GFX8-NEXT:    s_mov_b64 exec, s[4:5]
-; GFX8-NEXT:    v_writelane_b32 v0, s59, 0
 ; GFX8-NEXT:    s_lshr_b32 s59, s32, 6
 ; GFX8-NEXT:    s_add_i32 s59, s59, 64
 ; GFX8-NEXT:    ;;#ASMSTART
 ; GFX8-NEXT:    ; use s59
 ; GFX8-NEXT:    ;;#ASMEND
-; GFX8-NEXT:    v_readlane_b32 s59, v0, 0
-; GFX8-NEXT:    s_xor_saveexec_b64 s[4:5], -1
-; GFX8-NEXT:    s_add_i32 s6, s32, 0x101000
-; GFX8-NEXT:    buffer_load_dword v0, off, s[0:3], s6 ; 4-byte Folded Reload
-; GFX8-NEXT:    s_mov_b64 exec, s[4:5]
-; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-LABEL: scalar_mov_materializes_frame_index_available_scc_small_offset:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-NEXT:    s_xor_saveexec_b64 s[4:5], -1
-; GFX900-NEXT:    s_add_i32 s6, s32, 0x101000
-; GFX900-NEXT:    buffer_store_dword v0, off, s[0:3], s6 ; 4-byte Folded Spill
-; GFX900-NEXT:    s_mov_b64 exec, s[4:5]
-; GFX900-NEXT:    v_writelane_b32 v0, s59, 0
 ; GFX900-NEXT:    s_lshr_b32 s59, s32, 6
 ; GFX900-NEXT:    s_add_i32 s59, s59, 64
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s59
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    v_readlane_b32 s59, v0, 0
-; GFX900-NEXT:    s_xor_saveexec_b64 s[4:5], -1
-; GFX900-NEXT:    s_add_i32 s6, s32, 0x101000
-; GFX900-NEXT:    buffer_load_dword v0, off, s[0:3], s6 ; 4-byte Folded Reload
-; GFX900-NEXT:    s_mov_b64 exec, s[4:5]
-; GFX900-NEXT:    s_waitcnt vmcnt(0)
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX942-LABEL: scalar_mov_materializes_frame_index_available_scc_small_offset:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT:    s_xor_saveexec_b64 s[0:1], -1
-; GFX942-NEXT:    s_add_i32 s2, s32, 0x4040
-; GFX942-NEXT:    scratch_store_dword off, v0, s2 ; 4-byte Folded Spill
-; GFX942-NEXT:    s_mov_b64 exec, s[0:1]
 ; GFX942-NEXT:    s_add_i32 s0, s32, 64
-; GFX942-NEXT:    v_writelane_b32 v0, s59, 0
 ; GFX942-NEXT:    s_mov_b32 s59, s0
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s59
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    v_readlane_b32 s59, v0, 0
-; GFX942-NEXT:    s_xor_saveexec_b64 s[0:1], -1
-; GFX942-NEXT:    s_add_i32 s2, s32, 0x4040
-; GFX942-NEXT:    scratch_load_dword v0, off, s2 ; 4-byte Folded Reload
-; GFX942-NEXT:    s_mov_b64 exec, s[0:1]
-; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %alloca0 = alloca [4096 x i32], align 64, addrspace(5)
   call void asm sideeffect "; use $0", "{s59}"(ptr addrspace(5) %alloca0)
@@ -1013,29 +620,16 @@ define void @scalar_mov_materializes_frame_index_unavailable_scc_small_offset_fp
 ; GFX10_1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10_1-NEXT:    s_mov_b32 s5, s33
 ; GFX10_1-NEXT:    s_mov_b32 s33, s32
-; GFX10_1-NEXT:    s_xor_saveexec_b32 s4, -1
-; GFX10_1-NEXT:    s_add_i32 s6, s33, 0x80800
-; GFX10_1-NEXT:    buffer_store_dword v0, off, s[0:3], s6 ; 4-byte Folded Spill
-; GFX10_1-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX10_1-NEXT:    s_mov_b32 exec_lo, s4
-; GFX10_1-NEXT:    v_lshrrev_b32_e64 v1, 5, s33
-; GFX10_1-NEXT:    v_writelane_b32 v0, s59, 0
-; GFX10_1-NEXT:    s_add_i32 s32, s32, 0x81000
+; GFX10_1-NEXT:    s_add_i32 s32, s32, 0x80800
+; GFX10_1-NEXT:    v_lshrrev_b32_e64 v0, 5, s33
 ; GFX10_1-NEXT:    s_and_b32 s4, 0, exec_lo
 ; GFX10_1-NEXT:    s_mov_b32 s32, s33
-; GFX10_1-NEXT:    v_add_nc_u32_e32 v1, 64, v1
-; GFX10_1-NEXT:    v_readfirstlane_b32 s59, v1
+; GFX10_1-NEXT:    s_mov_b32 s33, s5
+; GFX10_1-NEXT:    v_add_nc_u32_e32 v0, 64, v0
+; GFX10_1-NEXT:    v_readfirstlane_b32 s59, v0
 ; GFX10_1-NEXT:    ;;#ASMSTART
 ; GFX10_1-NEXT:    ; use s59, scc
 ; GFX10_1-NEXT:    ;;#ASMEND
-; GFX10_1-NEXT:    v_readlane_b32 s59, v0, 0
-; GFX10_1-NEXT:    s_xor_saveexec_b32 s4, -1
-; GFX10_1-NEXT:    s_add_i32 s6, s33, 0x80800
-; GFX10_1-NEXT:    buffer_load_dword v0, off, s[0:3], s6 ; 4-byte Folded Reload
-; GFX10_1-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX10_1-NEXT:    s_mov_b32 exec_lo, s4
-; GFX10_1-NEXT:    s_mov_b32 s33, s5
-; GFX10_1-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10_1-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10_3-LABEL: scalar_mov_materializes_frame_index_unavailable_scc_small_offset_fp:
@@ -1043,27 +637,16 @@ define void @scalar_mov_materializes_frame_index_unavailable_scc_small_offset_fp
 ; GFX10_3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10_3-NEXT:    s_mov_b32 s5, s33
 ; GFX10_3-NEXT:    s_mov_b32 s33, s32
-; GFX10_3-NEXT:    s_xor_saveexec_b32 s4, -1
-; GFX10_3-NEXT:    s_add_i32 s6, s33, 0x80800
-; GFX10_3-NEXT:    buffer_store_dword v0, off, s[0:3], s6 ; 4-byte Folded Spill
-; GFX10_3-NEXT:    s_mov_b32 exec_lo, s4
-; GFX10_3-NEXT:    v_lshrrev_b32_e64 v1, 5, s33
-; GFX10_3-NEXT:    v_writelane_b32 v0, s59, 0
-; GFX10_3-NEXT:    s_add_i32 s32, s32, 0x81000
+; GFX10_3-NEXT:    s_add_i32 s32, s32, 0x80800
+; GFX10_3-NEXT:    v_lshrrev_b32_e64 v0, 5, s33
 ; GFX10_3-NEXT:    s_and_b32 s4, 0, exec_lo
 ; GFX10_3-NEXT:    s_mov_b32 s32, s33
-; GFX10_3-NEXT:    v_add_nc_u32_e32 v1, 64, v1
-; GFX10_3-NEXT:    v_readfirstlane_b32 s59, v1
+; GFX10_3-NEXT:    s_mov_b32 s33, s5
+; GFX10_3-NEXT:    v_add_nc_u32_e32 v0, 64, v0
+; GFX10_3-NEXT:    v_readfirstlane_b32 s59, v0
 ; GFX10_3-NEXT:    ;;#ASMSTART
 ; GFX10_3-NEXT:    ; use s59, scc
 ; GFX10_3-NEXT:    ;;#ASMEND
-; GFX10_3-NEXT:    v_readlane_b32 s59, v0, 0
-; GFX10_3-NEXT:    s_xor_saveexec_b32 s4, -1
-; GFX10_3-NEXT:    s_add_i32 s6, s33, 0x80800
-; GFX10_3-NEXT:    buffer_load_dword v0, off, s[0:3], s6 ; 4-byte Folded Reload
-; GFX10_3-NEXT:    s_mov_b32 exec_lo, s4
-; GFX10_3-NEXT:    s_mov_b32 s33, s5
-; GFX10_3-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10_3-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: scalar_mov_materializes_frame_index_unavailable_scc_small_offset_fp:
@@ -1071,29 +654,17 @@ define void @scalar_mov_materializes_frame_index_unavailable_scc_small_offset_fp
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_mov_b32 s1, s33
 ; GFX11-NEXT:    s_mov_b32 s33, s32
-; GFX11-NEXT:    s_xor_saveexec_b32 s0, -1
-; GFX11-NEXT:    s_add_i32 s2, s33, 0x4040
-; GFX11-NEXT:    scratch_store_b32 off, v0, s2 ; 4-byte Folded Spill
-; GFX11-NEXT:    s_mov_b32 exec_lo, s0
-; GFX11-NEXT:    s_addk_i32 s32, 0x4080
+; GFX11-NEXT:    s_addk_i32 s32, 0x4040
 ; GFX11-NEXT:    s_and_b32 s0, 0, exec_lo
-; GFX11-NEXT:    v_writelane_b32 v0, s59, 0
 ; GFX11-NEXT:    s_addc_u32 s0, s33, 64
 ; GFX11-NEXT:    s_mov_b32 s32, s33
 ; GFX11-NEXT:    s_bitcmp1_b32 s0, 0
 ; GFX11-NEXT:    s_bitset0_b32 s0, 0
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT:    s_mov_b32 s33, s1
 ; GFX11-NEXT:    s_mov_b32 s59, s0
 ; GFX11-NEXT:    ;;#ASMSTART
 ; GFX11-NEXT:    ; use s59, scc
 ; GFX11-NEXT:    ;;#ASMEND
-; GFX11-NEXT:    v_readlane_b32 s59, v0, 0
-; GFX11-NEXT:    s_xor_saveexec_b32 s0, -1
-; GFX11-NEXT:    s_add_i32 s2, s33, 0x4040
-; GFX11-NEXT:    scratch_load_b32 v0, off, s2 ; 4-byte Folded Reload
-; GFX11-NEXT:    s_mov_b32 exec_lo, s0
-; GFX11-NEXT:    s_mov_b32 s33, s1
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX12-LABEL: scalar_mov_materializes_frame_index_unavailable_scc_small_offset_fp:
@@ -1105,25 +676,15 @@ define void @scalar_mov_materializes_frame_index_unavailable_scc_small_offset_fp
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    s_mov_b32 s1, s33
 ; GFX12-NEXT:    s_mov_b32 s33, s32
-; GFX12-NEXT:    s_xor_saveexec_b32 s0, -1
-; GFX12-NEXT:    scratch_store_b32 off, v0, s33 offset:16384 ; 4-byte Folded Spill
-; GFX12-NEXT:    s_wait_alu 0xfffe
-; GFX12-NEXT:    s_mov_b32 exec_lo, s0
-; GFX12-NEXT:    v_writelane_b32 v0, s59, 0
 ; GFX12-NEXT:    s_addk_co_i32 s32, 0x4040
-; GFX12-NEXT:    s_mov_b32 s59, s33
 ; GFX12-NEXT:    s_and_b32 s0, 0, exec_lo
+; GFX12-NEXT:    s_wait_alu 0xfffe
+; GFX12-NEXT:    s_mov_b32 s59, s33
 ; GFX12-NEXT:    ;;#ASMSTART
 ; GFX12-NEXT:    ; use s59, scc
 ; GFX12-NEXT:    ;;#ASMEND
-; GFX12-NEXT:    v_readlane_b32 s59, v0, 0
 ; GFX12-NEXT:    s_mov_b32 s32, s33
-; GFX12-NEXT:    s_xor_saveexec_b32 s0, -1
-; GFX12-NEXT:    scratch_load_b32 v0, off, s33 offset:16384 ; 4-byte Folded Reload
-; GFX12-NEXT:    s_wait_alu 0xfffe
-; GFX12-NEXT:    s_mov_b32 exec_lo, s0
 ; GFX12-NEXT:    s_mov_b32 s33, s1
-; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    s_wait_alu 0xfffe
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -1132,28 +693,17 @@ define void @scalar_mov_materializes_frame_index_unavailable_scc_small_offset_fp
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_mov_b32 s6, s33
 ; GFX8-NEXT:    s_mov_b32 s33, s32
-; GFX8-NEXT:    s_xor_saveexec_b64 s[4:5], -1
-; GFX8-NEXT:    s_add_i32 s7, s33, 0x101000
-; GFX8-NEXT:    buffer_store_dword v0, off, s[0:3], s7 ; 4-byte Folded Spill
-; GFX8-NEXT:    s_mov_b64 exec, s[4:5]
-; GFX8-NEXT:    v_writelane_b32 v0, s59, 0
-; GFX8-NEXT:    v_lshrrev_b32_e64 v1, 6, s33
+; GFX8-NEXT:    v_lshrrev_b32_e64 v0, 6, s33
 ; GFX8-NEXT:    s_mov_b32 s59, 64
-; GFX8-NEXT:    v_add_u32_e32 v1, vcc, s59, v1
-; GFX8-NEXT:    s_add_i32 s32, s32, 0x102000
-; GFX8-NEXT:    v_readfirstlane_b32 s59, v1
+; GFX8-NEXT:    s_add_i32 s32, s32, 0x101000
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s59, v0
 ; GFX8-NEXT:    s_and_b64 s[4:5], 0, exec
+; GFX8-NEXT:    v_readfirstlane_b32 s59, v0
 ; GFX8-NEXT:    ;;#ASMSTART
 ; GFX8-NEXT:    ; use s59, scc
 ; GFX8-NEXT:    ;;#ASMEND
-; GFX8-NEXT:    v_readlane_b32 s59, v0, 0
 ; GFX8-NEXT:    s_mov_b32 s32, s33
-; GFX8-NEXT:    s_xor_saveexec_b64 s[4:5], -1
-; GFX8-NEXT:    s_add_i32 s7, s33, 0x101000
-; GFX8-NEXT:    buffer_load_dword v0, off, s[0:3], s7 ; 4-byte Folded Reload
-; GFX8-NEXT:    s_mov_b64 exec, s[4:5]
 ; GFX8-NEXT:    s_mov_b32 s33, s6
-; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-LABEL: scalar_mov_materializes_frame_index_unavailable_scc_small_offset_fp:
@@ -1161,27 +711,16 @@ define void @scalar_mov_materializes_frame_index_unavailable_scc_small_offset_fp
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    s_mov_b32 s6, s33
 ; GFX900-NEXT:    s_mov_b32 s33, s32
-; GFX900-NEXT:    s_xor_saveexec_b64 s[4:5], -1
-; GFX900-NEXT:    s_add_i32 s7, s33, 0x101000
-; GFX900-NEXT:    buffer_store_dword v0, off, s[0:3], s7 ; 4-byte Folded Spill
-; GFX900-NEXT:    s_mov_b64 exec, s[4:5]
-; GFX900-NEXT:    v_lshrrev_b32_e64 v1, 6, s33
-; GFX900-NEXT:    v_add_u32_e32 v1, 64, v1
-; GFX900-NEXT:    s_add_i32 s32, s32, 0x102000
-; GFX900-NEXT:    v_writelane_b32 v0, s59, 0
-; GFX900-NEXT:    v_readfirstlane_b32 s59, v1
+; GFX900-NEXT:    v_lshrrev_b32_e64 v0, 6, s33
+; GFX900-NEXT:    s_add_i32 s32, s32, 0x101000
+; GFX900-NEXT:    v_add_u32_e32 v0, 64, v0
 ; GFX900-NEXT:    s_and_b64 s[4:5], 0, exec
+; GFX900-NEXT:    v_readfirstlane_b32 s59, v0
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s59, scc
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    v_readlane_b32 s59, v0, 0
 ; GFX900-NEXT:    s_mov_b32 s32, s33
-; GFX900-NEXT:    s_xor_saveexec_b64 s[4:5], -1
-; GFX900-NEXT:    s_add_i32 s7, s33, 0x101000
-; GFX900-NEXT:    buffer_load_dword v0, off, s[0:3], s7 ; 4-byte Folded Reload
-; GFX900-NEXT:    s_mov_b64 exec, s[4:5]
 ; GFX900-NEXT:    s_mov_b32 s33, s6
-; GFX900-NEXT:    s_waitcnt vmcnt(0)
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX942-LABEL: scalar_mov_materializes_frame_index_unavailable_scc_small_offset_fp:
@@ -1189,28 +728,17 @@ define void @scalar_mov_materializes_frame_index_unavailable_scc_small_offset_fp
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    s_mov_b32 s2, s33
 ; GFX942-NEXT:    s_mov_b32 s33, s32
-; GFX942-NEXT:    s_xor_saveexec_b64 s[0:1], -1
-; GFX942-NEXT:    s_add_i32 s3, s33, 0x4040
-; GFX942-NEXT:    scratch_store_dword off, v0, s3 ; 4-byte Folded Spill
-; GFX942-NEXT:    s_mov_b64 exec, s[0:1]
-; GFX942-NEXT:    s_addk_i32 s32, 0x4080
+; GFX942-NEXT:    s_addk_i32 s32, 0x4040
 ; GFX942-NEXT:    s_and_b64 s[0:1], 0, exec
 ; GFX942-NEXT:    s_addc_u32 s0, s33, 64
 ; GFX942-NEXT:    s_bitcmp1_b32 s0, 0
 ; GFX942-NEXT:    s_bitset0_b32 s0, 0
-; GFX942-NEXT:    v_writelane_b32 v0, s59, 0
 ; GFX942-NEXT:    s_mov_b32 s59, s0
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s59, scc
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    v_readlane_b32 s59, v0, 0
 ; GFX942-NEXT:    s_mov_b32 s32, s33
-; GFX942-NEXT:    s_xor_saveexec_b64 s[0:1], -1
-; GFX942-NEXT:    s_add_i32 s3, s33, 0x4040
-; GFX942-NEXT:    scratch_load_dword v0, off, s3 ; 4-byte Folded Reload
-; GFX942-NEXT:    s_mov_b64 exec, s[0:1]
 ; GFX942-NEXT:    s_mov_b32 s33, s2
-; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %alloca0 = alloca [4096 x i32], align 64, addrspace(5)
   call void asm sideeffect "; use $0, $1", "{s59},{scc}"(ptr addrspace(5) %alloca0, i32 0)
@@ -1223,27 +751,14 @@ define void @scalar_mov_materializes_frame_index_available_scc_small_offset_fp()
 ; GFX10_1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10_1-NEXT:    s_mov_b32 s4, s33
 ; GFX10_1-NEXT:    s_mov_b32 s33, s32
-; GFX10_1-NEXT:    s_xor_saveexec_b32 s5, -1
-; GFX10_1-NEXT:    s_add_i32 s6, s33, 0x80800
-; GFX10_1-NEXT:    buffer_store_dword v0, off, s[0:3], s6 ; 4-byte Folded Spill
-; GFX10_1-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX10_1-NEXT:    s_mov_b32 exec_lo, s5
-; GFX10_1-NEXT:    v_writelane_b32 v0, s59, 0
-; GFX10_1-NEXT:    s_add_i32 s32, s32, 0x81000
+; GFX10_1-NEXT:    s_add_i32 s32, s32, 0x80800
 ; GFX10_1-NEXT:    s_lshr_b32 s59, s33, 5
 ; GFX10_1-NEXT:    s_mov_b32 s32, s33
 ; GFX10_1-NEXT:    s_add_i32 s59, s59, 64
 ; GFX10_1-NEXT:    ;;#ASMSTART
 ; GFX10_1-NEXT:    ; use s59
 ; GFX10_1-NEXT:    ;;#ASMEND
-; GFX10_1-NEXT:    v_readlane_b32 s59, v0, 0
-; GFX10_1-NEXT:    s_xor_saveexec_b32 s5, -1
-; GFX10_1-NEXT:    s_add_i32 s6, s33, 0x80800
-; GFX10_1-NEXT:    buffer_load_dword v0, off, s[0:3], s6 ; 4-byte Folded Reload
-; GFX10_1-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX10_1-NEXT:    s_mov_b32 exec_lo, s5
 ; GFX10_1-NEXT:    s_mov_b32 s33, s4
-; GFX10_1-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10_1-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10_3-LABEL: scalar_mov_materializes_frame_index_available_scc_small_offset_fp:
@@ -1251,25 +766,14 @@ define void @scalar_mov_materializes_frame_index_available_scc_small_offset_fp()
 ; GFX10_3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10_3-NEXT:    s_mov_b32 s4, s33
 ; GFX10_3-NEXT:    s_mov_b32 s33, s32
-; GFX10_3-NEXT:    s_xor_saveexec_b32 s5, -1
-; GFX10_3-NEXT:    s_add_i32 s6, s33, 0x80800
-; GFX10_3-NEXT:    buffer_store_dword v0, off, s[0:3], s6 ; 4-byte Folded Spill
-; GFX10_3-NEXT:    s_mov_b32 exec_lo, s5
-; GFX10_3-NEXT:    v_writelane_b32 v0, s59, 0
-; GFX10_3-NEXT:    s_add_i32 s32, s32, 0x81000
+; GFX10_3-NEXT:    s_add_i32 s32, s32, 0x80800
 ; GFX10_3-NEXT:    s_lshr_b32 s59, s33, 5
 ; GFX10_3-NEXT:    s_mov_b32 s32, s33
 ; GFX10_3-NEXT:    s_add_i32 s59, s59, 64
 ; GFX10_3-NEXT:    ;;#ASMSTART
 ; GFX10_3-NEXT:    ; use s59
 ; GFX10_3-NEXT:    ;;#ASMEND
-; GFX10_3-NEXT:    v_readlane_b32 s59, v0, 0
-; GFX10_3-NEXT:    s_xor_saveexec_b32 s5, -1
-; GFX10_3-NEXT:    s_add_i32 s6, s33, 0x80800
-; GFX10_3-NEXT:    buffer_load_dword v0, off, s[0:3], s6 ; 4-byte Folded Reload
-; GFX10_3-NEXT:    s_mov_b32 exec_lo, s5
 ; GFX10_3-NEXT:    s_mov_b32 s33, s4
-; GFX10_3-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10_3-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: scalar_mov_materializes_frame_index_available_scc_small_offset_fp:
@@ -1277,25 +781,14 @@ define void @scalar_mov_materializes_frame_index_available_scc_small_offset_fp()
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_mov_b32 s0, s33
 ; GFX11-NEXT:    s_mov_b32 s33, s32
-; GFX11-NEXT:    s_xor_saveexec_b32 s1, -1
-; GFX11-NEXT:    s_add_i32 s2, s33, 0x4040
-; GFX11-NEXT:    scratch_store_b32 off, v0, s2 ; 4-byte Folded Spill
-; GFX11-NEXT:    s_mov_b32 exec_lo, s1
-; GFX11-NEXT:    v_writelane_b32 v0, s59, 0
-; GFX11-NEXT:    s_addk_i32 s32, 0x4080
+; GFX11-NEXT:    s_addk_i32 s32, 0x4040
 ; GFX11-NEXT:    s_add_i32 s1, s33, 64
 ; GFX11-NEXT:    s_mov_b32 s32, s33
 ; GFX11-NEXT:    s_mov_b32 s59, s1
 ; GFX11-NEXT:    ;;#ASMSTART
 ; GFX11-NEXT:    ; use s59
 ; GFX11-NEXT:    ;;#ASMEND
-; GFX11-NEXT:    v_readlane_b32 s59, v0, 0
-; GFX11-NEXT:    s_xor_saveexec_b32 s1, -1
-; GFX11-NEXT:    s_add_i32 s2, s33, 0x4040
-; GFX11-NEXT:    scratch_load_b32 v0, off, s2 ; 4-byte Folded Reload
-; GFX11-NEXT:    s_mov_b32 exec_lo, s1
 ; GFX11-NEXT:    s_mov_b32 s33, s0
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX12-LABEL: scalar_mov_materializes_frame_index_available_scc_small_offset_fp:
@@ -1307,24 +800,14 @@ define void @scalar_mov_materializes_frame_index_available_scc_small_offset_fp()
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    s_mov_b32 s0, s33
 ; GFX12-NEXT:    s_mov_b32 s33, s32
-; GFX12-NEXT:    s_xor_saveexec_b32 s1, -1
-; GFX12-NEXT:    scratch_store_b32 off, v0, s33 offset:16384 ; 4-byte Folded Spill
-; GFX12-NEXT:    s_wait_alu 0xfffe
-; GFX12-NEXT:    s_mov_b32 exec_lo, s1
-; GFX12-NEXT:    v_writelane_b32 v0, s59, 0
 ; GFX12-NEXT:    s_addk_co_i32 s32, 0x4040
+; GFX12-NEXT:    s_wait_alu 0xfffe
 ; GFX12-NEXT:    s_mov_b32 s59, s33
 ; GFX12-NEXT:    ;;#ASMSTART
 ; GFX12-NEXT:    ; use s59
 ; GFX12-NEXT:    ;;#ASMEND
 ; GFX12-NEXT:    s_mov_b32 s32, s33
-; GFX12-NEXT:    v_readlane_b32 s59, v0, 0
-; GFX12-NEXT:    s_xor_saveexec_b32 s1, -1
-; GFX12-NEXT:    scratch_load_b32 v0, off, s33 offset:16384 ; 4-byte Folded Reload
-; GFX12-NEXT:    s_wait_alu 0xfffe
-; GFX12-NEXT:    s_mov_b32 exec_lo, s1
 ; GFX12-NEXT:    s_mov_b32 s33, s0
-; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    s_wait_alu 0xfffe
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -1333,25 +816,14 @@ define void @scalar_mov_materializes_frame_index_available_scc_small_offset_fp()
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_mov_b32 s4, s33
 ; GFX8-NEXT:    s_mov_b32 s33, s32
-; GFX8-NEXT:    s_xor_saveexec_b64 s[6:7], -1
-; GFX8-NEXT:    s_add_i32 s5, s33, 0x101000
-; GFX8-NEXT:    buffer_store_dword v0, off, s[0:3], s5 ; 4-byte Folded Spill
-; GFX8-NEXT:    s_mov_b64 exec, s[6:7]
-; GFX8-NEXT:    s_add_i32 s32, s32, 0x102000
-; GFX8-NEXT:    v_writelane_b32 v0, s59, 0
+; GFX8-NEXT:    s_add_i32 s32, s32, 0x101000
 ; GFX8-NEXT:    s_lshr_b32 s59, s33, 6
 ; GFX8-NEXT:    s_add_i32 s59, s59, 64
 ; GFX8-NEXT:    ;;#ASMSTART
 ; GFX8-NEXT:    ; use s59
 ; GFX8-NEXT:    ;;#ASMEND
-; GFX8-NEXT:    v_readlane_b32 s59, v0, 0
 ; GFX8-NEXT:    s_mov_b32 s32, s33
-; GFX8-NEXT:    s_xor_saveexec_b64 s[6:7], -1
-; GFX8-NEXT:    s_add_i32 s5, s33, 0x101000
-; GFX8-NEXT:    buffer_load_dword v0, off, s[0:3], s5 ; 4-byte Folded Reload
-; GFX8-NEXT:    s_mov_b64 exec, s[6:7]
 ; GFX8-NEXT:    s_mov_b32 s33, s4
-; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-LABEL: scalar_mov_materializes_frame_index_available_scc_small_offset_fp:
@@ -1359,25 +831,14 @@ define void @scalar_mov_materializes_frame_index_available_scc_small_offset_fp()
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    s_mov_b32 s4, s33
 ; GFX900-NEXT:    s_mov_b32 s33, s32
-; GFX900-NEXT:    s_xor_saveexec_b64 s[6:7], -1
-; GFX900-NEXT:    s_add_i32 s5, s33, 0x101000
-; GFX900-NEXT:    buffer_store_dword v0, off, s[0:3], s5 ; 4-byte Folded Spill
-; GFX900-NEXT:    s_mov_b64 exec, s[6:7]
-; GFX900-NEXT:    s_add_i32 s32, s32, 0x102000
-; GFX900-NEXT:    v_writelane_b32 v0, s59, 0
+; GFX900-NEXT:    s_add_i32 s32, s32, 0x101000
 ; GFX900-NEXT:    s_lshr_b32 s59, s33, 6
 ; GFX900-NEXT:    s_add_i32 s59, s59, 64
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s59
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    v_readlane_b32 s59, v0, 0
 ; GFX900-NEXT:    s_mov_b32 s32, s33
-; GFX900-NEXT:    s_xor_saveexec_b64 s[6:7], -1
-; GFX900-NEXT:    s_add_i32 s5, s33, 0x101000
-; GFX900-NEXT:    buffer_load_dword v0, off, s[0:3], s5 ; 4-byte Folded Reload
-; GFX900-NEXT:    s_mov_b64 exec, s[6:7]
 ; GFX900-NEXT:    s_mov_b32 s33, s4
-; GFX900-NEXT:    s_waitcnt vmcnt(0)
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX942-LABEL: scalar_mov_materializes_frame_index_available_scc_small_offset_fp:
@@ -1385,25 +846,14 @@ define void @scalar_mov_materializes_frame_index_available_scc_small_offset_fp()
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    s_mov_b32 s0, s33
 ; GFX942-NEXT:    s_mov_b32 s33, s32
-; GFX942-NEXT:    s_xor_saveexec_b64 s[2:3], -1
-; GFX942-NEXT:    s_add_i32 s1, s33, 0x4040
-; GFX942-NEXT:    scratch_store_dword off, v0, s1 ; 4-byte Folded Spill
-; GFX942-NEXT:    s_mov_b64 exec, s[2:3]
-; GFX942-NEXT:    s_addk_i32 s32, 0x4080
+; GFX942-NEXT:    s_addk_i32 s32, 0x4040
 ; GFX942-NEXT:    s_add_i32 s1, s33, 64
-; GFX942-NEXT:    v_writelane_b32 v0, s59, 0
 ; GFX942-NEXT:    s_mov_b32 s59, s1
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s59
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    v_readlane_b32 s59, v0, 0
 ; GFX942-NEXT:    s_mov_b32 s32, s33
-; GFX942-NEXT:    s_xor_saveexec_b64 s[2:3], -1
-; GFX942-NEXT:    s_add_i32 s1, s33, 0x4040
-; GFX942-NEXT:    scratch_load_dword v0, off, s1 ; 4-byte Folded Reload
-; GFX942-NEXT:    s_mov_b64 exec, s[2:3]
 ; GFX942-NEXT:    s_mov_b32 s33, s0
-; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %alloca0 = alloca [4096 x i32], align 64, addrspace(5)
   call void asm sideeffect "; use $0", "{s59}"(ptr addrspace(5) %alloca0)
@@ -1414,12 +864,6 @@ define void @scalar_mov_materializes_frame_index_unavailable_scc__gep_immoffset(
 ; GFX10_1-LABEL: scalar_mov_materializes_frame_index_unavailable_scc__gep_immoffset:
 ; GFX10_1:       ; %bb.0:
 ; GFX10_1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10_1-NEXT:    s_xor_saveexec_b32 s4, -1
-; GFX10_1-NEXT:    s_add_i32 s5, s32, 0x100800
-; GFX10_1-NEXT:    buffer_store_dword v1, off, s[0:3], s5 ; 4-byte Folded Spill
-; GFX10_1-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX10_1-NEXT:    s_mov_b32 exec_lo, s4
-; GFX10_1-NEXT:    v_writelane_b32 v1, s59, 0
 ; GFX10_1-NEXT:    v_lshrrev_b32_e64 v0, 5, s32
 ; GFX10_1-NEXT:    s_lshr_b32 s4, s32, 5
 ; GFX10_1-NEXT:    s_add_i32 s59, s4, 0x442c
@@ -1431,23 +875,11 @@ define void @scalar_mov_materializes_frame_index_unavailable_scc__gep_immoffset(
 ; GFX10_1-NEXT:    ;;#ASMSTART
 ; GFX10_1-NEXT:    ; use s59, scc
 ; GFX10_1-NEXT:    ;;#ASMEND
-; GFX10_1-NEXT:    v_readlane_b32 s59, v1, 0
-; GFX10_1-NEXT:    s_xor_saveexec_b32 s4, -1
-; GFX10_1-NEXT:    s_add_i32 s5, s32, 0x100800
-; GFX10_1-NEXT:    buffer_load_dword v1, off, s[0:3], s5 ; 4-byte Folded Reload
-; GFX10_1-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX10_1-NEXT:    s_mov_b32 exec_lo, s4
-; GFX10_1-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10_1-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10_3-LABEL: scalar_mov_materializes_frame_index_unavailable_scc__gep_immoffset:
 ; GFX10_3:       ; %bb.0:
 ; GFX10_3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10_3-NEXT:    s_xor_saveexec_b32 s4, -1
-; GFX10_3-NEXT:    s_add_i32 s5, s32, 0x100800
-; GFX10_3-NEXT:    buffer_store_dword v1, off, s[0:3], s5 ; 4-byte Folded Spill
-; GFX10_3-NEXT:    s_mov_b32 exec_lo, s4
-; GFX10_3-NEXT:    v_writelane_b32 v1, s59, 0
 ; GFX10_3-NEXT:    v_lshrrev_b32_e64 v0, 5, s32
 ; GFX10_3-NEXT:    s_lshr_b32 s4, s32, 5
 ; GFX10_3-NEXT:    s_add_i32 s59, s4, 0x442c
@@ -1459,22 +891,11 @@ define void @scalar_mov_materializes_frame_index_unavailable_scc__gep_immoffset(
 ; GFX10_3-NEXT:    ;;#ASMSTART
 ; GFX10_3-NEXT:    ; use s59, scc
 ; GFX10_3-NEXT:    ;;#ASMEND
-; GFX10_3-NEXT:    v_readlane_b32 s59, v1, 0
-; GFX10_3-NEXT:    s_xor_saveexec_b32 s4, -1
-; GFX10_3-NEXT:    s_add_i32 s5, s32, 0x100800
-; GFX10_3-NEXT:    buffer_load_dword v1, off, s[0:3], s5 ; 4-byte Folded Reload
-; GFX10_3-NEXT:    s_mov_b32 exec_lo, s4
-; GFX10_3-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10_3-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: scalar_mov_materializes_frame_index_unavailable_scc__gep_immoffset:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    s_xor_saveexec_b32 s0, -1
-; GFX11-NEXT:    s_add_i32 s1, s32, 0x8040
-; GFX11-NEXT:    scratch_store_b32 off, v1, s1 ; 4-byte Folded Spill
-; GFX11-NEXT:    s_mov_b32 exec_lo, s0
-; GFX11-NEXT:    v_writelane_b32 v1, s59, 0
 ; GFX11-NEXT:    s_add_i32 s0, s32, 64
 ; GFX11-NEXT:    s_add_i32 s59, s32, 0x442c
 ; GFX11-NEXT:    v_mov_b32_e32 v0, s0
@@ -1485,12 +906,6 @@ define void @scalar_mov_materializes_frame_index_unavailable_scc__gep_immoffset(
 ; GFX11-NEXT:    ;;#ASMSTART
 ; GFX11-NEXT:    ; use s59, scc
 ; GFX11-NEXT:    ;;#ASMEND
-; GFX11-NEXT:    v_readlane_b32 s59, v1, 0
-; GFX11-NEXT:    s_xor_saveexec_b32 s0, -1
-; GFX11-NEXT:    s_add_i32 s1, s32, 0x8040
-; GFX11-NEXT:    scratch_load_b32 v1, off, s1 ; 4-byte Folded Reload
-; GFX11-NEXT:    s_mov_b32 exec_lo, s0
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX12-LABEL: scalar_mov_materializes_frame_index_unavailable_scc__gep_immoffset:
@@ -1500,11 +915,6 @@ define void @scalar_mov_materializes_frame_index_unavailable_scc__gep_immoffset(
 ; GFX12-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-NEXT:    s_wait_bvhcnt 0x0
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    s_xor_saveexec_b32 s0, -1
-; GFX12-NEXT:    scratch_store_b32 off, v1, s32 offset:32768 ; 4-byte Folded Spill
-; GFX12-NEXT:    s_wait_alu 0xfffe
-; GFX12-NEXT:    s_mov_b32 exec_lo, s0
-; GFX12-NEXT:    v_writelane_b32 v1, s59, 0
 ; GFX12-NEXT:    s_add_co_i32 s59, s32, 0x43ec
 ; GFX12-NEXT:    v_mov_b32_e32 v0, s32
 ; GFX12-NEXT:    s_and_b32 s0, 0, exec_lo
@@ -1514,23 +924,13 @@ define void @scalar_mov_materializes_frame_index_unavailable_scc__gep_immoffset(
 ; GFX12-NEXT:    ;;#ASMSTART
 ; GFX12-NEXT:    ; use s59, scc
 ; GFX12-NEXT:    ;;#ASMEND
-; GFX12-NEXT:    v_readlane_b32 s59, v1, 0
-; GFX12-NEXT:    s_xor_saveexec_b32 s0, -1
-; GFX12-NEXT:    scratch_load_b32 v1, off, s32 offset:32768 ; 4-byte Folded Reload
 ; GFX12-NEXT:    s_wait_alu 0xfffe
-; GFX12-NEXT:    s_mov_b32 exec_lo, s0
-; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: scalar_mov_materializes_frame_index_unavailable_scc__gep_immoffset:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    s_xor_saveexec_b64 s[4:5], -1
-; GFX8-NEXT:    s_add_i32 s6, s32, 0x201000
-; GFX8-NEXT:    buffer_store_dword v1, off, s[0:3], s6 ; 4-byte Folded Spill
-; GFX8-NEXT:    s_mov_b64 exec, s[4:5]
 ; GFX8-NEXT:    s_lshr_b32 s4, s32, 6
-; GFX8-NEXT:    v_writelane_b32 v1, s59, 0
 ; GFX8-NEXT:    s_add_i32 s59, s4, 0x442c
 ; GFX8-NEXT:    v_lshrrev_b32_e64 v0, 6, s32
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 64, v0
@@ -1541,23 +941,12 @@ define void @scalar_mov_materializes_frame_index_unavailable_scc__gep_immoffset(
 ; GFX8-NEXT:    ;;#ASMSTART
 ; GFX8-NEXT:    ; use s59, scc
 ; GFX8-NEXT:    ;;#ASMEND
-; GFX8-NEXT:    v_readlane_b32 s59, v1, 0
-; GFX8-NEXT:    s_xor_saveexec_b64 s[4:5], -1
-; GFX8-NEXT:    s_add_i32 s6, s32, 0x201000
-; GFX8-NEXT:    buffer_load_dword v1, off, s[0:3], s6 ; 4-byte Folded Reload
-; GFX8-NEXT:    s_mov_b64 exec, s[4:5]
-; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-LABEL: scalar_mov_materializes_frame_index_unavailable_scc__gep_immoffset:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-NEXT:    s_xor_saveexec_b64 s[4:5], -1
-; GFX900-NEXT:    s_add_i32 s6, s32, 0x201000
-; GFX900-NEXT:    buffer_store_dword v1, off, s[0:3], s6 ; 4-byte Folded Spill
-; GFX900-NEXT:    s_mov_b64 exec, s[4:5]
 ; GFX900-NEXT:    s_lshr_b32 s4, s32, 6
-; GFX900-NEXT:    v_writelane_b32 v1, s59, 0
 ; GFX900-NEXT:    s_add_i32 s59, s4, 0x442c
 ; GFX900-NEXT:    v_lshrrev_b32_e64 v0, 6, s32
 ; GFX900-NEXT:    v_add_u32_e32 v0, 64, v0
@@ -1568,22 +957,11 @@ define void @scalar_mov_materializes_frame_index_unavailable_scc__gep_immoffset(
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s59, scc
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    v_readlane_b32 s59, v1, 0
-; GFX900-NEXT:    s_xor_saveexec_b64 s[4:5], -1
-; GFX900-NEXT:    s_add_i32 s6, s32, 0x201000
-; GFX900-NEXT:    buffer_load_dword v1, off, s[0:3], s6 ; 4-byte Folded Reload
-; GFX900-NEXT:    s_mov_b64 exec, s[4:5]
-; GFX900-NEXT:    s_waitcnt vmcnt(0)
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX942-LABEL: scalar_mov_materializes_frame_index_unavailable_scc__gep_immoffset:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT:    s_xor_saveexec_b64 s[0:1], -1
-; GFX942-NEXT:    s_add_i32 s2, s32, 0x8040
-; GFX942-NEXT:    scratch_store_dword off, v1, s2 ; 4-byte Folded Spill
-; GFX942-NEXT:    s_mov_b64 exec, s[0:1]
-; GFX942-NEXT:    v_writelane_b32 v1, s59, 0
 ; GFX942-NEXT:    s_add_i32 s59, s32, 0x442c
 ; GFX942-NEXT:    s_add_i32 s0, s32, 64
 ; GFX942-NEXT:    v_mov_b32_e32 v0, s0
@@ -1594,12 +972,6 @@ define void @scalar_mov_materializes_frame_index_unavailable_scc__gep_immoffset(
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s59, scc
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    v_readlane_b32 s59, v1, 0
-; GFX942-NEXT:    s_xor_saveexec_b64 s[0:1], -1
-; GFX942-NEXT:    s_add_i32 s2, s32, 0x8040
-; GFX942-NEXT:    scratch_load_dword v1, off, s2 ; 4-byte Folded Reload
-; GFX942-NEXT:    s_mov_b64 exec, s[0:1]
-; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %alloca0 = alloca [4096 x i32], align 64, addrspace(5)
   %alloca1 = alloca [4096 x i32], align 4, addrspace(5)
@@ -1613,12 +985,6 @@ define void @scalar_mov_materializes_frame_index_unavailable_scc__gep_sgpr_offse
 ; GFX10_1-LABEL: scalar_mov_materializes_frame_index_unavailable_scc__gep_sgpr_offset:
 ; GFX10_1:       ; %bb.0:
 ; GFX10_1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10_1-NEXT:    s_xor_saveexec_b32 s4, -1
-; GFX10_1-NEXT:    s_add_i32 s5, s32, 0x100800
-; GFX10_1-NEXT:    buffer_store_dword v1, off, s[0:3], s5 ; 4-byte Folded Spill
-; GFX10_1-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX10_1-NEXT:    s_mov_b32 exec_lo, s4
-; GFX10_1-NEXT:    v_writelane_b32 v1, s59, 0
 ; GFX10_1-NEXT:    v_lshrrev_b32_e64 v0, 5, s32
 ; GFX10_1-NEXT:    s_lshl_b32 s4, s16, 2
 ; GFX10_1-NEXT:    s_lshr_b32 s59, s32, 5
@@ -1632,23 +998,11 @@ define void @scalar_mov_materializes_frame_index_unavailable_scc__gep_sgpr_offse
 ; GFX10_1-NEXT:    ;;#ASMSTART
 ; GFX10_1-NEXT:    ; use s59, scc
 ; GFX10_1-NEXT:    ;;#ASMEND
-; GFX10_1-NEXT:    v_readlane_b32 s59, v1, 0
-; GFX10_1-NEXT:    s_xor_saveexec_b32 s4, -1
-; GFX10_1-NEXT:    s_add_i32 s5, s32, 0x100800
-; GFX10_1-NEXT:    buffer_load_dword v1, off, s[0:3], s5 ; 4-byte Folded Reload
-; GFX10_1-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX10_1-NEXT:    s_mov_b32 exec_lo, s4
-; GFX10_1-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10_1-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10_3-LABEL: scalar_mov_materializes_frame_index_unavailable_scc__gep_sgpr_offset:
 ; GFX10_3:       ; %bb.0:
 ; GFX10_3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10_3-NEXT:    s_xor_saveexec_b32 s4, -1
-; GFX10_3-NEXT:    s_add_i32 s5, s32, 0x100800
-; GFX10_3-NEXT:    buffer_store_dword v1, off, s[0:3], s5 ; 4-byte Folded Spill
-; GFX10_3-NEXT:    s_mov_b32 exec_lo, s4
-; GFX10_3-NEXT:    v_writelane_b32 v1, s59, 0
 ; GFX10_3-NEXT:    v_lshrrev_b32_e64 v0, 5, s32
 ; GFX10_3-NEXT:    s_lshl_b32 s4, s16, 2
 ; GFX10_3-NEXT:    s_lshr_b32 s59, s32, 5
@@ -1662,23 +1016,12 @@ define void @scalar_mov_materializes_frame_index_unavailable_scc__gep_sgpr_offse
 ; GFX10_3-NEXT:    ;;#ASMSTART
 ; GFX10_3-NEXT:    ; use s59, scc
 ; GFX10_3-NEXT:    ;;#ASMEND
-; GFX10_3-NEXT:    v_readlane_b32 s59, v1, 0
-; GFX10_3-NEXT:    s_xor_saveexec_b32 s4, -1
-; GFX10_3-NEXT:    s_add_i32 s5, s32, 0x100800
-; GFX10_3-NEXT:    buffer_load_dword v1, off, s[0:3], s5 ; 4-byte Folded Reload
-; GFX10_3-NEXT:    s_mov_b32 exec_lo, s4
-; GFX10_3-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10_3-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: scalar_mov_materializes_frame_index_unavailable_scc__gep_sgpr_offset:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    s_xor_saveexec_b32 s1, -1
-; GFX11-NEXT:    s_add_i32 s2, s32, 0x8040
-; GFX11-NEXT:    scratch_store_b32 off, v1, s2 ; 4-byte Folded Spill
-; GFX11-NEXT:    s_mov_b32 exec_lo, s1
 ; GFX11-NEXT:    s_add_i32 s1, s32, 64
-; GFX11-NEXT:    v_writelane_b32 v1, s59, 0
 ; GFX11-NEXT:    s_lshl_b32 s0, s0, 2
 ; GFX11-NEXT:    v_mov_b32_e32 v0, s1
 ; GFX11-NEXT:    s_add_i32 s59, s32, s0
@@ -1690,12 +1033,6 @@ define void @scalar_mov_materializes_frame_index_unavailable_scc__gep_sgpr_offse
 ; GFX11-NEXT:    ;;#ASMSTART
 ; GFX11-NEXT:    ; use s59, scc
 ; GFX11-NEXT:    ;;#ASMEND
-; GFX11-NEXT:    v_readlane_b32 s59, v1, 0
-; GFX11-NEXT:    s_xor_saveexec_b32 s0, -1
-; GFX11-NEXT:    s_add_i32 s1, s32, 0x8040
-; GFX11-NEXT:    scratch_load_b32 v1, off, s1 ; 4-byte Folded Reload
-; GFX11-NEXT:    s_mov_b32 exec_lo, s0
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX12-LABEL: scalar_mov_materializes_frame_index_unavailable_scc__gep_sgpr_offset:
@@ -1705,11 +1042,6 @@ define void @scalar_mov_materializes_frame_index_unavailable_scc__gep_sgpr_offse
 ; GFX12-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-NEXT:    s_wait_bvhcnt 0x0
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    s_xor_saveexec_b32 s1, -1
-; GFX12-NEXT:    scratch_store_b32 off, v1, s32 offset:32768 ; 4-byte Folded Spill
-; GFX12-NEXT:    s_wait_alu 0xfffe
-; GFX12-NEXT:    s_mov_b32 exec_lo, s1
-; GFX12-NEXT:    v_writelane_b32 v1, s59, 0
 ; GFX12-NEXT:    s_lshl_b32 s0, s0, 2
 ; GFX12-NEXT:    v_mov_b32_e32 v0, s32
 ; GFX12-NEXT:    s_wait_alu 0xfffe
@@ -1723,22 +1055,12 @@ define void @scalar_mov_materializes_frame_index_unavailable_scc__gep_sgpr_offse
 ; GFX12-NEXT:    ;;#ASMSTART
 ; GFX12-NEXT:    ; use s59, scc
 ; GFX12-NEXT:    ;;#ASMEND
-; GFX12-NEXT:    v_readlane_b32 s59, v1, 0
-; GFX12-NEXT:    s_xor_saveexec_b32 s0, -1
-; GFX12-NEXT:    scratch_load_b32 v1, off, s32 offset:32768 ; 4-byte Folded Reload
 ; GFX12-NEXT:    s_wait_alu 0xfffe
-; GFX12-NEXT:    s_mov_b32 exec_lo, s0
-; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: scalar_mov_materializes_frame_index_unavailable_scc__gep_sgpr_offset:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    s_xor_saveexec_b64 s[4:5], -1
-; GFX8-NEXT:    s_add_i32 s6, s32, 0x201000
-; GFX8-NEXT:    buffer_store_dword v1, off, s[0:3], s6 ; 4-byte Folded Spill
-; GFX8-NEXT:    s_mov_b64 exec, s[4:5]
-; GFX8-NEXT:    v_writelane_b32 v1, s59, 0
 ; GFX8-NEXT:    s_lshl_b32 s4, s16, 2
 ; GFX8-NEXT:    s_lshr_b32 s59, s32, 6
 ; GFX8-NEXT:    s_add_i32 s59, s59, s4
@@ -1752,22 +1074,11 @@ define void @scalar_mov_materializes_frame_index_unavailable_scc__gep_sgpr_offse
 ; GFX8-NEXT:    ;;#ASMSTART
 ; GFX8-NEXT:    ; use s59, scc
 ; GFX8-NEXT:    ;;#ASMEND
-; GFX8-NEXT:    v_readlane_b32 s59, v1, 0
-; GFX8-NEXT:    s_xor_saveexec_b64 s[4:5], -1
-; GFX8-NEXT:    s_add_i32 s6, s32, 0x201000
-; GFX8-NEXT:    buffer_load_dword v1, off, s[0:3], s6 ; 4-byte Folded Reload
-; GFX8-NEXT:    s_mov_b64 exec, s[4:5]
-; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-LABEL: scalar_mov_materializes_frame_index_unavailable_scc__gep_sgpr_offset:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-NEXT:    s_xor_saveexec_b64 s[4:5], -1
-; GFX900-NEXT:    s_add_i32 s6, s32, 0x201000
-; GFX900-NEXT:    buffer_store_dword v1, off, s[0:3], s6 ; 4-byte Folded Spill
-; GFX900-NEXT:    s_mov_b64 exec, s[4:5]
-; GFX900-NEXT:    v_writelane_b32 v1, s59, 0
 ; GFX900-NEXT:    s_lshl_b32 s4, s16, 2
 ; GFX900-NEXT:    s_lshr_b32 s59, s32, 6
 ; GFX900-NEXT:    s_add_i32 s59, s59, s4
@@ -1781,23 +1092,12 @@ define void @scalar_mov_materializes_frame_index_unavailable_scc__gep_sgpr_offse
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s59, scc
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    v_readlane_b32 s59, v1, 0
-; GFX900-NEXT:    s_xor_saveexec_b64 s[4:5], -1
-; GFX900-NEXT:    s_add_i32 s6, s32, 0x201000
-; GFX900-NEXT:    buffer_load_dword v1, off, s[0:3], s6 ; 4-byte Folded Reload
-; GFX900-NEXT:    s_mov_b64 exec, s[4:5]
-; GFX900-NEXT:    s_waitcnt vmcnt(0)
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX942-LABEL: scalar_mov_materializes_frame_index_unavailable_scc__gep_sgpr_offset:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT:    s_xor_saveexec_b64 s[2:3], -1
-; GFX942-NEXT:    s_add_i32 s1, s32, 0x8040
-; GFX942-NEXT:    scratch_store_dword off, v1, s1 ; 4-byte Folded Spill
-; GFX942-NEXT:    s_mov_b64 exec, s[2:3]
 ; GFX942-NEXT:    s_lshl_b32 s0, s0, 2
-; GFX942-NEXT:    v_writelane_b32 v1, s59, 0
 ; GFX942-NEXT:    s_add_i32 s59, s32, s0
 ; GFX942-NEXT:    s_addk_i32 s59, 0x4040
 ; GFX942-NEXT:    s_add_i32 s0, s32, 64
@@ -1809,12 +1109,6 @@ define void @scalar_mov_materializes_frame_index_unavailable_scc__gep_sgpr_offse
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s59, scc
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    v_readlane_b32 s59, v1, 0
-; GFX942-NEXT:    s_xor_saveexec_b64 s[0:1], -1
-; GFX942-NEXT:    s_add_i32 s2, s32, 0x8040
-; GFX942-NEXT:    scratch_load_dword v1, off, s2 ; 4-byte Folded Reload
-; GFX942-NEXT:    s_mov_b64 exec, s[0:1]
-; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %alloca0 = alloca [4096 x i32], align 64, addrspace(5)
   %alloca1 = alloca [4096 x i32], align 4, addrspace(5)

diff  --git a/llvm/test/CodeGen/AMDGPU/materialize-frame-index-sgpr.ll b/llvm/test/CodeGen/AMDGPU/materialize-frame-index-sgpr.ll
index 2420393b63ba9..e8dacc93a8f3c 100644
--- a/llvm/test/CodeGen/AMDGPU/materialize-frame-index-sgpr.ll
+++ b/llvm/test/CodeGen/AMDGPU/materialize-frame-index-sgpr.ll
@@ -46,28 +46,17 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs() #0
 ; GFX7-NEXT:    v_writelane_b32 v23, s37, 6
 ; GFX7-NEXT:    v_writelane_b32 v23, s38, 7
 ; GFX7-NEXT:    v_writelane_b32 v23, s39, 8
-; GFX7-NEXT:    v_writelane_b32 v23, s40, 9
-; GFX7-NEXT:    v_writelane_b32 v23, s41, 10
-; GFX7-NEXT:    v_writelane_b32 v23, s42, 11
-; GFX7-NEXT:    v_writelane_b32 v23, s43, 12
-; GFX7-NEXT:    v_writelane_b32 v23, s44, 13
-; GFX7-NEXT:    v_writelane_b32 v23, s45, 14
-; GFX7-NEXT:    v_writelane_b32 v23, s46, 15
-; GFX7-NEXT:    v_writelane_b32 v23, s47, 16
-; GFX7-NEXT:    v_writelane_b32 v23, s48, 17
-; GFX7-NEXT:    v_writelane_b32 v23, s49, 18
-; GFX7-NEXT:    v_writelane_b32 v23, s50, 19
-; GFX7-NEXT:    v_writelane_b32 v23, s51, 20
-; GFX7-NEXT:    v_writelane_b32 v23, s52, 21
-; GFX7-NEXT:    v_writelane_b32 v23, s53, 22
-; GFX7-NEXT:    v_writelane_b32 v23, s54, 23
-; GFX7-NEXT:    v_writelane_b32 v23, s55, 24
-; GFX7-NEXT:    v_writelane_b32 v23, s56, 25
+; GFX7-NEXT:    v_writelane_b32 v23, s48, 9
+; GFX7-NEXT:    v_writelane_b32 v23, s49, 10
+; GFX7-NEXT:    v_writelane_b32 v23, s50, 11
+; GFX7-NEXT:    v_writelane_b32 v23, s51, 12
+; GFX7-NEXT:    v_writelane_b32 v23, s52, 13
+; GFX7-NEXT:    v_writelane_b32 v23, s53, 14
 ; GFX7-NEXT:    v_lshr_b32_e64 v0, s32, 6
-; GFX7-NEXT:    v_writelane_b32 v23, s57, 26
+; GFX7-NEXT:    v_writelane_b32 v23, s54, 15
 ; GFX7-NEXT:    v_add_i32_e32 v0, vcc, 64, v0
 ; GFX7-NEXT:    s_and_b64 s[4:5], 0, exec
-; GFX7-NEXT:    v_writelane_b32 v23, s58, 27
+; GFX7-NEXT:    v_writelane_b32 v23, s55, 16
 ; GFX7-NEXT:    ;;#ASMSTART
 ; GFX7-NEXT:    ; use alloca0 v0
 ; GFX7-NEXT:    ;;#ASMEND
@@ -78,33 +67,20 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs() #0
 ; GFX7-NEXT:    v_mov_b32_e32 v0, 0x4040
 ; GFX7-NEXT:    v_mad_u32_u24 v0, v0, 64, s32
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v0, 6, v0
-; GFX7-NEXT:    v_writelane_b32 v23, s59, 28
 ; GFX7-NEXT:    v_readfirstlane_b32 s59, v0
 ; GFX7-NEXT:    buffer_load_dword v0, off, s[0:3], s32
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    ;;#ASMSTART
 ; GFX7-NEXT:    ; use s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], s58, v[0:15], v[16:22], vcc, s59, scc
 ; GFX7-NEXT:    ;;#ASMEND
-; GFX7-NEXT:    v_readlane_b32 s59, v23, 28
-; GFX7-NEXT:    v_readlane_b32 s58, v23, 27
-; GFX7-NEXT:    v_readlane_b32 s57, v23, 26
-; GFX7-NEXT:    v_readlane_b32 s56, v23, 25
-; GFX7-NEXT:    v_readlane_b32 s55, v23, 24
-; GFX7-NEXT:    v_readlane_b32 s54, v23, 23
-; GFX7-NEXT:    v_readlane_b32 s53, v23, 22
-; GFX7-NEXT:    v_readlane_b32 s52, v23, 21
-; GFX7-NEXT:    v_readlane_b32 s51, v23, 20
-; GFX7-NEXT:    v_readlane_b32 s50, v23, 19
-; GFX7-NEXT:    v_readlane_b32 s49, v23, 18
-; GFX7-NEXT:    v_readlane_b32 s48, v23, 17
-; GFX7-NEXT:    v_readlane_b32 s47, v23, 16
-; GFX7-NEXT:    v_readlane_b32 s46, v23, 15
-; GFX7-NEXT:    v_readlane_b32 s45, v23, 14
-; GFX7-NEXT:    v_readlane_b32 s44, v23, 13
-; GFX7-NEXT:    v_readlane_b32 s43, v23, 12
-; GFX7-NEXT:    v_readlane_b32 s42, v23, 11
-; GFX7-NEXT:    v_readlane_b32 s41, v23, 10
-; GFX7-NEXT:    v_readlane_b32 s40, v23, 9
+; GFX7-NEXT:    v_readlane_b32 s55, v23, 16
+; GFX7-NEXT:    v_readlane_b32 s54, v23, 15
+; GFX7-NEXT:    v_readlane_b32 s53, v23, 14
+; GFX7-NEXT:    v_readlane_b32 s52, v23, 13
+; GFX7-NEXT:    v_readlane_b32 s51, v23, 12
+; GFX7-NEXT:    v_readlane_b32 s50, v23, 11
+; GFX7-NEXT:    v_readlane_b32 s49, v23, 10
+; GFX7-NEXT:    v_readlane_b32 s48, v23, 9
 ; GFX7-NEXT:    v_readlane_b32 s39, v23, 8
 ; GFX7-NEXT:    v_readlane_b32 s38, v23, 7
 ; GFX7-NEXT:    v_readlane_b32 s37, v23, 6
@@ -137,28 +113,17 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs() #0
 ; GFX8-NEXT:    v_writelane_b32 v23, s37, 6
 ; GFX8-NEXT:    v_writelane_b32 v23, s38, 7
 ; GFX8-NEXT:    v_writelane_b32 v23, s39, 8
-; GFX8-NEXT:    v_writelane_b32 v23, s40, 9
-; GFX8-NEXT:    v_writelane_b32 v23, s41, 10
-; GFX8-NEXT:    v_writelane_b32 v23, s42, 11
-; GFX8-NEXT:    v_writelane_b32 v23, s43, 12
-; GFX8-NEXT:    v_writelane_b32 v23, s44, 13
-; GFX8-NEXT:    v_writelane_b32 v23, s45, 14
-; GFX8-NEXT:    v_writelane_b32 v23, s46, 15
-; GFX8-NEXT:    v_writelane_b32 v23, s47, 16
-; GFX8-NEXT:    v_writelane_b32 v23, s48, 17
-; GFX8-NEXT:    v_writelane_b32 v23, s49, 18
-; GFX8-NEXT:    v_writelane_b32 v23, s50, 19
-; GFX8-NEXT:    v_writelane_b32 v23, s51, 20
-; GFX8-NEXT:    v_writelane_b32 v23, s52, 21
-; GFX8-NEXT:    v_writelane_b32 v23, s53, 22
-; GFX8-NEXT:    v_writelane_b32 v23, s54, 23
-; GFX8-NEXT:    v_writelane_b32 v23, s55, 24
-; GFX8-NEXT:    v_writelane_b32 v23, s56, 25
+; GFX8-NEXT:    v_writelane_b32 v23, s48, 9
+; GFX8-NEXT:    v_writelane_b32 v23, s49, 10
+; GFX8-NEXT:    v_writelane_b32 v23, s50, 11
+; GFX8-NEXT:    v_writelane_b32 v23, s51, 12
+; GFX8-NEXT:    v_writelane_b32 v23, s52, 13
+; GFX8-NEXT:    v_writelane_b32 v23, s53, 14
 ; GFX8-NEXT:    v_lshrrev_b32_e64 v0, 6, s32
-; GFX8-NEXT:    v_writelane_b32 v23, s57, 26
+; GFX8-NEXT:    v_writelane_b32 v23, s54, 15
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 64, v0
 ; GFX8-NEXT:    s_and_b64 s[4:5], 0, exec
-; GFX8-NEXT:    v_writelane_b32 v23, s58, 27
+; GFX8-NEXT:    v_writelane_b32 v23, s55, 16
 ; GFX8-NEXT:    ;;#ASMSTART
 ; GFX8-NEXT:    ; use alloca0 v0
 ; GFX8-NEXT:    ;;#ASMEND
@@ -169,33 +134,20 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs() #0
 ; GFX8-NEXT:    v_mov_b32_e32 v0, 0x4040
 ; GFX8-NEXT:    v_mad_u32_u24 v0, v0, 64, s32
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 6, v0
-; GFX8-NEXT:    v_writelane_b32 v23, s59, 28
 ; GFX8-NEXT:    v_readfirstlane_b32 s59, v0
 ; GFX8-NEXT:    buffer_load_dword v0, off, s[0:3], s32
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    ;;#ASMSTART
 ; GFX8-NEXT:    ; use s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], s58, v[0:15], v[16:22], vcc, s59, scc
 ; GFX8-NEXT:    ;;#ASMEND
-; GFX8-NEXT:    v_readlane_b32 s59, v23, 28
-; GFX8-NEXT:    v_readlane_b32 s58, v23, 27
-; GFX8-NEXT:    v_readlane_b32 s57, v23, 26
-; GFX8-NEXT:    v_readlane_b32 s56, v23, 25
-; GFX8-NEXT:    v_readlane_b32 s55, v23, 24
-; GFX8-NEXT:    v_readlane_b32 s54, v23, 23
-; GFX8-NEXT:    v_readlane_b32 s53, v23, 22
-; GFX8-NEXT:    v_readlane_b32 s52, v23, 21
-; GFX8-NEXT:    v_readlane_b32 s51, v23, 20
-; GFX8-NEXT:    v_readlane_b32 s50, v23, 19
-; GFX8-NEXT:    v_readlane_b32 s49, v23, 18
-; GFX8-NEXT:    v_readlane_b32 s48, v23, 17
-; GFX8-NEXT:    v_readlane_b32 s47, v23, 16
-; GFX8-NEXT:    v_readlane_b32 s46, v23, 15
-; GFX8-NEXT:    v_readlane_b32 s45, v23, 14
-; GFX8-NEXT:    v_readlane_b32 s44, v23, 13
-; GFX8-NEXT:    v_readlane_b32 s43, v23, 12
-; GFX8-NEXT:    v_readlane_b32 s42, v23, 11
-; GFX8-NEXT:    v_readlane_b32 s41, v23, 10
-; GFX8-NEXT:    v_readlane_b32 s40, v23, 9
+; GFX8-NEXT:    v_readlane_b32 s55, v23, 16
+; GFX8-NEXT:    v_readlane_b32 s54, v23, 15
+; GFX8-NEXT:    v_readlane_b32 s53, v23, 14
+; GFX8-NEXT:    v_readlane_b32 s52, v23, 13
+; GFX8-NEXT:    v_readlane_b32 s51, v23, 12
+; GFX8-NEXT:    v_readlane_b32 s50, v23, 11
+; GFX8-NEXT:    v_readlane_b32 s49, v23, 10
+; GFX8-NEXT:    v_readlane_b32 s48, v23, 9
 ; GFX8-NEXT:    v_readlane_b32 s39, v23, 8
 ; GFX8-NEXT:    v_readlane_b32 s38, v23, 7
 ; GFX8-NEXT:    v_readlane_b32 s37, v23, 6
@@ -228,28 +180,17 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs() #0
 ; GFX900-NEXT:    v_writelane_b32 v23, s37, 6
 ; GFX900-NEXT:    v_writelane_b32 v23, s38, 7
 ; GFX900-NEXT:    v_writelane_b32 v23, s39, 8
-; GFX900-NEXT:    v_writelane_b32 v23, s40, 9
-; GFX900-NEXT:    v_writelane_b32 v23, s41, 10
-; GFX900-NEXT:    v_writelane_b32 v23, s42, 11
-; GFX900-NEXT:    v_writelane_b32 v23, s43, 12
-; GFX900-NEXT:    v_writelane_b32 v23, s44, 13
-; GFX900-NEXT:    v_writelane_b32 v23, s45, 14
-; GFX900-NEXT:    v_writelane_b32 v23, s46, 15
-; GFX900-NEXT:    v_writelane_b32 v23, s47, 16
-; GFX900-NEXT:    v_writelane_b32 v23, s48, 17
-; GFX900-NEXT:    v_writelane_b32 v23, s49, 18
-; GFX900-NEXT:    v_writelane_b32 v23, s50, 19
-; GFX900-NEXT:    v_writelane_b32 v23, s51, 20
-; GFX900-NEXT:    v_writelane_b32 v23, s52, 21
-; GFX900-NEXT:    v_writelane_b32 v23, s53, 22
-; GFX900-NEXT:    v_writelane_b32 v23, s54, 23
-; GFX900-NEXT:    v_writelane_b32 v23, s55, 24
-; GFX900-NEXT:    v_writelane_b32 v23, s56, 25
+; GFX900-NEXT:    v_writelane_b32 v23, s48, 9
+; GFX900-NEXT:    v_writelane_b32 v23, s49, 10
+; GFX900-NEXT:    v_writelane_b32 v23, s50, 11
+; GFX900-NEXT:    v_writelane_b32 v23, s51, 12
+; GFX900-NEXT:    v_writelane_b32 v23, s52, 13
+; GFX900-NEXT:    v_writelane_b32 v23, s53, 14
 ; GFX900-NEXT:    v_lshrrev_b32_e64 v0, 6, s32
-; GFX900-NEXT:    v_writelane_b32 v23, s57, 26
+; GFX900-NEXT:    v_writelane_b32 v23, s54, 15
 ; GFX900-NEXT:    v_add_u32_e32 v0, 64, v0
 ; GFX900-NEXT:    s_and_b64 s[4:5], 0, exec
-; GFX900-NEXT:    v_writelane_b32 v23, s58, 27
+; GFX900-NEXT:    v_writelane_b32 v23, s55, 16
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use alloca0 v0
 ; GFX900-NEXT:    ;;#ASMEND
@@ -259,33 +200,20 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs() #0
 ; GFX900-NEXT:    buffer_store_dword v0, off, s[0:3], s32
 ; GFX900-NEXT:    v_lshrrev_b32_e64 v0, 6, s32
 ; GFX900-NEXT:    v_add_u32_e32 v0, 0x4040, v0
-; GFX900-NEXT:    v_writelane_b32 v23, s59, 28
 ; GFX900-NEXT:    v_readfirstlane_b32 s59, v0
 ; GFX900-NEXT:    buffer_load_dword v0, off, s[0:3], s32
 ; GFX900-NEXT:    s_waitcnt vmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], s58, v[0:15], v[16:22], vcc, s59, scc
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    v_readlane_b32 s59, v23, 28
-; GFX900-NEXT:    v_readlane_b32 s58, v23, 27
-; GFX900-NEXT:    v_readlane_b32 s57, v23, 26
-; GFX900-NEXT:    v_readlane_b32 s56, v23, 25
-; GFX900-NEXT:    v_readlane_b32 s55, v23, 24
-; GFX900-NEXT:    v_readlane_b32 s54, v23, 23
-; GFX900-NEXT:    v_readlane_b32 s53, v23, 22
-; GFX900-NEXT:    v_readlane_b32 s52, v23, 21
-; GFX900-NEXT:    v_readlane_b32 s51, v23, 20
-; GFX900-NEXT:    v_readlane_b32 s50, v23, 19
-; GFX900-NEXT:    v_readlane_b32 s49, v23, 18
-; GFX900-NEXT:    v_readlane_b32 s48, v23, 17
-; GFX900-NEXT:    v_readlane_b32 s47, v23, 16
-; GFX900-NEXT:    v_readlane_b32 s46, v23, 15
-; GFX900-NEXT:    v_readlane_b32 s45, v23, 14
-; GFX900-NEXT:    v_readlane_b32 s44, v23, 13
-; GFX900-NEXT:    v_readlane_b32 s43, v23, 12
-; GFX900-NEXT:    v_readlane_b32 s42, v23, 11
-; GFX900-NEXT:    v_readlane_b32 s41, v23, 10
-; GFX900-NEXT:    v_readlane_b32 s40, v23, 9
+; GFX900-NEXT:    v_readlane_b32 s55, v23, 16
+; GFX900-NEXT:    v_readlane_b32 s54, v23, 15
+; GFX900-NEXT:    v_readlane_b32 s53, v23, 14
+; GFX900-NEXT:    v_readlane_b32 s52, v23, 13
+; GFX900-NEXT:    v_readlane_b32 s51, v23, 12
+; GFX900-NEXT:    v_readlane_b32 s50, v23, 11
+; GFX900-NEXT:    v_readlane_b32 s49, v23, 10
+; GFX900-NEXT:    v_readlane_b32 s48, v23, 9
 ; GFX900-NEXT:    v_readlane_b32 s39, v23, 8
 ; GFX900-NEXT:    v_readlane_b32 s38, v23, 7
 ; GFX900-NEXT:    v_readlane_b32 s37, v23, 6
@@ -318,31 +246,17 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs() #0
 ; GFX942-NEXT:    v_writelane_b32 v23, s37, 6
 ; GFX942-NEXT:    v_writelane_b32 v23, s38, 7
 ; GFX942-NEXT:    v_writelane_b32 v23, s39, 8
-; GFX942-NEXT:    v_writelane_b32 v23, s40, 9
-; GFX942-NEXT:    v_writelane_b32 v23, s41, 10
-; GFX942-NEXT:    v_writelane_b32 v23, s42, 11
-; GFX942-NEXT:    v_writelane_b32 v23, s43, 12
-; GFX942-NEXT:    v_writelane_b32 v23, s44, 13
-; GFX942-NEXT:    v_writelane_b32 v23, s45, 14
-; GFX942-NEXT:    v_writelane_b32 v23, s46, 15
-; GFX942-NEXT:    v_writelane_b32 v23, s47, 16
-; GFX942-NEXT:    v_writelane_b32 v23, s48, 17
-; GFX942-NEXT:    v_writelane_b32 v23, s49, 18
-; GFX942-NEXT:    v_writelane_b32 v23, s50, 19
-; GFX942-NEXT:    v_writelane_b32 v23, s51, 20
-; GFX942-NEXT:    v_writelane_b32 v23, s52, 21
-; GFX942-NEXT:    v_writelane_b32 v23, s53, 22
-; GFX942-NEXT:    v_writelane_b32 v23, s54, 23
-; GFX942-NEXT:    v_writelane_b32 v23, s55, 24
-; GFX942-NEXT:    v_writelane_b32 v23, s56, 25
-; GFX942-NEXT:    v_writelane_b32 v23, s57, 26
-; GFX942-NEXT:    v_writelane_b32 v23, s58, 27
-; GFX942-NEXT:    v_writelane_b32 v23, s59, 28
-; GFX942-NEXT:    v_writelane_b32 v23, s60, 29
+; GFX942-NEXT:    v_writelane_b32 v23, s48, 9
+; GFX942-NEXT:    v_writelane_b32 v23, s49, 10
+; GFX942-NEXT:    v_writelane_b32 v23, s50, 11
+; GFX942-NEXT:    v_writelane_b32 v23, s51, 12
+; GFX942-NEXT:    v_writelane_b32 v23, s52, 13
+; GFX942-NEXT:    v_writelane_b32 v23, s53, 14
 ; GFX942-NEXT:    s_add_i32 s0, s32, 64
-; GFX942-NEXT:    v_writelane_b32 v23, s61, 30
+; GFX942-NEXT:    v_writelane_b32 v23, s54, 15
 ; GFX942-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX942-NEXT:    s_and_b64 s[60:61], 0, exec
+; GFX942-NEXT:    v_writelane_b32 v23, s55, 16
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use alloca0 v0
 ; GFX942-NEXT:    ;;#ASMEND
@@ -356,28 +270,14 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs() #0
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], s58, v[0:15], v[16:22], vcc, s59, scc
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    v_readlane_b32 s61, v23, 30
-; GFX942-NEXT:    v_readlane_b32 s60, v23, 29
-; GFX942-NEXT:    v_readlane_b32 s59, v23, 28
-; GFX942-NEXT:    v_readlane_b32 s58, v23, 27
-; GFX942-NEXT:    v_readlane_b32 s57, v23, 26
-; GFX942-NEXT:    v_readlane_b32 s56, v23, 25
-; GFX942-NEXT:    v_readlane_b32 s55, v23, 24
-; GFX942-NEXT:    v_readlane_b32 s54, v23, 23
-; GFX942-NEXT:    v_readlane_b32 s53, v23, 22
-; GFX942-NEXT:    v_readlane_b32 s52, v23, 21
-; GFX942-NEXT:    v_readlane_b32 s51, v23, 20
-; GFX942-NEXT:    v_readlane_b32 s50, v23, 19
-; GFX942-NEXT:    v_readlane_b32 s49, v23, 18
-; GFX942-NEXT:    v_readlane_b32 s48, v23, 17
-; GFX942-NEXT:    v_readlane_b32 s47, v23, 16
-; GFX942-NEXT:    v_readlane_b32 s46, v23, 15
-; GFX942-NEXT:    v_readlane_b32 s45, v23, 14
-; GFX942-NEXT:    v_readlane_b32 s44, v23, 13
-; GFX942-NEXT:    v_readlane_b32 s43, v23, 12
-; GFX942-NEXT:    v_readlane_b32 s42, v23, 11
-; GFX942-NEXT:    v_readlane_b32 s41, v23, 10
-; GFX942-NEXT:    v_readlane_b32 s40, v23, 9
+; GFX942-NEXT:    v_readlane_b32 s55, v23, 16
+; GFX942-NEXT:    v_readlane_b32 s54, v23, 15
+; GFX942-NEXT:    v_readlane_b32 s53, v23, 14
+; GFX942-NEXT:    v_readlane_b32 s52, v23, 13
+; GFX942-NEXT:    v_readlane_b32 s51, v23, 12
+; GFX942-NEXT:    v_readlane_b32 s50, v23, 11
+; GFX942-NEXT:    v_readlane_b32 s49, v23, 10
+; GFX942-NEXT:    v_readlane_b32 s48, v23, 9
 ; GFX942-NEXT:    v_readlane_b32 s39, v23, 8
 ; GFX942-NEXT:    v_readlane_b32 s38, v23, 7
 ; GFX942-NEXT:    v_readlane_b32 s37, v23, 6
@@ -417,55 +317,31 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs() #0
 ; GFX10_1-NEXT:    v_writelane_b32 v23, s37, 6
 ; GFX10_1-NEXT:    v_writelane_b32 v23, s38, 7
 ; GFX10_1-NEXT:    v_writelane_b32 v23, s39, 8
-; GFX10_1-NEXT:    v_writelane_b32 v23, s40, 9
-; GFX10_1-NEXT:    v_writelane_b32 v23, s41, 10
-; GFX10_1-NEXT:    v_writelane_b32 v23, s42, 11
-; GFX10_1-NEXT:    v_writelane_b32 v23, s43, 12
-; GFX10_1-NEXT:    v_writelane_b32 v23, s44, 13
-; GFX10_1-NEXT:    v_writelane_b32 v23, s45, 14
-; GFX10_1-NEXT:    v_writelane_b32 v23, s46, 15
-; GFX10_1-NEXT:    v_writelane_b32 v23, s47, 16
-; GFX10_1-NEXT:    v_writelane_b32 v23, s48, 17
-; GFX10_1-NEXT:    v_writelane_b32 v23, s49, 18
-; GFX10_1-NEXT:    v_writelane_b32 v23, s50, 19
-; GFX10_1-NEXT:    v_writelane_b32 v23, s51, 20
-; GFX10_1-NEXT:    v_writelane_b32 v23, s52, 21
-; GFX10_1-NEXT:    v_writelane_b32 v23, s53, 22
-; GFX10_1-NEXT:    v_writelane_b32 v23, s54, 23
-; GFX10_1-NEXT:    v_writelane_b32 v23, s55, 24
-; GFX10_1-NEXT:    v_writelane_b32 v23, s56, 25
-; GFX10_1-NEXT:    v_writelane_b32 v23, s57, 26
-; GFX10_1-NEXT:    v_writelane_b32 v23, s58, 27
+; GFX10_1-NEXT:    v_writelane_b32 v23, s48, 9
+; GFX10_1-NEXT:    v_writelane_b32 v23, s49, 10
+; GFX10_1-NEXT:    v_writelane_b32 v23, s50, 11
+; GFX10_1-NEXT:    v_writelane_b32 v23, s51, 12
+; GFX10_1-NEXT:    v_writelane_b32 v23, s52, 13
+; GFX10_1-NEXT:    v_writelane_b32 v23, s53, 14
+; GFX10_1-NEXT:    v_writelane_b32 v23, s54, 15
+; GFX10_1-NEXT:    v_writelane_b32 v23, s55, 16
 ; GFX10_1-NEXT:    ;;#ASMSTART
 ; GFX10_1-NEXT:    ; def s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], s58, v[0:15], v[16:22], vcc
 ; GFX10_1-NEXT:    ;;#ASMEND
 ; GFX10_1-NEXT:    v_lshrrev_b32_e64 v24, 5, s32
-; GFX10_1-NEXT:    v_writelane_b32 v23, s59, 28
 ; GFX10_1-NEXT:    v_add_nc_u32_e32 v24, 0x4040, v24
 ; GFX10_1-NEXT:    v_readfirstlane_b32 s59, v24
 ; GFX10_1-NEXT:    ;;#ASMSTART
 ; GFX10_1-NEXT:    ; use s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], s58, v[0:15], v[16:22], vcc, s59, scc
 ; GFX10_1-NEXT:    ;;#ASMEND
-; GFX10_1-NEXT:    v_readlane_b32 s59, v23, 28
-; GFX10_1-NEXT:    v_readlane_b32 s58, v23, 27
-; GFX10_1-NEXT:    v_readlane_b32 s57, v23, 26
-; GFX10_1-NEXT:    v_readlane_b32 s56, v23, 25
-; GFX10_1-NEXT:    v_readlane_b32 s55, v23, 24
-; GFX10_1-NEXT:    v_readlane_b32 s54, v23, 23
-; GFX10_1-NEXT:    v_readlane_b32 s53, v23, 22
-; GFX10_1-NEXT:    v_readlane_b32 s52, v23, 21
-; GFX10_1-NEXT:    v_readlane_b32 s51, v23, 20
-; GFX10_1-NEXT:    v_readlane_b32 s50, v23, 19
-; GFX10_1-NEXT:    v_readlane_b32 s49, v23, 18
-; GFX10_1-NEXT:    v_readlane_b32 s48, v23, 17
-; GFX10_1-NEXT:    v_readlane_b32 s47, v23, 16
-; GFX10_1-NEXT:    v_readlane_b32 s46, v23, 15
-; GFX10_1-NEXT:    v_readlane_b32 s45, v23, 14
-; GFX10_1-NEXT:    v_readlane_b32 s44, v23, 13
-; GFX10_1-NEXT:    v_readlane_b32 s43, v23, 12
-; GFX10_1-NEXT:    v_readlane_b32 s42, v23, 11
-; GFX10_1-NEXT:    v_readlane_b32 s41, v23, 10
-; GFX10_1-NEXT:    v_readlane_b32 s40, v23, 9
+; GFX10_1-NEXT:    v_readlane_b32 s55, v23, 16
+; GFX10_1-NEXT:    v_readlane_b32 s54, v23, 15
+; GFX10_1-NEXT:    v_readlane_b32 s53, v23, 14
+; GFX10_1-NEXT:    v_readlane_b32 s52, v23, 13
+; GFX10_1-NEXT:    v_readlane_b32 s51, v23, 12
+; GFX10_1-NEXT:    v_readlane_b32 s50, v23, 11
+; GFX10_1-NEXT:    v_readlane_b32 s49, v23, 10
+; GFX10_1-NEXT:    v_readlane_b32 s48, v23, 9
 ; GFX10_1-NEXT:    v_readlane_b32 s39, v23, 8
 ; GFX10_1-NEXT:    v_readlane_b32 s38, v23, 7
 ; GFX10_1-NEXT:    v_readlane_b32 s37, v23, 6
@@ -505,55 +381,31 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs() #0
 ; GFX10_3-NEXT:    v_writelane_b32 v23, s37, 6
 ; GFX10_3-NEXT:    v_writelane_b32 v23, s38, 7
 ; GFX10_3-NEXT:    v_writelane_b32 v23, s39, 8
-; GFX10_3-NEXT:    v_writelane_b32 v23, s40, 9
-; GFX10_3-NEXT:    v_writelane_b32 v23, s41, 10
-; GFX10_3-NEXT:    v_writelane_b32 v23, s42, 11
-; GFX10_3-NEXT:    v_writelane_b32 v23, s43, 12
-; GFX10_3-NEXT:    v_writelane_b32 v23, s44, 13
-; GFX10_3-NEXT:    v_writelane_b32 v23, s45, 14
-; GFX10_3-NEXT:    v_writelane_b32 v23, s46, 15
-; GFX10_3-NEXT:    v_writelane_b32 v23, s47, 16
-; GFX10_3-NEXT:    v_writelane_b32 v23, s48, 17
-; GFX10_3-NEXT:    v_writelane_b32 v23, s49, 18
-; GFX10_3-NEXT:    v_writelane_b32 v23, s50, 19
-; GFX10_3-NEXT:    v_writelane_b32 v23, s51, 20
-; GFX10_3-NEXT:    v_writelane_b32 v23, s52, 21
-; GFX10_3-NEXT:    v_writelane_b32 v23, s53, 22
-; GFX10_3-NEXT:    v_writelane_b32 v23, s54, 23
-; GFX10_3-NEXT:    v_writelane_b32 v23, s55, 24
-; GFX10_3-NEXT:    v_writelane_b32 v23, s56, 25
-; GFX10_3-NEXT:    v_writelane_b32 v23, s57, 26
-; GFX10_3-NEXT:    v_writelane_b32 v23, s58, 27
+; GFX10_3-NEXT:    v_writelane_b32 v23, s48, 9
+; GFX10_3-NEXT:    v_writelane_b32 v23, s49, 10
+; GFX10_3-NEXT:    v_writelane_b32 v23, s50, 11
+; GFX10_3-NEXT:    v_writelane_b32 v23, s51, 12
+; GFX10_3-NEXT:    v_writelane_b32 v23, s52, 13
+; GFX10_3-NEXT:    v_writelane_b32 v23, s53, 14
+; GFX10_3-NEXT:    v_writelane_b32 v23, s54, 15
+; GFX10_3-NEXT:    v_writelane_b32 v23, s55, 16
 ; GFX10_3-NEXT:    ;;#ASMSTART
 ; GFX10_3-NEXT:    ; def s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], s58, v[0:15], v[16:22], vcc
 ; GFX10_3-NEXT:    ;;#ASMEND
 ; GFX10_3-NEXT:    v_lshrrev_b32_e64 v24, 5, s32
-; GFX10_3-NEXT:    v_writelane_b32 v23, s59, 28
 ; GFX10_3-NEXT:    v_add_nc_u32_e32 v24, 0x4040, v24
 ; GFX10_3-NEXT:    v_readfirstlane_b32 s59, v24
 ; GFX10_3-NEXT:    ;;#ASMSTART
 ; GFX10_3-NEXT:    ; use s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], s58, v[0:15], v[16:22], vcc, s59, scc
 ; GFX10_3-NEXT:    ;;#ASMEND
-; GFX10_3-NEXT:    v_readlane_b32 s59, v23, 28
-; GFX10_3-NEXT:    v_readlane_b32 s58, v23, 27
-; GFX10_3-NEXT:    v_readlane_b32 s57, v23, 26
-; GFX10_3-NEXT:    v_readlane_b32 s56, v23, 25
-; GFX10_3-NEXT:    v_readlane_b32 s55, v23, 24
-; GFX10_3-NEXT:    v_readlane_b32 s54, v23, 23
-; GFX10_3-NEXT:    v_readlane_b32 s53, v23, 22
-; GFX10_3-NEXT:    v_readlane_b32 s52, v23, 21
-; GFX10_3-NEXT:    v_readlane_b32 s51, v23, 20
-; GFX10_3-NEXT:    v_readlane_b32 s50, v23, 19
-; GFX10_3-NEXT:    v_readlane_b32 s49, v23, 18
-; GFX10_3-NEXT:    v_readlane_b32 s48, v23, 17
-; GFX10_3-NEXT:    v_readlane_b32 s47, v23, 16
-; GFX10_3-NEXT:    v_readlane_b32 s46, v23, 15
-; GFX10_3-NEXT:    v_readlane_b32 s45, v23, 14
-; GFX10_3-NEXT:    v_readlane_b32 s44, v23, 13
-; GFX10_3-NEXT:    v_readlane_b32 s43, v23, 12
-; GFX10_3-NEXT:    v_readlane_b32 s42, v23, 11
-; GFX10_3-NEXT:    v_readlane_b32 s41, v23, 10
-; GFX10_3-NEXT:    v_readlane_b32 s40, v23, 9
+; GFX10_3-NEXT:    v_readlane_b32 s55, v23, 16
+; GFX10_3-NEXT:    v_readlane_b32 s54, v23, 15
+; GFX10_3-NEXT:    v_readlane_b32 s53, v23, 14
+; GFX10_3-NEXT:    v_readlane_b32 s52, v23, 13
+; GFX10_3-NEXT:    v_readlane_b32 s51, v23, 12
+; GFX10_3-NEXT:    v_readlane_b32 s50, v23, 11
+; GFX10_3-NEXT:    v_readlane_b32 s49, v23, 10
+; GFX10_3-NEXT:    v_readlane_b32 s48, v23, 9
 ; GFX10_3-NEXT:    v_readlane_b32 s39, v23, 8
 ; GFX10_3-NEXT:    v_readlane_b32 s38, v23, 7
 ; GFX10_3-NEXT:    v_readlane_b32 s37, v23, 6
@@ -593,61 +445,33 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs() #0
 ; GFX11-NEXT:    v_writelane_b32 v23, s37, 6
 ; GFX11-NEXT:    v_writelane_b32 v23, s38, 7
 ; GFX11-NEXT:    v_writelane_b32 v23, s39, 8
-; GFX11-NEXT:    v_writelane_b32 v23, s40, 9
-; GFX11-NEXT:    v_writelane_b32 v23, s41, 10
-; GFX11-NEXT:    v_writelane_b32 v23, s42, 11
-; GFX11-NEXT:    v_writelane_b32 v23, s43, 12
-; GFX11-NEXT:    v_writelane_b32 v23, s44, 13
-; GFX11-NEXT:    v_writelane_b32 v23, s45, 14
-; GFX11-NEXT:    v_writelane_b32 v23, s46, 15
-; GFX11-NEXT:    v_writelane_b32 v23, s47, 16
-; GFX11-NEXT:    v_writelane_b32 v23, s48, 17
-; GFX11-NEXT:    v_writelane_b32 v23, s49, 18
-; GFX11-NEXT:    v_writelane_b32 v23, s50, 19
-; GFX11-NEXT:    v_writelane_b32 v23, s51, 20
-; GFX11-NEXT:    v_writelane_b32 v23, s52, 21
-; GFX11-NEXT:    v_writelane_b32 v23, s53, 22
-; GFX11-NEXT:    v_writelane_b32 v23, s54, 23
-; GFX11-NEXT:    v_writelane_b32 v23, s55, 24
-; GFX11-NEXT:    v_writelane_b32 v23, s56, 25
-; GFX11-NEXT:    v_writelane_b32 v23, s57, 26
-; GFX11-NEXT:    v_writelane_b32 v23, s58, 27
+; GFX11-NEXT:    v_writelane_b32 v23, s48, 9
+; GFX11-NEXT:    v_writelane_b32 v23, s49, 10
+; GFX11-NEXT:    v_writelane_b32 v23, s50, 11
+; GFX11-NEXT:    v_writelane_b32 v23, s51, 12
+; GFX11-NEXT:    v_writelane_b32 v23, s52, 13
+; GFX11-NEXT:    v_writelane_b32 v23, s53, 14
+; GFX11-NEXT:    v_writelane_b32 v23, s54, 15
+; GFX11-NEXT:    v_writelane_b32 v23, s55, 16
 ; GFX11-NEXT:    ;;#ASMSTART
 ; GFX11-NEXT:    ; def s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], s58, v[0:15], v[16:22], vcc
 ; GFX11-NEXT:    ;;#ASMEND
-; GFX11-NEXT:    s_addc_u32 s32, s32, 0x4040
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT:    s_bitcmp1_b32 s32, 0
-; GFX11-NEXT:    v_writelane_b32 v23, s59, 28
-; GFX11-NEXT:    s_bitset0_b32 s32, 0
-; GFX11-NEXT:    s_mov_b32 s59, s32
-; GFX11-NEXT:    s_addc_u32 s32, s32, 0xffffbfc0
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT:    s_bitcmp1_b32 s32, 0
-; GFX11-NEXT:    s_bitset0_b32 s32, 0
+; GFX11-NEXT:    s_addc_u32 s60, s32, 0x4040
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT:    s_bitcmp1_b32 s60, 0
+; GFX11-NEXT:    s_bitset0_b32 s60, 0
+; GFX11-NEXT:    s_mov_b32 s59, s60
 ; GFX11-NEXT:    ;;#ASMSTART
 ; GFX11-NEXT:    ; use s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], s58, v[0:15], v[16:22], vcc, s59, scc
 ; GFX11-NEXT:    ;;#ASMEND
-; GFX11-NEXT:    v_readlane_b32 s59, v23, 28
-; GFX11-NEXT:    v_readlane_b32 s58, v23, 27
-; GFX11-NEXT:    v_readlane_b32 s57, v23, 26
-; GFX11-NEXT:    v_readlane_b32 s56, v23, 25
-; GFX11-NEXT:    v_readlane_b32 s55, v23, 24
-; GFX11-NEXT:    v_readlane_b32 s54, v23, 23
-; GFX11-NEXT:    v_readlane_b32 s53, v23, 22
-; GFX11-NEXT:    v_readlane_b32 s52, v23, 21
-; GFX11-NEXT:    v_readlane_b32 s51, v23, 20
-; GFX11-NEXT:    v_readlane_b32 s50, v23, 19
-; GFX11-NEXT:    v_readlane_b32 s49, v23, 18
-; GFX11-NEXT:    v_readlane_b32 s48, v23, 17
-; GFX11-NEXT:    v_readlane_b32 s47, v23, 16
-; GFX11-NEXT:    v_readlane_b32 s46, v23, 15
-; GFX11-NEXT:    v_readlane_b32 s45, v23, 14
-; GFX11-NEXT:    v_readlane_b32 s44, v23, 13
-; GFX11-NEXT:    v_readlane_b32 s43, v23, 12
-; GFX11-NEXT:    v_readlane_b32 s42, v23, 11
-; GFX11-NEXT:    v_readlane_b32 s41, v23, 10
-; GFX11-NEXT:    v_readlane_b32 s40, v23, 9
+; GFX11-NEXT:    v_readlane_b32 s55, v23, 16
+; GFX11-NEXT:    v_readlane_b32 s54, v23, 15
+; GFX11-NEXT:    v_readlane_b32 s53, v23, 14
+; GFX11-NEXT:    v_readlane_b32 s52, v23, 13
+; GFX11-NEXT:    v_readlane_b32 s51, v23, 12
+; GFX11-NEXT:    v_readlane_b32 s50, v23, 11
+; GFX11-NEXT:    v_readlane_b32 s49, v23, 10
+; GFX11-NEXT:    v_readlane_b32 s48, v23, 9
 ; GFX11-NEXT:    v_readlane_b32 s39, v23, 8
 ; GFX11-NEXT:    v_readlane_b32 s38, v23, 7
 ; GFX11-NEXT:    v_readlane_b32 s37, v23, 6
@@ -689,62 +513,34 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs() #0
 ; GFX12-NEXT:    v_writelane_b32 v23, s37, 6
 ; GFX12-NEXT:    v_writelane_b32 v23, s38, 7
 ; GFX12-NEXT:    v_writelane_b32 v23, s39, 8
-; GFX12-NEXT:    v_writelane_b32 v23, s40, 9
-; GFX12-NEXT:    v_writelane_b32 v23, s41, 10
-; GFX12-NEXT:    v_writelane_b32 v23, s42, 11
-; GFX12-NEXT:    v_writelane_b32 v23, s43, 12
-; GFX12-NEXT:    v_writelane_b32 v23, s44, 13
-; GFX12-NEXT:    v_writelane_b32 v23, s45, 14
-; GFX12-NEXT:    v_writelane_b32 v23, s46, 15
-; GFX12-NEXT:    v_writelane_b32 v23, s47, 16
-; GFX12-NEXT:    v_writelane_b32 v23, s48, 17
-; GFX12-NEXT:    v_writelane_b32 v23, s49, 18
-; GFX12-NEXT:    v_writelane_b32 v23, s50, 19
-; GFX12-NEXT:    v_writelane_b32 v23, s51, 20
-; GFX12-NEXT:    v_writelane_b32 v23, s52, 21
-; GFX12-NEXT:    v_writelane_b32 v23, s53, 22
-; GFX12-NEXT:    v_writelane_b32 v23, s54, 23
-; GFX12-NEXT:    v_writelane_b32 v23, s55, 24
-; GFX12-NEXT:    v_writelane_b32 v23, s56, 25
-; GFX12-NEXT:    v_writelane_b32 v23, s57, 26
-; GFX12-NEXT:    v_writelane_b32 v23, s58, 27
+; GFX12-NEXT:    v_writelane_b32 v23, s48, 9
+; GFX12-NEXT:    v_writelane_b32 v23, s49, 10
+; GFX12-NEXT:    v_writelane_b32 v23, s50, 11
+; GFX12-NEXT:    v_writelane_b32 v23, s51, 12
+; GFX12-NEXT:    v_writelane_b32 v23, s52, 13
+; GFX12-NEXT:    v_writelane_b32 v23, s53, 14
+; GFX12-NEXT:    v_writelane_b32 v23, s54, 15
+; GFX12-NEXT:    v_writelane_b32 v23, s55, 16
 ; GFX12-NEXT:    ;;#ASMSTART
 ; GFX12-NEXT:    ; def s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], s58, v[0:15], v[16:22], vcc
 ; GFX12-NEXT:    ;;#ASMEND
-; GFX12-NEXT:    s_add_co_ci_u32 s32, s32, 0x4000
+; GFX12-NEXT:    s_add_co_ci_u32 s60, s32, 0x4000
 ; GFX12-NEXT:    s_wait_alu 0xfffe
-; GFX12-NEXT:    s_bitcmp1_b32 s32, 0
-; GFX12-NEXT:    v_writelane_b32 v23, s59, 28
-; GFX12-NEXT:    s_bitset0_b32 s32, 0
+; GFX12-NEXT:    s_bitcmp1_b32 s60, 0
+; GFX12-NEXT:    s_bitset0_b32 s60, 0
 ; GFX12-NEXT:    s_wait_alu 0xfffe
-; GFX12-NEXT:    s_mov_b32 s59, s32
-; GFX12-NEXT:    s_add_co_ci_u32 s32, s32, 0xffffc000
-; GFX12-NEXT:    s_wait_alu 0xfffe
-; GFX12-NEXT:    s_bitcmp1_b32 s32, 0
-; GFX12-NEXT:    s_bitset0_b32 s32, 0
+; GFX12-NEXT:    s_mov_b32 s59, s60
 ; GFX12-NEXT:    ;;#ASMSTART
 ; GFX12-NEXT:    ; use s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], s58, v[0:15], v[16:22], vcc, s59, scc
 ; GFX12-NEXT:    ;;#ASMEND
-; GFX12-NEXT:    v_readlane_b32 s59, v23, 28
-; GFX12-NEXT:    v_readlane_b32 s58, v23, 27
-; GFX12-NEXT:    v_readlane_b32 s57, v23, 26
-; GFX12-NEXT:    v_readlane_b32 s56, v23, 25
-; GFX12-NEXT:    v_readlane_b32 s55, v23, 24
-; GFX12-NEXT:    v_readlane_b32 s54, v23, 23
-; GFX12-NEXT:    v_readlane_b32 s53, v23, 22
-; GFX12-NEXT:    v_readlane_b32 s52, v23, 21
-; GFX12-NEXT:    v_readlane_b32 s51, v23, 20
-; GFX12-NEXT:    v_readlane_b32 s50, v23, 19
-; GFX12-NEXT:    v_readlane_b32 s49, v23, 18
-; GFX12-NEXT:    v_readlane_b32 s48, v23, 17
-; GFX12-NEXT:    v_readlane_b32 s47, v23, 16
-; GFX12-NEXT:    v_readlane_b32 s46, v23, 15
-; GFX12-NEXT:    v_readlane_b32 s45, v23, 14
-; GFX12-NEXT:    v_readlane_b32 s44, v23, 13
-; GFX12-NEXT:    v_readlane_b32 s43, v23, 12
-; GFX12-NEXT:    v_readlane_b32 s42, v23, 11
-; GFX12-NEXT:    v_readlane_b32 s41, v23, 10
-; GFX12-NEXT:    v_readlane_b32 s40, v23, 9
+; GFX12-NEXT:    v_readlane_b32 s55, v23, 16
+; GFX12-NEXT:    v_readlane_b32 s54, v23, 15
+; GFX12-NEXT:    v_readlane_b32 s53, v23, 14
+; GFX12-NEXT:    v_readlane_b32 s52, v23, 13
+; GFX12-NEXT:    v_readlane_b32 s51, v23, 12
+; GFX12-NEXT:    v_readlane_b32 s50, v23, 11
+; GFX12-NEXT:    v_readlane_b32 s49, v23, 10
+; GFX12-NEXT:    v_readlane_b32 s48, v23, 9
 ; GFX12-NEXT:    v_readlane_b32 s39, v23, 8
 ; GFX12-NEXT:    v_readlane_b32 s38, v23, 7
 ; GFX12-NEXT:    v_readlane_b32 s37, v23, 6
@@ -819,56 +615,32 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs__lowe
 ; GFX7-NEXT:    v_writelane_b32 v21, s37, 6
 ; GFX7-NEXT:    v_writelane_b32 v21, s38, 7
 ; GFX7-NEXT:    v_writelane_b32 v21, s39, 8
-; GFX7-NEXT:    v_writelane_b32 v21, s40, 9
-; GFX7-NEXT:    v_writelane_b32 v21, s41, 10
-; GFX7-NEXT:    v_writelane_b32 v21, s42, 11
-; GFX7-NEXT:    v_writelane_b32 v21, s43, 12
-; GFX7-NEXT:    v_writelane_b32 v21, s44, 13
-; GFX7-NEXT:    v_writelane_b32 v21, s45, 14
-; GFX7-NEXT:    v_writelane_b32 v21, s46, 15
-; GFX7-NEXT:    v_writelane_b32 v21, s47, 16
-; GFX7-NEXT:    v_writelane_b32 v21, s48, 17
-; GFX7-NEXT:    v_writelane_b32 v21, s49, 18
-; GFX7-NEXT:    v_writelane_b32 v21, s50, 19
-; GFX7-NEXT:    v_writelane_b32 v21, s51, 20
-; GFX7-NEXT:    v_writelane_b32 v21, s52, 21
-; GFX7-NEXT:    v_writelane_b32 v21, s53, 22
-; GFX7-NEXT:    v_writelane_b32 v21, s54, 23
-; GFX7-NEXT:    v_writelane_b32 v21, s55, 24
-; GFX7-NEXT:    v_writelane_b32 v21, s56, 25
-; GFX7-NEXT:    v_writelane_b32 v21, s57, 26
+; GFX7-NEXT:    v_writelane_b32 v21, s48, 9
+; GFX7-NEXT:    v_writelane_b32 v21, s49, 10
+; GFX7-NEXT:    v_writelane_b32 v21, s50, 11
+; GFX7-NEXT:    v_writelane_b32 v21, s51, 12
+; GFX7-NEXT:    v_writelane_b32 v21, s52, 13
+; GFX7-NEXT:    v_writelane_b32 v21, s53, 14
+; GFX7-NEXT:    v_writelane_b32 v21, s54, 15
 ; GFX7-NEXT:    s_and_b64 s[4:5], 0, exec
-; GFX7-NEXT:    v_writelane_b32 v21, s58, 27
+; GFX7-NEXT:    v_writelane_b32 v21, s55, 16
 ; GFX7-NEXT:    ;;#ASMSTART
 ; GFX7-NEXT:    ; def s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], s58, v[0:15], v[16:20], vcc
 ; GFX7-NEXT:    ;;#ASMEND
 ; GFX7-NEXT:    v_mad_u32_u24 v22, 16, 64, s32
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v22, 6, v22
-; GFX7-NEXT:    v_writelane_b32 v21, s59, 28
 ; GFX7-NEXT:    v_readfirstlane_b32 s59, v22
 ; GFX7-NEXT:    ;;#ASMSTART
 ; GFX7-NEXT:    ; use s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], s58, v[0:15], v[16:20], vcc, s59, scc
 ; GFX7-NEXT:    ;;#ASMEND
-; GFX7-NEXT:    v_readlane_b32 s59, v21, 28
-; GFX7-NEXT:    v_readlane_b32 s58, v21, 27
-; GFX7-NEXT:    v_readlane_b32 s57, v21, 26
-; GFX7-NEXT:    v_readlane_b32 s56, v21, 25
-; GFX7-NEXT:    v_readlane_b32 s55, v21, 24
-; GFX7-NEXT:    v_readlane_b32 s54, v21, 23
-; GFX7-NEXT:    v_readlane_b32 s53, v21, 22
-; GFX7-NEXT:    v_readlane_b32 s52, v21, 21
-; GFX7-NEXT:    v_readlane_b32 s51, v21, 20
-; GFX7-NEXT:    v_readlane_b32 s50, v21, 19
-; GFX7-NEXT:    v_readlane_b32 s49, v21, 18
-; GFX7-NEXT:    v_readlane_b32 s48, v21, 17
-; GFX7-NEXT:    v_readlane_b32 s47, v21, 16
-; GFX7-NEXT:    v_readlane_b32 s46, v21, 15
-; GFX7-NEXT:    v_readlane_b32 s45, v21, 14
-; GFX7-NEXT:    v_readlane_b32 s44, v21, 13
-; GFX7-NEXT:    v_readlane_b32 s43, v21, 12
-; GFX7-NEXT:    v_readlane_b32 s42, v21, 11
-; GFX7-NEXT:    v_readlane_b32 s41, v21, 10
-; GFX7-NEXT:    v_readlane_b32 s40, v21, 9
+; GFX7-NEXT:    v_readlane_b32 s55, v21, 16
+; GFX7-NEXT:    v_readlane_b32 s54, v21, 15
+; GFX7-NEXT:    v_readlane_b32 s53, v21, 14
+; GFX7-NEXT:    v_readlane_b32 s52, v21, 13
+; GFX7-NEXT:    v_readlane_b32 s51, v21, 12
+; GFX7-NEXT:    v_readlane_b32 s50, v21, 11
+; GFX7-NEXT:    v_readlane_b32 s49, v21, 10
+; GFX7-NEXT:    v_readlane_b32 s48, v21, 9
 ; GFX7-NEXT:    v_readlane_b32 s39, v21, 8
 ; GFX7-NEXT:    v_readlane_b32 s38, v21, 7
 ; GFX7-NEXT:    v_readlane_b32 s37, v21, 6
@@ -901,56 +673,32 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs__lowe
 ; GFX8-NEXT:    v_writelane_b32 v21, s37, 6
 ; GFX8-NEXT:    v_writelane_b32 v21, s38, 7
 ; GFX8-NEXT:    v_writelane_b32 v21, s39, 8
-; GFX8-NEXT:    v_writelane_b32 v21, s40, 9
-; GFX8-NEXT:    v_writelane_b32 v21, s41, 10
-; GFX8-NEXT:    v_writelane_b32 v21, s42, 11
-; GFX8-NEXT:    v_writelane_b32 v21, s43, 12
-; GFX8-NEXT:    v_writelane_b32 v21, s44, 13
-; GFX8-NEXT:    v_writelane_b32 v21, s45, 14
-; GFX8-NEXT:    v_writelane_b32 v21, s46, 15
-; GFX8-NEXT:    v_writelane_b32 v21, s47, 16
-; GFX8-NEXT:    v_writelane_b32 v21, s48, 17
-; GFX8-NEXT:    v_writelane_b32 v21, s49, 18
-; GFX8-NEXT:    v_writelane_b32 v21, s50, 19
-; GFX8-NEXT:    v_writelane_b32 v21, s51, 20
-; GFX8-NEXT:    v_writelane_b32 v21, s52, 21
-; GFX8-NEXT:    v_writelane_b32 v21, s53, 22
-; GFX8-NEXT:    v_writelane_b32 v21, s54, 23
-; GFX8-NEXT:    v_writelane_b32 v21, s55, 24
-; GFX8-NEXT:    v_writelane_b32 v21, s56, 25
-; GFX8-NEXT:    v_writelane_b32 v21, s57, 26
+; GFX8-NEXT:    v_writelane_b32 v21, s48, 9
+; GFX8-NEXT:    v_writelane_b32 v21, s49, 10
+; GFX8-NEXT:    v_writelane_b32 v21, s50, 11
+; GFX8-NEXT:    v_writelane_b32 v21, s51, 12
+; GFX8-NEXT:    v_writelane_b32 v21, s52, 13
+; GFX8-NEXT:    v_writelane_b32 v21, s53, 14
+; GFX8-NEXT:    v_writelane_b32 v21, s54, 15
 ; GFX8-NEXT:    s_and_b64 s[4:5], 0, exec
-; GFX8-NEXT:    v_writelane_b32 v21, s58, 27
+; GFX8-NEXT:    v_writelane_b32 v21, s55, 16
 ; GFX8-NEXT:    ;;#ASMSTART
 ; GFX8-NEXT:    ; def s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], s58, v[0:15], v[16:20], vcc
 ; GFX8-NEXT:    ;;#ASMEND
 ; GFX8-NEXT:    v_mad_u32_u24 v22, 16, 64, s32
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v22, 6, v22
-; GFX8-NEXT:    v_writelane_b32 v21, s59, 28
 ; GFX8-NEXT:    v_readfirstlane_b32 s59, v22
 ; GFX8-NEXT:    ;;#ASMSTART
 ; GFX8-NEXT:    ; use s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], s58, v[0:15], v[16:20], vcc, s59, scc
 ; GFX8-NEXT:    ;;#ASMEND
-; GFX8-NEXT:    v_readlane_b32 s59, v21, 28
-; GFX8-NEXT:    v_readlane_b32 s58, v21, 27
-; GFX8-NEXT:    v_readlane_b32 s57, v21, 26
-; GFX8-NEXT:    v_readlane_b32 s56, v21, 25
-; GFX8-NEXT:    v_readlane_b32 s55, v21, 24
-; GFX8-NEXT:    v_readlane_b32 s54, v21, 23
-; GFX8-NEXT:    v_readlane_b32 s53, v21, 22
-; GFX8-NEXT:    v_readlane_b32 s52, v21, 21
-; GFX8-NEXT:    v_readlane_b32 s51, v21, 20
-; GFX8-NEXT:    v_readlane_b32 s50, v21, 19
-; GFX8-NEXT:    v_readlane_b32 s49, v21, 18
-; GFX8-NEXT:    v_readlane_b32 s48, v21, 17
-; GFX8-NEXT:    v_readlane_b32 s47, v21, 16
-; GFX8-NEXT:    v_readlane_b32 s46, v21, 15
-; GFX8-NEXT:    v_readlane_b32 s45, v21, 14
-; GFX8-NEXT:    v_readlane_b32 s44, v21, 13
-; GFX8-NEXT:    v_readlane_b32 s43, v21, 12
-; GFX8-NEXT:    v_readlane_b32 s42, v21, 11
-; GFX8-NEXT:    v_readlane_b32 s41, v21, 10
-; GFX8-NEXT:    v_readlane_b32 s40, v21, 9
+; GFX8-NEXT:    v_readlane_b32 s55, v21, 16
+; GFX8-NEXT:    v_readlane_b32 s54, v21, 15
+; GFX8-NEXT:    v_readlane_b32 s53, v21, 14
+; GFX8-NEXT:    v_readlane_b32 s52, v21, 13
+; GFX8-NEXT:    v_readlane_b32 s51, v21, 12
+; GFX8-NEXT:    v_readlane_b32 s50, v21, 11
+; GFX8-NEXT:    v_readlane_b32 s49, v21, 10
+; GFX8-NEXT:    v_readlane_b32 s48, v21, 9
 ; GFX8-NEXT:    v_readlane_b32 s39, v21, 8
 ; GFX8-NEXT:    v_readlane_b32 s38, v21, 7
 ; GFX8-NEXT:    v_readlane_b32 s37, v21, 6
@@ -983,56 +731,32 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs__lowe
 ; GFX900-NEXT:    v_writelane_b32 v21, s37, 6
 ; GFX900-NEXT:    v_writelane_b32 v21, s38, 7
 ; GFX900-NEXT:    v_writelane_b32 v21, s39, 8
-; GFX900-NEXT:    v_writelane_b32 v21, s40, 9
-; GFX900-NEXT:    v_writelane_b32 v21, s41, 10
-; GFX900-NEXT:    v_writelane_b32 v21, s42, 11
-; GFX900-NEXT:    v_writelane_b32 v21, s43, 12
-; GFX900-NEXT:    v_writelane_b32 v21, s44, 13
-; GFX900-NEXT:    v_writelane_b32 v21, s45, 14
-; GFX900-NEXT:    v_writelane_b32 v21, s46, 15
-; GFX900-NEXT:    v_writelane_b32 v21, s47, 16
-; GFX900-NEXT:    v_writelane_b32 v21, s48, 17
-; GFX900-NEXT:    v_writelane_b32 v21, s49, 18
-; GFX900-NEXT:    v_writelane_b32 v21, s50, 19
-; GFX900-NEXT:    v_writelane_b32 v21, s51, 20
-; GFX900-NEXT:    v_writelane_b32 v21, s52, 21
-; GFX900-NEXT:    v_writelane_b32 v21, s53, 22
-; GFX900-NEXT:    v_writelane_b32 v21, s54, 23
-; GFX900-NEXT:    v_writelane_b32 v21, s55, 24
-; GFX900-NEXT:    v_writelane_b32 v21, s56, 25
-; GFX900-NEXT:    v_writelane_b32 v21, s57, 26
+; GFX900-NEXT:    v_writelane_b32 v21, s48, 9
+; GFX900-NEXT:    v_writelane_b32 v21, s49, 10
+; GFX900-NEXT:    v_writelane_b32 v21, s50, 11
+; GFX900-NEXT:    v_writelane_b32 v21, s51, 12
+; GFX900-NEXT:    v_writelane_b32 v21, s52, 13
+; GFX900-NEXT:    v_writelane_b32 v21, s53, 14
+; GFX900-NEXT:    v_writelane_b32 v21, s54, 15
 ; GFX900-NEXT:    s_and_b64 s[4:5], 0, exec
-; GFX900-NEXT:    v_writelane_b32 v21, s58, 27
+; GFX900-NEXT:    v_writelane_b32 v21, s55, 16
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], s58, v[0:15], v[16:20], vcc
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    v_lshrrev_b32_e64 v22, 6, s32
 ; GFX900-NEXT:    v_add_u32_e32 v22, 16, v22
-; GFX900-NEXT:    v_writelane_b32 v21, s59, 28
 ; GFX900-NEXT:    v_readfirstlane_b32 s59, v22
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], s58, v[0:15], v[16:20], vcc, s59, scc
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    v_readlane_b32 s59, v21, 28
-; GFX900-NEXT:    v_readlane_b32 s58, v21, 27
-; GFX900-NEXT:    v_readlane_b32 s57, v21, 26
-; GFX900-NEXT:    v_readlane_b32 s56, v21, 25
-; GFX900-NEXT:    v_readlane_b32 s55, v21, 24
-; GFX900-NEXT:    v_readlane_b32 s54, v21, 23
-; GFX900-NEXT:    v_readlane_b32 s53, v21, 22
-; GFX900-NEXT:    v_readlane_b32 s52, v21, 21
-; GFX900-NEXT:    v_readlane_b32 s51, v21, 20
-; GFX900-NEXT:    v_readlane_b32 s50, v21, 19
-; GFX900-NEXT:    v_readlane_b32 s49, v21, 18
-; GFX900-NEXT:    v_readlane_b32 s48, v21, 17
-; GFX900-NEXT:    v_readlane_b32 s47, v21, 16
-; GFX900-NEXT:    v_readlane_b32 s46, v21, 15
-; GFX900-NEXT:    v_readlane_b32 s45, v21, 14
-; GFX900-NEXT:    v_readlane_b32 s44, v21, 13
-; GFX900-NEXT:    v_readlane_b32 s43, v21, 12
-; GFX900-NEXT:    v_readlane_b32 s42, v21, 11
-; GFX900-NEXT:    v_readlane_b32 s41, v21, 10
-; GFX900-NEXT:    v_readlane_b32 s40, v21, 9
+; GFX900-NEXT:    v_readlane_b32 s55, v21, 16
+; GFX900-NEXT:    v_readlane_b32 s54, v21, 15
+; GFX900-NEXT:    v_readlane_b32 s53, v21, 14
+; GFX900-NEXT:    v_readlane_b32 s52, v21, 13
+; GFX900-NEXT:    v_readlane_b32 s51, v21, 12
+; GFX900-NEXT:    v_readlane_b32 s50, v21, 11
+; GFX900-NEXT:    v_readlane_b32 s49, v21, 10
+; GFX900-NEXT:    v_readlane_b32 s48, v21, 9
 ; GFX900-NEXT:    v_readlane_b32 s39, v21, 8
 ; GFX900-NEXT:    v_readlane_b32 s38, v21, 7
 ; GFX900-NEXT:    v_readlane_b32 s37, v21, 6
@@ -1065,29 +789,15 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs__lowe
 ; GFX942-NEXT:    v_writelane_b32 v21, s37, 6
 ; GFX942-NEXT:    v_writelane_b32 v21, s38, 7
 ; GFX942-NEXT:    v_writelane_b32 v21, s39, 8
-; GFX942-NEXT:    v_writelane_b32 v21, s40, 9
-; GFX942-NEXT:    v_writelane_b32 v21, s41, 10
-; GFX942-NEXT:    v_writelane_b32 v21, s42, 11
-; GFX942-NEXT:    v_writelane_b32 v21, s43, 12
-; GFX942-NEXT:    v_writelane_b32 v21, s44, 13
-; GFX942-NEXT:    v_writelane_b32 v21, s45, 14
-; GFX942-NEXT:    v_writelane_b32 v21, s46, 15
-; GFX942-NEXT:    v_writelane_b32 v21, s47, 16
-; GFX942-NEXT:    v_writelane_b32 v21, s48, 17
-; GFX942-NEXT:    v_writelane_b32 v21, s49, 18
-; GFX942-NEXT:    v_writelane_b32 v21, s50, 19
-; GFX942-NEXT:    v_writelane_b32 v21, s51, 20
-; GFX942-NEXT:    v_writelane_b32 v21, s52, 21
-; GFX942-NEXT:    v_writelane_b32 v21, s53, 22
-; GFX942-NEXT:    v_writelane_b32 v21, s54, 23
-; GFX942-NEXT:    v_writelane_b32 v21, s55, 24
-; GFX942-NEXT:    v_writelane_b32 v21, s56, 25
-; GFX942-NEXT:    v_writelane_b32 v21, s57, 26
-; GFX942-NEXT:    v_writelane_b32 v21, s58, 27
-; GFX942-NEXT:    v_writelane_b32 v21, s59, 28
-; GFX942-NEXT:    v_writelane_b32 v21, s60, 29
-; GFX942-NEXT:    v_writelane_b32 v21, s61, 30
+; GFX942-NEXT:    v_writelane_b32 v21, s48, 9
+; GFX942-NEXT:    v_writelane_b32 v21, s49, 10
+; GFX942-NEXT:    v_writelane_b32 v21, s50, 11
+; GFX942-NEXT:    v_writelane_b32 v21, s51, 12
+; GFX942-NEXT:    v_writelane_b32 v21, s52, 13
+; GFX942-NEXT:    v_writelane_b32 v21, s53, 14
+; GFX942-NEXT:    v_writelane_b32 v21, s54, 15
 ; GFX942-NEXT:    s_and_b64 s[60:61], 0, exec
+; GFX942-NEXT:    v_writelane_b32 v21, s55, 16
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], s58, v[0:15], v[16:20], vcc
 ; GFX942-NEXT:    ;;#ASMEND
@@ -1098,28 +808,14 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs__lowe
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], s58, v[0:15], v[16:20], vcc, s59, scc
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    v_readlane_b32 s61, v21, 30
-; GFX942-NEXT:    v_readlane_b32 s60, v21, 29
-; GFX942-NEXT:    v_readlane_b32 s59, v21, 28
-; GFX942-NEXT:    v_readlane_b32 s58, v21, 27
-; GFX942-NEXT:    v_readlane_b32 s57, v21, 26
-; GFX942-NEXT:    v_readlane_b32 s56, v21, 25
-; GFX942-NEXT:    v_readlane_b32 s55, v21, 24
-; GFX942-NEXT:    v_readlane_b32 s54, v21, 23
-; GFX942-NEXT:    v_readlane_b32 s53, v21, 22
-; GFX942-NEXT:    v_readlane_b32 s52, v21, 21
-; GFX942-NEXT:    v_readlane_b32 s51, v21, 20
-; GFX942-NEXT:    v_readlane_b32 s50, v21, 19
-; GFX942-NEXT:    v_readlane_b32 s49, v21, 18
-; GFX942-NEXT:    v_readlane_b32 s48, v21, 17
-; GFX942-NEXT:    v_readlane_b32 s47, v21, 16
-; GFX942-NEXT:    v_readlane_b32 s46, v21, 15
-; GFX942-NEXT:    v_readlane_b32 s45, v21, 14
-; GFX942-NEXT:    v_readlane_b32 s44, v21, 13
-; GFX942-NEXT:    v_readlane_b32 s43, v21, 12
-; GFX942-NEXT:    v_readlane_b32 s42, v21, 11
-; GFX942-NEXT:    v_readlane_b32 s41, v21, 10
-; GFX942-NEXT:    v_readlane_b32 s40, v21, 9
+; GFX942-NEXT:    v_readlane_b32 s55, v21, 16
+; GFX942-NEXT:    v_readlane_b32 s54, v21, 15
+; GFX942-NEXT:    v_readlane_b32 s53, v21, 14
+; GFX942-NEXT:    v_readlane_b32 s52, v21, 13
+; GFX942-NEXT:    v_readlane_b32 s51, v21, 12
+; GFX942-NEXT:    v_readlane_b32 s50, v21, 11
+; GFX942-NEXT:    v_readlane_b32 s49, v21, 10
+; GFX942-NEXT:    v_readlane_b32 s48, v21, 9
 ; GFX942-NEXT:    v_readlane_b32 s39, v21, 8
 ; GFX942-NEXT:    v_readlane_b32 s38, v21, 7
 ; GFX942-NEXT:    v_readlane_b32 s37, v21, 6
@@ -1145,6 +841,7 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs__lowe
 ; GFX10_1-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10_1-NEXT:    s_mov_b32 exec_lo, s4
 ; GFX10_1-NEXT:    v_writelane_b32 v21, s30, 0
+; GFX10_1-NEXT:    s_and_b32 s59, 0, exec_lo
 ; GFX10_1-NEXT:    v_writelane_b32 v21, s31, 1
 ; GFX10_1-NEXT:    v_writelane_b32 v21, s33, 2
 ; GFX10_1-NEXT:    v_writelane_b32 v21, s34, 3
@@ -1153,56 +850,31 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs__lowe
 ; GFX10_1-NEXT:    v_writelane_b32 v21, s37, 6
 ; GFX10_1-NEXT:    v_writelane_b32 v21, s38, 7
 ; GFX10_1-NEXT:    v_writelane_b32 v21, s39, 8
-; GFX10_1-NEXT:    v_writelane_b32 v21, s40, 9
-; GFX10_1-NEXT:    v_writelane_b32 v21, s41, 10
-; GFX10_1-NEXT:    v_writelane_b32 v21, s42, 11
-; GFX10_1-NEXT:    v_writelane_b32 v21, s43, 12
-; GFX10_1-NEXT:    v_writelane_b32 v21, s44, 13
-; GFX10_1-NEXT:    v_writelane_b32 v21, s45, 14
-; GFX10_1-NEXT:    v_writelane_b32 v21, s46, 15
-; GFX10_1-NEXT:    v_writelane_b32 v21, s47, 16
-; GFX10_1-NEXT:    v_writelane_b32 v21, s48, 17
-; GFX10_1-NEXT:    v_writelane_b32 v21, s49, 18
-; GFX10_1-NEXT:    v_writelane_b32 v21, s50, 19
-; GFX10_1-NEXT:    v_writelane_b32 v21, s51, 20
-; GFX10_1-NEXT:    v_writelane_b32 v21, s52, 21
-; GFX10_1-NEXT:    v_writelane_b32 v21, s53, 22
-; GFX10_1-NEXT:    v_writelane_b32 v21, s54, 23
-; GFX10_1-NEXT:    v_writelane_b32 v21, s55, 24
-; GFX10_1-NEXT:    v_writelane_b32 v21, s56, 25
-; GFX10_1-NEXT:    v_writelane_b32 v21, s57, 26
-; GFX10_1-NEXT:    v_writelane_b32 v21, s58, 27
+; GFX10_1-NEXT:    v_writelane_b32 v21, s48, 9
+; GFX10_1-NEXT:    v_writelane_b32 v21, s49, 10
+; GFX10_1-NEXT:    v_writelane_b32 v21, s50, 11
+; GFX10_1-NEXT:    v_writelane_b32 v21, s51, 12
+; GFX10_1-NEXT:    v_writelane_b32 v21, s52, 13
+; GFX10_1-NEXT:    v_writelane_b32 v21, s53, 14
+; GFX10_1-NEXT:    v_writelane_b32 v21, s54, 15
+; GFX10_1-NEXT:    v_writelane_b32 v21, s55, 16
 ; GFX10_1-NEXT:    ;;#ASMSTART
 ; GFX10_1-NEXT:    ; def s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], s58, v[0:15], v[16:20], vcc
 ; GFX10_1-NEXT:    ;;#ASMEND
 ; GFX10_1-NEXT:    v_lshrrev_b32_e64 v22, 5, s32
-; GFX10_1-NEXT:    v_writelane_b32 v21, s59, 28
-; GFX10_1-NEXT:    s_and_b32 s59, 0, exec_lo
 ; GFX10_1-NEXT:    v_add_nc_u32_e32 v22, 16, v22
 ; GFX10_1-NEXT:    v_readfirstlane_b32 s59, v22
 ; GFX10_1-NEXT:    ;;#ASMSTART
 ; GFX10_1-NEXT:    ; use s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], s58, v[0:15], v[16:20], vcc, s59, scc
 ; GFX10_1-NEXT:    ;;#ASMEND
-; GFX10_1-NEXT:    v_readlane_b32 s59, v21, 28
-; GFX10_1-NEXT:    v_readlane_b32 s58, v21, 27
-; GFX10_1-NEXT:    v_readlane_b32 s57, v21, 26
-; GFX10_1-NEXT:    v_readlane_b32 s56, v21, 25
-; GFX10_1-NEXT:    v_readlane_b32 s55, v21, 24
-; GFX10_1-NEXT:    v_readlane_b32 s54, v21, 23
-; GFX10_1-NEXT:    v_readlane_b32 s53, v21, 22
-; GFX10_1-NEXT:    v_readlane_b32 s52, v21, 21
-; GFX10_1-NEXT:    v_readlane_b32 s51, v21, 20
-; GFX10_1-NEXT:    v_readlane_b32 s50, v21, 19
-; GFX10_1-NEXT:    v_readlane_b32 s49, v21, 18
-; GFX10_1-NEXT:    v_readlane_b32 s48, v21, 17
-; GFX10_1-NEXT:    v_readlane_b32 s47, v21, 16
-; GFX10_1-NEXT:    v_readlane_b32 s46, v21, 15
-; GFX10_1-NEXT:    v_readlane_b32 s45, v21, 14
-; GFX10_1-NEXT:    v_readlane_b32 s44, v21, 13
-; GFX10_1-NEXT:    v_readlane_b32 s43, v21, 12
-; GFX10_1-NEXT:    v_readlane_b32 s42, v21, 11
-; GFX10_1-NEXT:    v_readlane_b32 s41, v21, 10
-; GFX10_1-NEXT:    v_readlane_b32 s40, v21, 9
+; GFX10_1-NEXT:    v_readlane_b32 s55, v21, 16
+; GFX10_1-NEXT:    v_readlane_b32 s54, v21, 15
+; GFX10_1-NEXT:    v_readlane_b32 s53, v21, 14
+; GFX10_1-NEXT:    v_readlane_b32 s52, v21, 13
+; GFX10_1-NEXT:    v_readlane_b32 s51, v21, 12
+; GFX10_1-NEXT:    v_readlane_b32 s50, v21, 11
+; GFX10_1-NEXT:    v_readlane_b32 s49, v21, 10
+; GFX10_1-NEXT:    v_readlane_b32 s48, v21, 9
 ; GFX10_1-NEXT:    v_readlane_b32 s39, v21, 8
 ; GFX10_1-NEXT:    v_readlane_b32 s38, v21, 7
 ; GFX10_1-NEXT:    v_readlane_b32 s37, v21, 6
@@ -1228,6 +900,7 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs__lowe
 ; GFX10_3-NEXT:    buffer_store_dword v21, off, s[0:3], s5 ; 4-byte Folded Spill
 ; GFX10_3-NEXT:    s_mov_b32 exec_lo, s4
 ; GFX10_3-NEXT:    v_writelane_b32 v21, s30, 0
+; GFX10_3-NEXT:    s_and_b32 s59, 0, exec_lo
 ; GFX10_3-NEXT:    v_writelane_b32 v21, s31, 1
 ; GFX10_3-NEXT:    v_writelane_b32 v21, s33, 2
 ; GFX10_3-NEXT:    v_writelane_b32 v21, s34, 3
@@ -1236,56 +909,31 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs__lowe
 ; GFX10_3-NEXT:    v_writelane_b32 v21, s37, 6
 ; GFX10_3-NEXT:    v_writelane_b32 v21, s38, 7
 ; GFX10_3-NEXT:    v_writelane_b32 v21, s39, 8
-; GFX10_3-NEXT:    v_writelane_b32 v21, s40, 9
-; GFX10_3-NEXT:    v_writelane_b32 v21, s41, 10
-; GFX10_3-NEXT:    v_writelane_b32 v21, s42, 11
-; GFX10_3-NEXT:    v_writelane_b32 v21, s43, 12
-; GFX10_3-NEXT:    v_writelane_b32 v21, s44, 13
-; GFX10_3-NEXT:    v_writelane_b32 v21, s45, 14
-; GFX10_3-NEXT:    v_writelane_b32 v21, s46, 15
-; GFX10_3-NEXT:    v_writelane_b32 v21, s47, 16
-; GFX10_3-NEXT:    v_writelane_b32 v21, s48, 17
-; GFX10_3-NEXT:    v_writelane_b32 v21, s49, 18
-; GFX10_3-NEXT:    v_writelane_b32 v21, s50, 19
-; GFX10_3-NEXT:    v_writelane_b32 v21, s51, 20
-; GFX10_3-NEXT:    v_writelane_b32 v21, s52, 21
-; GFX10_3-NEXT:    v_writelane_b32 v21, s53, 22
-; GFX10_3-NEXT:    v_writelane_b32 v21, s54, 23
-; GFX10_3-NEXT:    v_writelane_b32 v21, s55, 24
-; GFX10_3-NEXT:    v_writelane_b32 v21, s56, 25
-; GFX10_3-NEXT:    v_writelane_b32 v21, s57, 26
-; GFX10_3-NEXT:    v_writelane_b32 v21, s58, 27
+; GFX10_3-NEXT:    v_writelane_b32 v21, s48, 9
+; GFX10_3-NEXT:    v_writelane_b32 v21, s49, 10
+; GFX10_3-NEXT:    v_writelane_b32 v21, s50, 11
+; GFX10_3-NEXT:    v_writelane_b32 v21, s51, 12
+; GFX10_3-NEXT:    v_writelane_b32 v21, s52, 13
+; GFX10_3-NEXT:    v_writelane_b32 v21, s53, 14
+; GFX10_3-NEXT:    v_writelane_b32 v21, s54, 15
+; GFX10_3-NEXT:    v_writelane_b32 v21, s55, 16
 ; GFX10_3-NEXT:    ;;#ASMSTART
 ; GFX10_3-NEXT:    ; def s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], s58, v[0:15], v[16:20], vcc
 ; GFX10_3-NEXT:    ;;#ASMEND
 ; GFX10_3-NEXT:    v_lshrrev_b32_e64 v22, 5, s32
-; GFX10_3-NEXT:    v_writelane_b32 v21, s59, 28
-; GFX10_3-NEXT:    s_and_b32 s59, 0, exec_lo
 ; GFX10_3-NEXT:    v_add_nc_u32_e32 v22, 16, v22
 ; GFX10_3-NEXT:    v_readfirstlane_b32 s59, v22
 ; GFX10_3-NEXT:    ;;#ASMSTART
 ; GFX10_3-NEXT:    ; use s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], s58, v[0:15], v[16:20], vcc, s59, scc
 ; GFX10_3-NEXT:    ;;#ASMEND
-; GFX10_3-NEXT:    v_readlane_b32 s59, v21, 28
-; GFX10_3-NEXT:    v_readlane_b32 s58, v21, 27
-; GFX10_3-NEXT:    v_readlane_b32 s57, v21, 26
-; GFX10_3-NEXT:    v_readlane_b32 s56, v21, 25
-; GFX10_3-NEXT:    v_readlane_b32 s55, v21, 24
-; GFX10_3-NEXT:    v_readlane_b32 s54, v21, 23
-; GFX10_3-NEXT:    v_readlane_b32 s53, v21, 22
-; GFX10_3-NEXT:    v_readlane_b32 s52, v21, 21
-; GFX10_3-NEXT:    v_readlane_b32 s51, v21, 20
-; GFX10_3-NEXT:    v_readlane_b32 s50, v21, 19
-; GFX10_3-NEXT:    v_readlane_b32 s49, v21, 18
-; GFX10_3-NEXT:    v_readlane_b32 s48, v21, 17
-; GFX10_3-NEXT:    v_readlane_b32 s47, v21, 16
-; GFX10_3-NEXT:    v_readlane_b32 s46, v21, 15
-; GFX10_3-NEXT:    v_readlane_b32 s45, v21, 14
-; GFX10_3-NEXT:    v_readlane_b32 s44, v21, 13
-; GFX10_3-NEXT:    v_readlane_b32 s43, v21, 12
-; GFX10_3-NEXT:    v_readlane_b32 s42, v21, 11
-; GFX10_3-NEXT:    v_readlane_b32 s41, v21, 10
-; GFX10_3-NEXT:    v_readlane_b32 s40, v21, 9
+; GFX10_3-NEXT:    v_readlane_b32 s55, v21, 16
+; GFX10_3-NEXT:    v_readlane_b32 s54, v21, 15
+; GFX10_3-NEXT:    v_readlane_b32 s53, v21, 14
+; GFX10_3-NEXT:    v_readlane_b32 s52, v21, 13
+; GFX10_3-NEXT:    v_readlane_b32 s51, v21, 12
+; GFX10_3-NEXT:    v_readlane_b32 s50, v21, 11
+; GFX10_3-NEXT:    v_readlane_b32 s49, v21, 10
+; GFX10_3-NEXT:    v_readlane_b32 s48, v21, 9
 ; GFX10_3-NEXT:    v_readlane_b32 s39, v21, 8
 ; GFX10_3-NEXT:    v_readlane_b32 s38, v21, 7
 ; GFX10_3-NEXT:    v_readlane_b32 s37, v21, 6
@@ -1310,6 +958,7 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs__lowe
 ; GFX11-NEXT:    scratch_store_b32 off, v21, s1 ; 4-byte Folded Spill
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s0
 ; GFX11-NEXT:    v_writelane_b32 v21, s30, 0
+; GFX11-NEXT:    s_and_b32 s59, 0, exec_lo
 ; GFX11-NEXT:    v_writelane_b32 v21, s31, 1
 ; GFX11-NEXT:    v_writelane_b32 v21, s33, 2
 ; GFX11-NEXT:    v_writelane_b32 v21, s34, 3
@@ -1318,62 +967,33 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs__lowe
 ; GFX11-NEXT:    v_writelane_b32 v21, s37, 6
 ; GFX11-NEXT:    v_writelane_b32 v21, s38, 7
 ; GFX11-NEXT:    v_writelane_b32 v21, s39, 8
-; GFX11-NEXT:    v_writelane_b32 v21, s40, 9
-; GFX11-NEXT:    v_writelane_b32 v21, s41, 10
-; GFX11-NEXT:    v_writelane_b32 v21, s42, 11
-; GFX11-NEXT:    v_writelane_b32 v21, s43, 12
-; GFX11-NEXT:    v_writelane_b32 v21, s44, 13
-; GFX11-NEXT:    v_writelane_b32 v21, s45, 14
-; GFX11-NEXT:    v_writelane_b32 v21, s46, 15
-; GFX11-NEXT:    v_writelane_b32 v21, s47, 16
-; GFX11-NEXT:    v_writelane_b32 v21, s48, 17
-; GFX11-NEXT:    v_writelane_b32 v21, s49, 18
-; GFX11-NEXT:    v_writelane_b32 v21, s50, 19
-; GFX11-NEXT:    v_writelane_b32 v21, s51, 20
-; GFX11-NEXT:    v_writelane_b32 v21, s52, 21
-; GFX11-NEXT:    v_writelane_b32 v21, s53, 22
-; GFX11-NEXT:    v_writelane_b32 v21, s54, 23
-; GFX11-NEXT:    v_writelane_b32 v21, s55, 24
-; GFX11-NEXT:    v_writelane_b32 v21, s56, 25
-; GFX11-NEXT:    v_writelane_b32 v21, s57, 26
-; GFX11-NEXT:    v_writelane_b32 v21, s58, 27
+; GFX11-NEXT:    v_writelane_b32 v21, s48, 9
+; GFX11-NEXT:    v_writelane_b32 v21, s49, 10
+; GFX11-NEXT:    v_writelane_b32 v21, s50, 11
+; GFX11-NEXT:    v_writelane_b32 v21, s51, 12
+; GFX11-NEXT:    v_writelane_b32 v21, s52, 13
+; GFX11-NEXT:    v_writelane_b32 v21, s53, 14
+; GFX11-NEXT:    v_writelane_b32 v21, s54, 15
+; GFX11-NEXT:    v_writelane_b32 v21, s55, 16
 ; GFX11-NEXT:    ;;#ASMSTART
 ; GFX11-NEXT:    ; def s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], s58, v[0:15], v[16:20], vcc
 ; GFX11-NEXT:    ;;#ASMEND
-; GFX11-NEXT:    v_writelane_b32 v21, s59, 28
-; GFX11-NEXT:    s_and_b32 s59, 0, exec_lo
-; GFX11-NEXT:    s_addc_u32 s32, s32, 16
+; GFX11-NEXT:    s_addc_u32 s60, s32, 16
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT:    s_bitcmp1_b32 s32, 0
-; GFX11-NEXT:    s_bitset0_b32 s32, 0
-; GFX11-NEXT:    s_mov_b32 s59, s32
-; GFX11-NEXT:    s_addc_u32 s32, s32, -16
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT:    s_bitcmp1_b32 s32, 0
-; GFX11-NEXT:    s_bitset0_b32 s32, 0
+; GFX11-NEXT:    s_bitcmp1_b32 s60, 0
+; GFX11-NEXT:    s_bitset0_b32 s60, 0
+; GFX11-NEXT:    s_mov_b32 s59, s60
 ; GFX11-NEXT:    ;;#ASMSTART
 ; GFX11-NEXT:    ; use s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], s58, v[0:15], v[16:20], vcc, s59, scc
 ; GFX11-NEXT:    ;;#ASMEND
-; GFX11-NEXT:    v_readlane_b32 s59, v21, 28
-; GFX11-NEXT:    v_readlane_b32 s58, v21, 27
-; GFX11-NEXT:    v_readlane_b32 s57, v21, 26
-; GFX11-NEXT:    v_readlane_b32 s56, v21, 25
-; GFX11-NEXT:    v_readlane_b32 s55, v21, 24
-; GFX11-NEXT:    v_readlane_b32 s54, v21, 23
-; GFX11-NEXT:    v_readlane_b32 s53, v21, 22
-; GFX11-NEXT:    v_readlane_b32 s52, v21, 21
-; GFX11-NEXT:    v_readlane_b32 s51, v21, 20
-; GFX11-NEXT:    v_readlane_b32 s50, v21, 19
-; GFX11-NEXT:    v_readlane_b32 s49, v21, 18
-; GFX11-NEXT:    v_readlane_b32 s48, v21, 17
-; GFX11-NEXT:    v_readlane_b32 s47, v21, 16
-; GFX11-NEXT:    v_readlane_b32 s46, v21, 15
-; GFX11-NEXT:    v_readlane_b32 s45, v21, 14
-; GFX11-NEXT:    v_readlane_b32 s44, v21, 13
-; GFX11-NEXT:    v_readlane_b32 s43, v21, 12
-; GFX11-NEXT:    v_readlane_b32 s42, v21, 11
-; GFX11-NEXT:    v_readlane_b32 s41, v21, 10
-; GFX11-NEXT:    v_readlane_b32 s40, v21, 9
+; GFX11-NEXT:    v_readlane_b32 s55, v21, 16
+; GFX11-NEXT:    v_readlane_b32 s54, v21, 15
+; GFX11-NEXT:    v_readlane_b32 s53, v21, 14
+; GFX11-NEXT:    v_readlane_b32 s52, v21, 13
+; GFX11-NEXT:    v_readlane_b32 s51, v21, 12
+; GFX11-NEXT:    v_readlane_b32 s50, v21, 11
+; GFX11-NEXT:    v_readlane_b32 s49, v21, 10
+; GFX11-NEXT:    v_readlane_b32 s48, v21, 9
 ; GFX11-NEXT:    v_readlane_b32 s39, v21, 8
 ; GFX11-NEXT:    v_readlane_b32 s38, v21, 7
 ; GFX11-NEXT:    v_readlane_b32 s37, v21, 6
@@ -1402,6 +1022,7 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs__lowe
 ; GFX12-NEXT:    s_wait_alu 0xfffe
 ; GFX12-NEXT:    s_mov_b32 exec_lo, s0
 ; GFX12-NEXT:    v_writelane_b32 v21, s30, 0
+; GFX12-NEXT:    s_and_b32 s59, 0, exec_lo
 ; GFX12-NEXT:    v_writelane_b32 v21, s31, 1
 ; GFX12-NEXT:    v_writelane_b32 v21, s33, 2
 ; GFX12-NEXT:    v_writelane_b32 v21, s34, 3
@@ -1410,55 +1031,30 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs__lowe
 ; GFX12-NEXT:    v_writelane_b32 v21, s37, 6
 ; GFX12-NEXT:    v_writelane_b32 v21, s38, 7
 ; GFX12-NEXT:    v_writelane_b32 v21, s39, 8
-; GFX12-NEXT:    v_writelane_b32 v21, s40, 9
-; GFX12-NEXT:    v_writelane_b32 v21, s41, 10
-; GFX12-NEXT:    v_writelane_b32 v21, s42, 11
-; GFX12-NEXT:    v_writelane_b32 v21, s43, 12
-; GFX12-NEXT:    v_writelane_b32 v21, s44, 13
-; GFX12-NEXT:    v_writelane_b32 v21, s45, 14
-; GFX12-NEXT:    v_writelane_b32 v21, s46, 15
-; GFX12-NEXT:    v_writelane_b32 v21, s47, 16
-; GFX12-NEXT:    v_writelane_b32 v21, s48, 17
-; GFX12-NEXT:    v_writelane_b32 v21, s49, 18
-; GFX12-NEXT:    v_writelane_b32 v21, s50, 19
-; GFX12-NEXT:    v_writelane_b32 v21, s51, 20
-; GFX12-NEXT:    v_writelane_b32 v21, s52, 21
-; GFX12-NEXT:    v_writelane_b32 v21, s53, 22
-; GFX12-NEXT:    v_writelane_b32 v21, s54, 23
-; GFX12-NEXT:    v_writelane_b32 v21, s55, 24
-; GFX12-NEXT:    v_writelane_b32 v21, s56, 25
-; GFX12-NEXT:    v_writelane_b32 v21, s57, 26
-; GFX12-NEXT:    v_writelane_b32 v21, s58, 27
+; GFX12-NEXT:    v_writelane_b32 v21, s48, 9
+; GFX12-NEXT:    v_writelane_b32 v21, s49, 10
+; GFX12-NEXT:    v_writelane_b32 v21, s50, 11
+; GFX12-NEXT:    v_writelane_b32 v21, s51, 12
+; GFX12-NEXT:    v_writelane_b32 v21, s52, 13
+; GFX12-NEXT:    v_writelane_b32 v21, s53, 14
+; GFX12-NEXT:    v_writelane_b32 v21, s54, 15
+; GFX12-NEXT:    v_writelane_b32 v21, s55, 16
 ; GFX12-NEXT:    ;;#ASMSTART
 ; GFX12-NEXT:    ; def s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], s58, v[0:15], v[16:20], vcc
 ; GFX12-NEXT:    ;;#ASMEND
-; GFX12-NEXT:    v_writelane_b32 v21, s59, 28
-; GFX12-NEXT:    s_and_b32 s59, 0, exec_lo
 ; GFX12-NEXT:    s_mov_b32 s59, s32
 ; GFX12-NEXT:    ;;#ASMSTART
 ; GFX12-NEXT:    ; use s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], s58, v[0:15], v[16:20], vcc, s59, scc
 ; GFX12-NEXT:    ;;#ASMEND
 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT:    v_readlane_b32 s59, v21, 28
-; GFX12-NEXT:    v_readlane_b32 s58, v21, 27
-; GFX12-NEXT:    v_readlane_b32 s57, v21, 26
-; GFX12-NEXT:    v_readlane_b32 s56, v21, 25
-; GFX12-NEXT:    v_readlane_b32 s55, v21, 24
-; GFX12-NEXT:    v_readlane_b32 s54, v21, 23
-; GFX12-NEXT:    v_readlane_b32 s53, v21, 22
-; GFX12-NEXT:    v_readlane_b32 s52, v21, 21
-; GFX12-NEXT:    v_readlane_b32 s51, v21, 20
-; GFX12-NEXT:    v_readlane_b32 s50, v21, 19
-; GFX12-NEXT:    v_readlane_b32 s49, v21, 18
-; GFX12-NEXT:    v_readlane_b32 s48, v21, 17
-; GFX12-NEXT:    v_readlane_b32 s47, v21, 16
-; GFX12-NEXT:    v_readlane_b32 s46, v21, 15
-; GFX12-NEXT:    v_readlane_b32 s45, v21, 14
-; GFX12-NEXT:    v_readlane_b32 s44, v21, 13
-; GFX12-NEXT:    v_readlane_b32 s43, v21, 12
-; GFX12-NEXT:    v_readlane_b32 s42, v21, 11
-; GFX12-NEXT:    v_readlane_b32 s41, v21, 10
-; GFX12-NEXT:    v_readlane_b32 s40, v21, 9
+; GFX12-NEXT:    v_readlane_b32 s55, v21, 16
+; GFX12-NEXT:    v_readlane_b32 s54, v21, 15
+; GFX12-NEXT:    v_readlane_b32 s53, v21, 14
+; GFX12-NEXT:    v_readlane_b32 s52, v21, 13
+; GFX12-NEXT:    v_readlane_b32 s51, v21, 12
+; GFX12-NEXT:    v_readlane_b32 s50, v21, 11
+; GFX12-NEXT:    v_readlane_b32 s49, v21, 10
+; GFX12-NEXT:    v_readlane_b32 s48, v21, 9
 ; GFX12-NEXT:    v_readlane_b32 s39, v21, 8
 ; GFX12-NEXT:    v_readlane_b32 s38, v21, 7
 ; GFX12-NEXT:    v_readlane_b32 s37, v21, 6
@@ -1523,8 +1119,8 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs_gep_i
 ; GFX7-NEXT:    s_add_i32 s6, s32, 0x201100
 ; GFX7-NEXT:    buffer_store_dword v22, off, s[0:3], s6 ; 4-byte Folded Spill
 ; GFX7-NEXT:    s_mov_b64 exec, s[4:5]
-; GFX7-NEXT:    v_writelane_b32 v23, s28, 28
-; GFX7-NEXT:    v_writelane_b32 v23, s29, 29
+; GFX7-NEXT:    v_writelane_b32 v23, s28, 17
+; GFX7-NEXT:    v_writelane_b32 v23, s29, 18
 ; GFX7-NEXT:    v_writelane_b32 v23, s30, 0
 ; GFX7-NEXT:    v_writelane_b32 v23, s31, 1
 ; GFX7-NEXT:    v_writelane_b32 v23, s33, 2
@@ -1534,32 +1130,21 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs_gep_i
 ; GFX7-NEXT:    v_writelane_b32 v23, s37, 6
 ; GFX7-NEXT:    v_writelane_b32 v23, s38, 7
 ; GFX7-NEXT:    v_writelane_b32 v23, s39, 8
-; GFX7-NEXT:    v_writelane_b32 v23, s40, 9
-; GFX7-NEXT:    v_writelane_b32 v23, s41, 10
-; GFX7-NEXT:    v_writelane_b32 v23, s42, 11
-; GFX7-NEXT:    v_writelane_b32 v23, s43, 12
-; GFX7-NEXT:    v_writelane_b32 v23, s44, 13
-; GFX7-NEXT:    v_writelane_b32 v23, s45, 14
-; GFX7-NEXT:    v_writelane_b32 v23, s46, 15
-; GFX7-NEXT:    v_writelane_b32 v23, s47, 16
-; GFX7-NEXT:    v_writelane_b32 v23, s48, 17
-; GFX7-NEXT:    v_writelane_b32 v23, s49, 18
-; GFX7-NEXT:    v_writelane_b32 v23, s50, 19
-; GFX7-NEXT:    v_writelane_b32 v23, s51, 20
-; GFX7-NEXT:    v_writelane_b32 v23, s52, 21
-; GFX7-NEXT:    v_writelane_b32 v23, s53, 22
-; GFX7-NEXT:    v_writelane_b32 v23, s54, 23
-; GFX7-NEXT:    v_writelane_b32 v23, s55, 24
+; GFX7-NEXT:    v_writelane_b32 v23, s48, 9
+; GFX7-NEXT:    v_writelane_b32 v23, s49, 10
+; GFX7-NEXT:    v_writelane_b32 v23, s50, 11
+; GFX7-NEXT:    v_writelane_b32 v23, s51, 12
+; GFX7-NEXT:    v_writelane_b32 v23, s52, 13
 ; GFX7-NEXT:    s_lshr_b32 s5, s32, 6
-; GFX7-NEXT:    v_writelane_b32 v23, s56, 25
+; GFX7-NEXT:    v_writelane_b32 v23, s53, 14
 ; GFX7-NEXT:    v_lshr_b32_e64 v0, s32, 6
 ; GFX7-NEXT:    s_add_i32 s4, s5, 0x4240
 ; GFX7-NEXT:    ; implicit-def: $vgpr22 : SGPR spill to VGPR lane
-; GFX7-NEXT:    v_writelane_b32 v23, s57, 26
+; GFX7-NEXT:    v_writelane_b32 v23, s54, 15
 ; GFX7-NEXT:    v_add_i32_e32 v0, vcc, 64, v0
 ; GFX7-NEXT:    v_writelane_b32 v22, s4, 0
 ; GFX7-NEXT:    s_and_b64 s[4:5], 0, exec
-; GFX7-NEXT:    v_writelane_b32 v23, s59, 27
+; GFX7-NEXT:    v_writelane_b32 v23, s55, 16
 ; GFX7-NEXT:    ;;#ASMSTART
 ; GFX7-NEXT:    ; use alloca0 v0
 ; GFX7-NEXT:    ;;#ASMEND
@@ -1570,25 +1155,14 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs_gep_i
 ; GFX7-NEXT:    ;;#ASMSTART
 ; GFX7-NEXT:    ; use s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], v[0:15], v[16:21], vcc, s59, scc
 ; GFX7-NEXT:    ;;#ASMEND
-; GFX7-NEXT:    v_readlane_b32 s59, v23, 27
-; GFX7-NEXT:    v_readlane_b32 s57, v23, 26
-; GFX7-NEXT:    v_readlane_b32 s56, v23, 25
-; GFX7-NEXT:    v_readlane_b32 s55, v23, 24
-; GFX7-NEXT:    v_readlane_b32 s54, v23, 23
-; GFX7-NEXT:    v_readlane_b32 s53, v23, 22
-; GFX7-NEXT:    v_readlane_b32 s52, v23, 21
-; GFX7-NEXT:    v_readlane_b32 s51, v23, 20
-; GFX7-NEXT:    v_readlane_b32 s50, v23, 19
-; GFX7-NEXT:    v_readlane_b32 s49, v23, 18
-; GFX7-NEXT:    v_readlane_b32 s48, v23, 17
-; GFX7-NEXT:    v_readlane_b32 s47, v23, 16
-; GFX7-NEXT:    v_readlane_b32 s46, v23, 15
-; GFX7-NEXT:    v_readlane_b32 s45, v23, 14
-; GFX7-NEXT:    v_readlane_b32 s44, v23, 13
-; GFX7-NEXT:    v_readlane_b32 s43, v23, 12
-; GFX7-NEXT:    v_readlane_b32 s42, v23, 11
-; GFX7-NEXT:    v_readlane_b32 s41, v23, 10
-; GFX7-NEXT:    v_readlane_b32 s40, v23, 9
+; GFX7-NEXT:    v_readlane_b32 s55, v23, 16
+; GFX7-NEXT:    v_readlane_b32 s54, v23, 15
+; GFX7-NEXT:    v_readlane_b32 s53, v23, 14
+; GFX7-NEXT:    v_readlane_b32 s52, v23, 13
+; GFX7-NEXT:    v_readlane_b32 s51, v23, 12
+; GFX7-NEXT:    v_readlane_b32 s50, v23, 11
+; GFX7-NEXT:    v_readlane_b32 s49, v23, 10
+; GFX7-NEXT:    v_readlane_b32 s48, v23, 9
 ; GFX7-NEXT:    v_readlane_b32 s39, v23, 8
 ; GFX7-NEXT:    v_readlane_b32 s38, v23, 7
 ; GFX7-NEXT:    v_readlane_b32 s37, v23, 6
@@ -1598,8 +1172,8 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs_gep_i
 ; GFX7-NEXT:    v_readlane_b32 s33, v23, 2
 ; GFX7-NEXT:    v_readlane_b32 s31, v23, 1
 ; GFX7-NEXT:    v_readlane_b32 s30, v23, 0
-; GFX7-NEXT:    v_readlane_b32 s28, v23, 28
-; GFX7-NEXT:    v_readlane_b32 s29, v23, 29
+; GFX7-NEXT:    v_readlane_b32 s28, v23, 17
+; GFX7-NEXT:    v_readlane_b32 s29, v23, 18
 ; GFX7-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX7-NEXT:    s_add_i32 s6, s32, 0x201000
 ; GFX7-NEXT:    buffer_load_dword v23, off, s[0:3], s6 ; 4-byte Folded Reload
@@ -1625,30 +1199,19 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs_gep_i
 ; GFX8-NEXT:    v_writelane_b32 v22, s37, 6
 ; GFX8-NEXT:    v_writelane_b32 v22, s38, 7
 ; GFX8-NEXT:    v_writelane_b32 v22, s39, 8
-; GFX8-NEXT:    v_writelane_b32 v22, s40, 9
-; GFX8-NEXT:    v_writelane_b32 v22, s41, 10
-; GFX8-NEXT:    v_writelane_b32 v22, s42, 11
-; GFX8-NEXT:    v_writelane_b32 v22, s43, 12
-; GFX8-NEXT:    v_writelane_b32 v22, s44, 13
-; GFX8-NEXT:    v_writelane_b32 v22, s45, 14
-; GFX8-NEXT:    v_writelane_b32 v22, s46, 15
-; GFX8-NEXT:    v_writelane_b32 v22, s47, 16
-; GFX8-NEXT:    v_writelane_b32 v22, s48, 17
-; GFX8-NEXT:    v_writelane_b32 v22, s49, 18
-; GFX8-NEXT:    v_writelane_b32 v22, s50, 19
-; GFX8-NEXT:    v_writelane_b32 v22, s51, 20
-; GFX8-NEXT:    v_writelane_b32 v22, s52, 21
-; GFX8-NEXT:    v_writelane_b32 v22, s53, 22
-; GFX8-NEXT:    v_writelane_b32 v22, s54, 23
-; GFX8-NEXT:    v_writelane_b32 v22, s55, 24
-; GFX8-NEXT:    v_writelane_b32 v22, s56, 25
-; GFX8-NEXT:    v_writelane_b32 v22, s57, 26
+; GFX8-NEXT:    v_writelane_b32 v22, s48, 9
+; GFX8-NEXT:    v_writelane_b32 v22, s49, 10
+; GFX8-NEXT:    v_writelane_b32 v22, s50, 11
+; GFX8-NEXT:    v_writelane_b32 v22, s51, 12
+; GFX8-NEXT:    v_writelane_b32 v22, s52, 13
 ; GFX8-NEXT:    s_lshr_b32 s4, s32, 6
-; GFX8-NEXT:    v_writelane_b32 v22, s59, 27
+; GFX8-NEXT:    v_writelane_b32 v22, s53, 14
 ; GFX8-NEXT:    v_lshrrev_b32_e64 v0, 6, s32
 ; GFX8-NEXT:    s_add_i32 s59, s4, 0x4240
+; GFX8-NEXT:    v_writelane_b32 v22, s54, 15
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 64, v0
 ; GFX8-NEXT:    s_and_b64 s[4:5], 0, exec
+; GFX8-NEXT:    v_writelane_b32 v22, s55, 16
 ; GFX8-NEXT:    ;;#ASMSTART
 ; GFX8-NEXT:    ; use alloca0 v0
 ; GFX8-NEXT:    ;;#ASMEND
@@ -1658,25 +1221,14 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs_gep_i
 ; GFX8-NEXT:    ;;#ASMSTART
 ; GFX8-NEXT:    ; use s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], v[0:15], v[16:21], vcc, s59, scc
 ; GFX8-NEXT:    ;;#ASMEND
-; GFX8-NEXT:    v_readlane_b32 s59, v22, 27
-; GFX8-NEXT:    v_readlane_b32 s57, v22, 26
-; GFX8-NEXT:    v_readlane_b32 s56, v22, 25
-; GFX8-NEXT:    v_readlane_b32 s55, v22, 24
-; GFX8-NEXT:    v_readlane_b32 s54, v22, 23
-; GFX8-NEXT:    v_readlane_b32 s53, v22, 22
-; GFX8-NEXT:    v_readlane_b32 s52, v22, 21
-; GFX8-NEXT:    v_readlane_b32 s51, v22, 20
-; GFX8-NEXT:    v_readlane_b32 s50, v22, 19
-; GFX8-NEXT:    v_readlane_b32 s49, v22, 18
-; GFX8-NEXT:    v_readlane_b32 s48, v22, 17
-; GFX8-NEXT:    v_readlane_b32 s47, v22, 16
-; GFX8-NEXT:    v_readlane_b32 s46, v22, 15
-; GFX8-NEXT:    v_readlane_b32 s45, v22, 14
-; GFX8-NEXT:    v_readlane_b32 s44, v22, 13
-; GFX8-NEXT:    v_readlane_b32 s43, v22, 12
-; GFX8-NEXT:    v_readlane_b32 s42, v22, 11
-; GFX8-NEXT:    v_readlane_b32 s41, v22, 10
-; GFX8-NEXT:    v_readlane_b32 s40, v22, 9
+; GFX8-NEXT:    v_readlane_b32 s55, v22, 16
+; GFX8-NEXT:    v_readlane_b32 s54, v22, 15
+; GFX8-NEXT:    v_readlane_b32 s53, v22, 14
+; GFX8-NEXT:    v_readlane_b32 s52, v22, 13
+; GFX8-NEXT:    v_readlane_b32 s51, v22, 12
+; GFX8-NEXT:    v_readlane_b32 s50, v22, 11
+; GFX8-NEXT:    v_readlane_b32 s49, v22, 10
+; GFX8-NEXT:    v_readlane_b32 s48, v22, 9
 ; GFX8-NEXT:    v_readlane_b32 s39, v22, 8
 ; GFX8-NEXT:    v_readlane_b32 s38, v22, 7
 ; GFX8-NEXT:    v_readlane_b32 s37, v22, 6
@@ -1709,30 +1261,19 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs_gep_i
 ; GFX900-NEXT:    v_writelane_b32 v22, s37, 6
 ; GFX900-NEXT:    v_writelane_b32 v22, s38, 7
 ; GFX900-NEXT:    v_writelane_b32 v22, s39, 8
-; GFX900-NEXT:    v_writelane_b32 v22, s40, 9
-; GFX900-NEXT:    v_writelane_b32 v22, s41, 10
-; GFX900-NEXT:    v_writelane_b32 v22, s42, 11
-; GFX900-NEXT:    v_writelane_b32 v22, s43, 12
-; GFX900-NEXT:    v_writelane_b32 v22, s44, 13
-; GFX900-NEXT:    v_writelane_b32 v22, s45, 14
-; GFX900-NEXT:    v_writelane_b32 v22, s46, 15
-; GFX900-NEXT:    v_writelane_b32 v22, s47, 16
-; GFX900-NEXT:    v_writelane_b32 v22, s48, 17
-; GFX900-NEXT:    v_writelane_b32 v22, s49, 18
-; GFX900-NEXT:    v_writelane_b32 v22, s50, 19
-; GFX900-NEXT:    v_writelane_b32 v22, s51, 20
-; GFX900-NEXT:    v_writelane_b32 v22, s52, 21
-; GFX900-NEXT:    v_writelane_b32 v22, s53, 22
-; GFX900-NEXT:    v_writelane_b32 v22, s54, 23
-; GFX900-NEXT:    v_writelane_b32 v22, s55, 24
-; GFX900-NEXT:    v_writelane_b32 v22, s56, 25
-; GFX900-NEXT:    v_writelane_b32 v22, s57, 26
+; GFX900-NEXT:    v_writelane_b32 v22, s48, 9
+; GFX900-NEXT:    v_writelane_b32 v22, s49, 10
+; GFX900-NEXT:    v_writelane_b32 v22, s50, 11
+; GFX900-NEXT:    v_writelane_b32 v22, s51, 12
+; GFX900-NEXT:    v_writelane_b32 v22, s52, 13
 ; GFX900-NEXT:    s_lshr_b32 s4, s32, 6
-; GFX900-NEXT:    v_writelane_b32 v22, s59, 27
+; GFX900-NEXT:    v_writelane_b32 v22, s53, 14
 ; GFX900-NEXT:    v_lshrrev_b32_e64 v0, 6, s32
 ; GFX900-NEXT:    s_add_i32 s59, s4, 0x4240
+; GFX900-NEXT:    v_writelane_b32 v22, s54, 15
 ; GFX900-NEXT:    v_add_u32_e32 v0, 64, v0
 ; GFX900-NEXT:    s_and_b64 s[4:5], 0, exec
+; GFX900-NEXT:    v_writelane_b32 v22, s55, 16
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use alloca0 v0
 ; GFX900-NEXT:    ;;#ASMEND
@@ -1742,25 +1283,14 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs_gep_i
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], v[0:15], v[16:21], vcc, s59, scc
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    v_readlane_b32 s59, v22, 27
-; GFX900-NEXT:    v_readlane_b32 s57, v22, 26
-; GFX900-NEXT:    v_readlane_b32 s56, v22, 25
-; GFX900-NEXT:    v_readlane_b32 s55, v22, 24
-; GFX900-NEXT:    v_readlane_b32 s54, v22, 23
-; GFX900-NEXT:    v_readlane_b32 s53, v22, 22
-; GFX900-NEXT:    v_readlane_b32 s52, v22, 21
-; GFX900-NEXT:    v_readlane_b32 s51, v22, 20
-; GFX900-NEXT:    v_readlane_b32 s50, v22, 19
-; GFX900-NEXT:    v_readlane_b32 s49, v22, 18
-; GFX900-NEXT:    v_readlane_b32 s48, v22, 17
-; GFX900-NEXT:    v_readlane_b32 s47, v22, 16
-; GFX900-NEXT:    v_readlane_b32 s46, v22, 15
-; GFX900-NEXT:    v_readlane_b32 s45, v22, 14
-; GFX900-NEXT:    v_readlane_b32 s44, v22, 13
-; GFX900-NEXT:    v_readlane_b32 s43, v22, 12
-; GFX900-NEXT:    v_readlane_b32 s42, v22, 11
-; GFX900-NEXT:    v_readlane_b32 s41, v22, 10
-; GFX900-NEXT:    v_readlane_b32 s40, v22, 9
+; GFX900-NEXT:    v_readlane_b32 s55, v22, 16
+; GFX900-NEXT:    v_readlane_b32 s54, v22, 15
+; GFX900-NEXT:    v_readlane_b32 s53, v22, 14
+; GFX900-NEXT:    v_readlane_b32 s52, v22, 13
+; GFX900-NEXT:    v_readlane_b32 s51, v22, 12
+; GFX900-NEXT:    v_readlane_b32 s50, v22, 11
+; GFX900-NEXT:    v_readlane_b32 s49, v22, 10
+; GFX900-NEXT:    v_readlane_b32 s48, v22, 9
 ; GFX900-NEXT:    v_readlane_b32 s39, v22, 8
 ; GFX900-NEXT:    v_readlane_b32 s38, v22, 7
 ; GFX900-NEXT:    v_readlane_b32 s37, v22, 6
@@ -1793,28 +1323,16 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs_gep_i
 ; GFX942-NEXT:    v_writelane_b32 v22, s37, 6
 ; GFX942-NEXT:    v_writelane_b32 v22, s38, 7
 ; GFX942-NEXT:    v_writelane_b32 v22, s39, 8
-; GFX942-NEXT:    v_writelane_b32 v22, s40, 9
-; GFX942-NEXT:    v_writelane_b32 v22, s41, 10
-; GFX942-NEXT:    v_writelane_b32 v22, s42, 11
-; GFX942-NEXT:    v_writelane_b32 v22, s43, 12
-; GFX942-NEXT:    v_writelane_b32 v22, s44, 13
-; GFX942-NEXT:    v_writelane_b32 v22, s45, 14
-; GFX942-NEXT:    v_writelane_b32 v22, s46, 15
-; GFX942-NEXT:    v_writelane_b32 v22, s47, 16
-; GFX942-NEXT:    v_writelane_b32 v22, s48, 17
-; GFX942-NEXT:    v_writelane_b32 v22, s49, 18
-; GFX942-NEXT:    v_writelane_b32 v22, s50, 19
-; GFX942-NEXT:    v_writelane_b32 v22, s51, 20
-; GFX942-NEXT:    v_writelane_b32 v22, s52, 21
-; GFX942-NEXT:    v_writelane_b32 v22, s53, 22
-; GFX942-NEXT:    v_writelane_b32 v22, s54, 23
-; GFX942-NEXT:    v_writelane_b32 v22, s55, 24
-; GFX942-NEXT:    v_writelane_b32 v22, s56, 25
-; GFX942-NEXT:    v_writelane_b32 v22, s57, 26
+; GFX942-NEXT:    v_writelane_b32 v22, s48, 9
+; GFX942-NEXT:    v_writelane_b32 v22, s49, 10
+; GFX942-NEXT:    v_writelane_b32 v22, s50, 11
+; GFX942-NEXT:    v_writelane_b32 v22, s51, 12
+; GFX942-NEXT:    v_writelane_b32 v22, s52, 13
+; GFX942-NEXT:    v_writelane_b32 v22, s53, 14
 ; GFX942-NEXT:    s_add_i32 s0, s32, 64
-; GFX942-NEXT:    v_writelane_b32 v22, s59, 27
+; GFX942-NEXT:    v_writelane_b32 v22, s54, 15
 ; GFX942-NEXT:    v_mov_b32_e32 v0, s0
-; GFX942-NEXT:    v_writelane_b32 v22, s60, 28
+; GFX942-NEXT:    v_writelane_b32 v22, s55, 16
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use alloca0 v0
 ; GFX942-NEXT:    ;;#ASMEND
@@ -1822,32 +1340,18 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs_gep_i
 ; GFX942-NEXT:    ; def s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], v[0:15], v[16:21], vcc
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_add_i32 s59, s32, 0x4240
-; GFX942-NEXT:    v_writelane_b32 v22, s61, 29
 ; GFX942-NEXT:    s_and_b64 s[60:61], 0, exec
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], v[0:15], v[16:21], vcc, s59, scc
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    v_readlane_b32 s61, v22, 29
-; GFX942-NEXT:    v_readlane_b32 s60, v22, 28
-; GFX942-NEXT:    v_readlane_b32 s59, v22, 27
-; GFX942-NEXT:    v_readlane_b32 s57, v22, 26
-; GFX942-NEXT:    v_readlane_b32 s56, v22, 25
-; GFX942-NEXT:    v_readlane_b32 s55, v22, 24
-; GFX942-NEXT:    v_readlane_b32 s54, v22, 23
-; GFX942-NEXT:    v_readlane_b32 s53, v22, 22
-; GFX942-NEXT:    v_readlane_b32 s52, v22, 21
-; GFX942-NEXT:    v_readlane_b32 s51, v22, 20
-; GFX942-NEXT:    v_readlane_b32 s50, v22, 19
-; GFX942-NEXT:    v_readlane_b32 s49, v22, 18
-; GFX942-NEXT:    v_readlane_b32 s48, v22, 17
-; GFX942-NEXT:    v_readlane_b32 s47, v22, 16
-; GFX942-NEXT:    v_readlane_b32 s46, v22, 15
-; GFX942-NEXT:    v_readlane_b32 s45, v22, 14
-; GFX942-NEXT:    v_readlane_b32 s44, v22, 13
-; GFX942-NEXT:    v_readlane_b32 s43, v22, 12
-; GFX942-NEXT:    v_readlane_b32 s42, v22, 11
-; GFX942-NEXT:    v_readlane_b32 s41, v22, 10
-; GFX942-NEXT:    v_readlane_b32 s40, v22, 9
+; GFX942-NEXT:    v_readlane_b32 s55, v22, 16
+; GFX942-NEXT:    v_readlane_b32 s54, v22, 15
+; GFX942-NEXT:    v_readlane_b32 s53, v22, 14
+; GFX942-NEXT:    v_readlane_b32 s52, v22, 13
+; GFX942-NEXT:    v_readlane_b32 s51, v22, 12
+; GFX942-NEXT:    v_readlane_b32 s50, v22, 11
+; GFX942-NEXT:    v_readlane_b32 s49, v22, 10
+; GFX942-NEXT:    v_readlane_b32 s48, v22, 9
 ; GFX942-NEXT:    v_readlane_b32 s39, v22, 8
 ; GFX942-NEXT:    v_readlane_b32 s38, v22, 7
 ; GFX942-NEXT:    v_readlane_b32 s37, v22, 6
@@ -1875,8 +1379,10 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs_gep_i
 ; GFX10_1-NEXT:    v_writelane_b32 v22, s30, 0
 ; GFX10_1-NEXT:    v_lshrrev_b32_e64 v0, 5, s32
 ; GFX10_1-NEXT:    s_lshr_b32 s4, s32, 5
+; GFX10_1-NEXT:    s_add_i32 s59, s4, 0x4240
 ; GFX10_1-NEXT:    v_writelane_b32 v22, s31, 1
 ; GFX10_1-NEXT:    v_add_nc_u32_e32 v0, 64, v0
+; GFX10_1-NEXT:    s_and_b32 s4, 0, exec_lo
 ; GFX10_1-NEXT:    ;;#ASMSTART
 ; GFX10_1-NEXT:    ; use alloca0 v0
 ; GFX10_1-NEXT:    ;;#ASMEND
@@ -1887,52 +1393,28 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs_gep_i
 ; GFX10_1-NEXT:    v_writelane_b32 v22, s37, 6
 ; GFX10_1-NEXT:    v_writelane_b32 v22, s38, 7
 ; GFX10_1-NEXT:    v_writelane_b32 v22, s39, 8
-; GFX10_1-NEXT:    v_writelane_b32 v22, s40, 9
-; GFX10_1-NEXT:    v_writelane_b32 v22, s41, 10
-; GFX10_1-NEXT:    v_writelane_b32 v22, s42, 11
-; GFX10_1-NEXT:    v_writelane_b32 v22, s43, 12
-; GFX10_1-NEXT:    v_writelane_b32 v22, s44, 13
-; GFX10_1-NEXT:    v_writelane_b32 v22, s45, 14
-; GFX10_1-NEXT:    v_writelane_b32 v22, s46, 15
-; GFX10_1-NEXT:    v_writelane_b32 v22, s47, 16
-; GFX10_1-NEXT:    v_writelane_b32 v22, s48, 17
-; GFX10_1-NEXT:    v_writelane_b32 v22, s49, 18
-; GFX10_1-NEXT:    v_writelane_b32 v22, s50, 19
-; GFX10_1-NEXT:    v_writelane_b32 v22, s51, 20
-; GFX10_1-NEXT:    v_writelane_b32 v22, s52, 21
-; GFX10_1-NEXT:    v_writelane_b32 v22, s53, 22
-; GFX10_1-NEXT:    v_writelane_b32 v22, s54, 23
-; GFX10_1-NEXT:    v_writelane_b32 v22, s55, 24
-; GFX10_1-NEXT:    v_writelane_b32 v22, s56, 25
-; GFX10_1-NEXT:    v_writelane_b32 v22, s57, 26
-; GFX10_1-NEXT:    v_writelane_b32 v22, s59, 27
-; GFX10_1-NEXT:    s_add_i32 s59, s4, 0x4240
-; GFX10_1-NEXT:    s_and_b32 s4, 0, exec_lo
+; GFX10_1-NEXT:    v_writelane_b32 v22, s48, 9
+; GFX10_1-NEXT:    v_writelane_b32 v22, s49, 10
+; GFX10_1-NEXT:    v_writelane_b32 v22, s50, 11
+; GFX10_1-NEXT:    v_writelane_b32 v22, s51, 12
+; GFX10_1-NEXT:    v_writelane_b32 v22, s52, 13
+; GFX10_1-NEXT:    v_writelane_b32 v22, s53, 14
+; GFX10_1-NEXT:    v_writelane_b32 v22, s54, 15
+; GFX10_1-NEXT:    v_writelane_b32 v22, s55, 16
 ; GFX10_1-NEXT:    ;;#ASMSTART
 ; GFX10_1-NEXT:    ; def s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], v[0:15], v[16:21], vcc
 ; GFX10_1-NEXT:    ;;#ASMEND
 ; GFX10_1-NEXT:    ;;#ASMSTART
 ; GFX10_1-NEXT:    ; use s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], v[0:15], v[16:21], vcc, s59, scc
 ; GFX10_1-NEXT:    ;;#ASMEND
-; GFX10_1-NEXT:    v_readlane_b32 s59, v22, 27
-; GFX10_1-NEXT:    v_readlane_b32 s57, v22, 26
-; GFX10_1-NEXT:    v_readlane_b32 s56, v22, 25
-; GFX10_1-NEXT:    v_readlane_b32 s55, v22, 24
-; GFX10_1-NEXT:    v_readlane_b32 s54, v22, 23
-; GFX10_1-NEXT:    v_readlane_b32 s53, v22, 22
-; GFX10_1-NEXT:    v_readlane_b32 s52, v22, 21
-; GFX10_1-NEXT:    v_readlane_b32 s51, v22, 20
-; GFX10_1-NEXT:    v_readlane_b32 s50, v22, 19
-; GFX10_1-NEXT:    v_readlane_b32 s49, v22, 18
-; GFX10_1-NEXT:    v_readlane_b32 s48, v22, 17
-; GFX10_1-NEXT:    v_readlane_b32 s47, v22, 16
-; GFX10_1-NEXT:    v_readlane_b32 s46, v22, 15
-; GFX10_1-NEXT:    v_readlane_b32 s45, v22, 14
-; GFX10_1-NEXT:    v_readlane_b32 s44, v22, 13
-; GFX10_1-NEXT:    v_readlane_b32 s43, v22, 12
-; GFX10_1-NEXT:    v_readlane_b32 s42, v22, 11
-; GFX10_1-NEXT:    v_readlane_b32 s41, v22, 10
-; GFX10_1-NEXT:    v_readlane_b32 s40, v22, 9
+; GFX10_1-NEXT:    v_readlane_b32 s55, v22, 16
+; GFX10_1-NEXT:    v_readlane_b32 s54, v22, 15
+; GFX10_1-NEXT:    v_readlane_b32 s53, v22, 14
+; GFX10_1-NEXT:    v_readlane_b32 s52, v22, 13
+; GFX10_1-NEXT:    v_readlane_b32 s51, v22, 12
+; GFX10_1-NEXT:    v_readlane_b32 s50, v22, 11
+; GFX10_1-NEXT:    v_readlane_b32 s49, v22, 10
+; GFX10_1-NEXT:    v_readlane_b32 s48, v22, 9
 ; GFX10_1-NEXT:    v_readlane_b32 s39, v22, 8
 ; GFX10_1-NEXT:    v_readlane_b32 s38, v22, 7
 ; GFX10_1-NEXT:    v_readlane_b32 s37, v22, 6
@@ -1960,8 +1442,10 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs_gep_i
 ; GFX10_3-NEXT:    v_writelane_b32 v22, s30, 0
 ; GFX10_3-NEXT:    v_lshrrev_b32_e64 v0, 5, s32
 ; GFX10_3-NEXT:    s_lshr_b32 s4, s32, 5
+; GFX10_3-NEXT:    s_add_i32 s59, s4, 0x4240
 ; GFX10_3-NEXT:    v_writelane_b32 v22, s31, 1
 ; GFX10_3-NEXT:    v_add_nc_u32_e32 v0, 64, v0
+; GFX10_3-NEXT:    s_and_b32 s4, 0, exec_lo
 ; GFX10_3-NEXT:    ;;#ASMSTART
 ; GFX10_3-NEXT:    ; use alloca0 v0
 ; GFX10_3-NEXT:    ;;#ASMEND
@@ -1972,52 +1456,28 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs_gep_i
 ; GFX10_3-NEXT:    v_writelane_b32 v22, s37, 6
 ; GFX10_3-NEXT:    v_writelane_b32 v22, s38, 7
 ; GFX10_3-NEXT:    v_writelane_b32 v22, s39, 8
-; GFX10_3-NEXT:    v_writelane_b32 v22, s40, 9
-; GFX10_3-NEXT:    v_writelane_b32 v22, s41, 10
-; GFX10_3-NEXT:    v_writelane_b32 v22, s42, 11
-; GFX10_3-NEXT:    v_writelane_b32 v22, s43, 12
-; GFX10_3-NEXT:    v_writelane_b32 v22, s44, 13
-; GFX10_3-NEXT:    v_writelane_b32 v22, s45, 14
-; GFX10_3-NEXT:    v_writelane_b32 v22, s46, 15
-; GFX10_3-NEXT:    v_writelane_b32 v22, s47, 16
-; GFX10_3-NEXT:    v_writelane_b32 v22, s48, 17
-; GFX10_3-NEXT:    v_writelane_b32 v22, s49, 18
-; GFX10_3-NEXT:    v_writelane_b32 v22, s50, 19
-; GFX10_3-NEXT:    v_writelane_b32 v22, s51, 20
-; GFX10_3-NEXT:    v_writelane_b32 v22, s52, 21
-; GFX10_3-NEXT:    v_writelane_b32 v22, s53, 22
-; GFX10_3-NEXT:    v_writelane_b32 v22, s54, 23
-; GFX10_3-NEXT:    v_writelane_b32 v22, s55, 24
-; GFX10_3-NEXT:    v_writelane_b32 v22, s56, 25
-; GFX10_3-NEXT:    v_writelane_b32 v22, s57, 26
-; GFX10_3-NEXT:    v_writelane_b32 v22, s59, 27
-; GFX10_3-NEXT:    s_add_i32 s59, s4, 0x4240
-; GFX10_3-NEXT:    s_and_b32 s4, 0, exec_lo
+; GFX10_3-NEXT:    v_writelane_b32 v22, s48, 9
+; GFX10_3-NEXT:    v_writelane_b32 v22, s49, 10
+; GFX10_3-NEXT:    v_writelane_b32 v22, s50, 11
+; GFX10_3-NEXT:    v_writelane_b32 v22, s51, 12
+; GFX10_3-NEXT:    v_writelane_b32 v22, s52, 13
+; GFX10_3-NEXT:    v_writelane_b32 v22, s53, 14
+; GFX10_3-NEXT:    v_writelane_b32 v22, s54, 15
+; GFX10_3-NEXT:    v_writelane_b32 v22, s55, 16
 ; GFX10_3-NEXT:    ;;#ASMSTART
 ; GFX10_3-NEXT:    ; def s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], v[0:15], v[16:21], vcc
 ; GFX10_3-NEXT:    ;;#ASMEND
 ; GFX10_3-NEXT:    ;;#ASMSTART
 ; GFX10_3-NEXT:    ; use s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], v[0:15], v[16:21], vcc, s59, scc
 ; GFX10_3-NEXT:    ;;#ASMEND
-; GFX10_3-NEXT:    v_readlane_b32 s59, v22, 27
-; GFX10_3-NEXT:    v_readlane_b32 s57, v22, 26
-; GFX10_3-NEXT:    v_readlane_b32 s56, v22, 25
-; GFX10_3-NEXT:    v_readlane_b32 s55, v22, 24
-; GFX10_3-NEXT:    v_readlane_b32 s54, v22, 23
-; GFX10_3-NEXT:    v_readlane_b32 s53, v22, 22
-; GFX10_3-NEXT:    v_readlane_b32 s52, v22, 21
-; GFX10_3-NEXT:    v_readlane_b32 s51, v22, 20
-; GFX10_3-NEXT:    v_readlane_b32 s50, v22, 19
-; GFX10_3-NEXT:    v_readlane_b32 s49, v22, 18
-; GFX10_3-NEXT:    v_readlane_b32 s48, v22, 17
-; GFX10_3-NEXT:    v_readlane_b32 s47, v22, 16
-; GFX10_3-NEXT:    v_readlane_b32 s46, v22, 15
-; GFX10_3-NEXT:    v_readlane_b32 s45, v22, 14
-; GFX10_3-NEXT:    v_readlane_b32 s44, v22, 13
-; GFX10_3-NEXT:    v_readlane_b32 s43, v22, 12
-; GFX10_3-NEXT:    v_readlane_b32 s42, v22, 11
-; GFX10_3-NEXT:    v_readlane_b32 s41, v22, 10
-; GFX10_3-NEXT:    v_readlane_b32 s40, v22, 9
+; GFX10_3-NEXT:    v_readlane_b32 s55, v22, 16
+; GFX10_3-NEXT:    v_readlane_b32 s54, v22, 15
+; GFX10_3-NEXT:    v_readlane_b32 s53, v22, 14
+; GFX10_3-NEXT:    v_readlane_b32 s52, v22, 13
+; GFX10_3-NEXT:    v_readlane_b32 s51, v22, 12
+; GFX10_3-NEXT:    v_readlane_b32 s50, v22, 11
+; GFX10_3-NEXT:    v_readlane_b32 s49, v22, 10
+; GFX10_3-NEXT:    v_readlane_b32 s48, v22, 9
 ; GFX10_3-NEXT:    v_readlane_b32 s39, v22, 8
 ; GFX10_3-NEXT:    v_readlane_b32 s38, v22, 7
 ; GFX10_3-NEXT:    v_readlane_b32 s37, v22, 6
@@ -2043,12 +1503,13 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs_gep_i
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s0
 ; GFX11-NEXT:    v_writelane_b32 v22, s30, 0
 ; GFX11-NEXT:    s_add_i32 s0, s32, 64
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT:    s_add_i32 s59, s32, 0x4240
 ; GFX11-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-NEXT:    s_and_b32 s0, 0, exec_lo
+; GFX11-NEXT:    v_writelane_b32 v22, s31, 1
 ; GFX11-NEXT:    ;;#ASMSTART
 ; GFX11-NEXT:    ; use alloca0 v0
 ; GFX11-NEXT:    ;;#ASMEND
-; GFX11-NEXT:    v_writelane_b32 v22, s31, 1
 ; GFX11-NEXT:    v_writelane_b32 v22, s33, 2
 ; GFX11-NEXT:    v_writelane_b32 v22, s34, 3
 ; GFX11-NEXT:    v_writelane_b32 v22, s35, 4
@@ -2056,52 +1517,29 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs_gep_i
 ; GFX11-NEXT:    v_writelane_b32 v22, s37, 6
 ; GFX11-NEXT:    v_writelane_b32 v22, s38, 7
 ; GFX11-NEXT:    v_writelane_b32 v22, s39, 8
-; GFX11-NEXT:    v_writelane_b32 v22, s40, 9
-; GFX11-NEXT:    v_writelane_b32 v22, s41, 10
-; GFX11-NEXT:    v_writelane_b32 v22, s42, 11
-; GFX11-NEXT:    v_writelane_b32 v22, s43, 12
-; GFX11-NEXT:    v_writelane_b32 v22, s44, 13
-; GFX11-NEXT:    v_writelane_b32 v22, s45, 14
-; GFX11-NEXT:    v_writelane_b32 v22, s46, 15
-; GFX11-NEXT:    v_writelane_b32 v22, s47, 16
-; GFX11-NEXT:    v_writelane_b32 v22, s48, 17
-; GFX11-NEXT:    v_writelane_b32 v22, s49, 18
-; GFX11-NEXT:    v_writelane_b32 v22, s50, 19
-; GFX11-NEXT:    v_writelane_b32 v22, s51, 20
-; GFX11-NEXT:    v_writelane_b32 v22, s52, 21
-; GFX11-NEXT:    v_writelane_b32 v22, s53, 22
-; GFX11-NEXT:    v_writelane_b32 v22, s54, 23
-; GFX11-NEXT:    v_writelane_b32 v22, s55, 24
-; GFX11-NEXT:    v_writelane_b32 v22, s56, 25
-; GFX11-NEXT:    v_writelane_b32 v22, s57, 26
-; GFX11-NEXT:    v_writelane_b32 v22, s59, 27
-; GFX11-NEXT:    s_add_i32 s59, s32, 0x4240
-; GFX11-NEXT:    s_and_b32 s0, 0, exec_lo
+; GFX11-NEXT:    v_writelane_b32 v22, s48, 9
+; GFX11-NEXT:    v_writelane_b32 v22, s49, 10
+; GFX11-NEXT:    v_writelane_b32 v22, s50, 11
+; GFX11-NEXT:    v_writelane_b32 v22, s51, 12
+; GFX11-NEXT:    v_writelane_b32 v22, s52, 13
+; GFX11-NEXT:    v_writelane_b32 v22, s53, 14
+; GFX11-NEXT:    v_writelane_b32 v22, s54, 15
+; GFX11-NEXT:    v_writelane_b32 v22, s55, 16
 ; GFX11-NEXT:    ;;#ASMSTART
 ; GFX11-NEXT:    ; def s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], v[0:15], v[16:21], vcc
 ; GFX11-NEXT:    ;;#ASMEND
 ; GFX11-NEXT:    ;;#ASMSTART
 ; GFX11-NEXT:    ; use s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], v[0:15], v[16:21], vcc, s59, scc
 ; GFX11-NEXT:    ;;#ASMEND
-; GFX11-NEXT:    v_readlane_b32 s59, v22, 27
-; GFX11-NEXT:    v_readlane_b32 s57, v22, 26
-; GFX11-NEXT:    v_readlane_b32 s56, v22, 25
-; GFX11-NEXT:    v_readlane_b32 s55, v22, 24
-; GFX11-NEXT:    v_readlane_b32 s54, v22, 23
-; GFX11-NEXT:    v_readlane_b32 s53, v22, 22
-; GFX11-NEXT:    v_readlane_b32 s52, v22, 21
-; GFX11-NEXT:    v_readlane_b32 s51, v22, 20
-; GFX11-NEXT:    v_readlane_b32 s50, v22, 19
-; GFX11-NEXT:    v_readlane_b32 s49, v22, 18
-; GFX11-NEXT:    v_readlane_b32 s48, v22, 17
-; GFX11-NEXT:    v_readlane_b32 s47, v22, 16
-; GFX11-NEXT:    v_readlane_b32 s46, v22, 15
-; GFX11-NEXT:    v_readlane_b32 s45, v22, 14
-; GFX11-NEXT:    v_readlane_b32 s44, v22, 13
-; GFX11-NEXT:    v_readlane_b32 s43, v22, 12
-; GFX11-NEXT:    v_readlane_b32 s42, v22, 11
-; GFX11-NEXT:    v_readlane_b32 s41, v22, 10
-; GFX11-NEXT:    v_readlane_b32 s40, v22, 9
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_readlane_b32 s55, v22, 16
+; GFX11-NEXT:    v_readlane_b32 s54, v22, 15
+; GFX11-NEXT:    v_readlane_b32 s53, v22, 14
+; GFX11-NEXT:    v_readlane_b32 s52, v22, 13
+; GFX11-NEXT:    v_readlane_b32 s51, v22, 12
+; GFX11-NEXT:    v_readlane_b32 s50, v22, 11
+; GFX11-NEXT:    v_readlane_b32 s49, v22, 10
+; GFX11-NEXT:    v_readlane_b32 s48, v22, 9
 ; GFX11-NEXT:    v_readlane_b32 s39, v22, 8
 ; GFX11-NEXT:    v_readlane_b32 s38, v22, 7
 ; GFX11-NEXT:    v_readlane_b32 s37, v22, 6
@@ -2130,7 +1568,9 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs_gep_i
 ; GFX12-NEXT:    s_wait_alu 0xfffe
 ; GFX12-NEXT:    s_mov_b32 exec_lo, s0
 ; GFX12-NEXT:    v_writelane_b32 v22, s30, 0
+; GFX12-NEXT:    s_add_co_i32 s59, s32, 0x4200
 ; GFX12-NEXT:    v_mov_b32_e32 v0, s32
+; GFX12-NEXT:    s_and_b32 s0, 0, exec_lo
 ; GFX12-NEXT:    ;;#ASMSTART
 ; GFX12-NEXT:    ; use alloca0 v0
 ; GFX12-NEXT:    ;;#ASMEND
@@ -2142,52 +1582,29 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs_gep_i
 ; GFX12-NEXT:    v_writelane_b32 v22, s37, 6
 ; GFX12-NEXT:    v_writelane_b32 v22, s38, 7
 ; GFX12-NEXT:    v_writelane_b32 v22, s39, 8
-; GFX12-NEXT:    v_writelane_b32 v22, s40, 9
-; GFX12-NEXT:    v_writelane_b32 v22, s41, 10
-; GFX12-NEXT:    v_writelane_b32 v22, s42, 11
-; GFX12-NEXT:    v_writelane_b32 v22, s43, 12
-; GFX12-NEXT:    v_writelane_b32 v22, s44, 13
-; GFX12-NEXT:    v_writelane_b32 v22, s45, 14
-; GFX12-NEXT:    v_writelane_b32 v22, s46, 15
-; GFX12-NEXT:    v_writelane_b32 v22, s47, 16
-; GFX12-NEXT:    v_writelane_b32 v22, s48, 17
-; GFX12-NEXT:    v_writelane_b32 v22, s49, 18
-; GFX12-NEXT:    v_writelane_b32 v22, s50, 19
-; GFX12-NEXT:    v_writelane_b32 v22, s51, 20
-; GFX12-NEXT:    v_writelane_b32 v22, s52, 21
-; GFX12-NEXT:    v_writelane_b32 v22, s53, 22
-; GFX12-NEXT:    v_writelane_b32 v22, s54, 23
-; GFX12-NEXT:    v_writelane_b32 v22, s55, 24
-; GFX12-NEXT:    v_writelane_b32 v22, s56, 25
-; GFX12-NEXT:    v_writelane_b32 v22, s57, 26
-; GFX12-NEXT:    v_writelane_b32 v22, s59, 27
-; GFX12-NEXT:    s_add_co_i32 s59, s32, 0x4200
-; GFX12-NEXT:    s_and_b32 s0, 0, exec_lo
+; GFX12-NEXT:    v_writelane_b32 v22, s48, 9
+; GFX12-NEXT:    v_writelane_b32 v22, s49, 10
+; GFX12-NEXT:    v_writelane_b32 v22, s50, 11
+; GFX12-NEXT:    v_writelane_b32 v22, s51, 12
+; GFX12-NEXT:    v_writelane_b32 v22, s52, 13
+; GFX12-NEXT:    v_writelane_b32 v22, s53, 14
+; GFX12-NEXT:    v_writelane_b32 v22, s54, 15
+; GFX12-NEXT:    v_writelane_b32 v22, s55, 16
 ; GFX12-NEXT:    ;;#ASMSTART
 ; GFX12-NEXT:    ; def s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], v[0:15], v[16:21], vcc
 ; GFX12-NEXT:    ;;#ASMEND
 ; GFX12-NEXT:    ;;#ASMSTART
 ; GFX12-NEXT:    ; use s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], v[0:15], v[16:21], vcc, s59, scc
 ; GFX12-NEXT:    ;;#ASMEND
-; GFX12-NEXT:    v_readlane_b32 s59, v22, 27
-; GFX12-NEXT:    v_readlane_b32 s57, v22, 26
-; GFX12-NEXT:    v_readlane_b32 s56, v22, 25
-; GFX12-NEXT:    v_readlane_b32 s55, v22, 24
-; GFX12-NEXT:    v_readlane_b32 s54, v22, 23
-; GFX12-NEXT:    v_readlane_b32 s53, v22, 22
-; GFX12-NEXT:    v_readlane_b32 s52, v22, 21
-; GFX12-NEXT:    v_readlane_b32 s51, v22, 20
-; GFX12-NEXT:    v_readlane_b32 s50, v22, 19
-; GFX12-NEXT:    v_readlane_b32 s49, v22, 18
-; GFX12-NEXT:    v_readlane_b32 s48, v22, 17
-; GFX12-NEXT:    v_readlane_b32 s47, v22, 16
-; GFX12-NEXT:    v_readlane_b32 s46, v22, 15
-; GFX12-NEXT:    v_readlane_b32 s45, v22, 14
-; GFX12-NEXT:    v_readlane_b32 s44, v22, 13
-; GFX12-NEXT:    v_readlane_b32 s43, v22, 12
-; GFX12-NEXT:    v_readlane_b32 s42, v22, 11
-; GFX12-NEXT:    v_readlane_b32 s41, v22, 10
-; GFX12-NEXT:    v_readlane_b32 s40, v22, 9
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT:    v_readlane_b32 s55, v22, 16
+; GFX12-NEXT:    v_readlane_b32 s54, v22, 15
+; GFX12-NEXT:    v_readlane_b32 s53, v22, 14
+; GFX12-NEXT:    v_readlane_b32 s52, v22, 13
+; GFX12-NEXT:    v_readlane_b32 s51, v22, 12
+; GFX12-NEXT:    v_readlane_b32 s50, v22, 11
+; GFX12-NEXT:    v_readlane_b32 s49, v22, 10
+; GFX12-NEXT:    v_readlane_b32 s48, v22, 9
 ; GFX12-NEXT:    v_readlane_b32 s39, v22, 8
 ; GFX12-NEXT:    v_readlane_b32 s38, v22, 7
 ; GFX12-NEXT:    v_readlane_b32 s37, v22, 6

diff  --git a/llvm/test/CodeGen/AMDGPU/mcexpr-knownbits-assign-crash-gh-issue-110930.ll b/llvm/test/CodeGen/AMDGPU/mcexpr-knownbits-assign-crash-gh-issue-110930.ll
index 0112453e32bfc..52f380b7f80a3 100644
--- a/llvm/test/CodeGen/AMDGPU/mcexpr-knownbits-assign-crash-gh-issue-110930.ll
+++ b/llvm/test/CodeGen/AMDGPU/mcexpr-knownbits-assign-crash-gh-issue-110930.ll
@@ -8,7 +8,7 @@
 ; CHECK-LABEL: I_Quit:
 ; CHECK: .set I_Quit.num_vgpr, max(41, amdgpu.max_num_vgpr)
 ; CHECK: .set I_Quit.num_agpr, max(0, amdgpu.max_num_agpr)
-; CHECK: .set I_Quit.numbered_sgpr, max(48, amdgpu.max_num_sgpr)
+; CHECK: .set I_Quit.numbered_sgpr, max(56, amdgpu.max_num_sgpr)
 ; CHECK: .set I_Quit.private_seg_size, 16
 ; CHECK: .set I_Quit.uses_vcc, 1
 ; CHECK: .set I_Quit.uses_flat_scratch, 1
@@ -80,7 +80,7 @@ define void @P_SetThingPosition() {
 ; CHECK-LABEL: P_SetupPsprites:
 ; CHECK: .set P_SetupPsprites.num_vgpr, max(41, amdgpu.max_num_vgpr)
 ; CHECK: .set P_SetupPsprites.num_agpr, max(0, amdgpu.max_num_agpr)
-; CHECK: .set P_SetupPsprites.numbered_sgpr, max(48, amdgpu.max_num_sgpr)
+; CHECK: .set P_SetupPsprites.numbered_sgpr, max(56, amdgpu.max_num_sgpr)
 ; CHECK: .set P_SetupPsprites.private_seg_size, 16
 ; CHECK: .set P_SetupPsprites.uses_vcc, 1
 ; CHECK: .set P_SetupPsprites.uses_flat_scratch, 1
@@ -110,7 +110,7 @@ define void @HU_Start() {
 ; CHECK-LABEL: P_SpawnPlayer:
 ; CHECK: .set P_SpawnPlayer.num_vgpr, max(43, G_PlayerReborn.num_vgpr, P_SetThingPosition.num_vgpr, P_SetupPsprites.num_vgpr, HU_Start.num_vgpr)
 ; CHECK: .set P_SpawnPlayer.num_agpr, max(0, G_PlayerReborn.num_agpr, P_SetThingPosition.num_agpr, P_SetupPsprites.num_agpr, HU_Start.num_agpr)
-; CHECK: .set P_SpawnPlayer.numbered_sgpr, max(60, G_PlayerReborn.numbered_sgpr, P_SetThingPosition.numbered_sgpr, P_SetupPsprites.numbered_sgpr, HU_Start.numbered_sgpr)
+; CHECK: .set P_SpawnPlayer.numbered_sgpr, max(84, G_PlayerReborn.numbered_sgpr, P_SetThingPosition.numbered_sgpr, P_SetupPsprites.numbered_sgpr, HU_Start.numbered_sgpr)
 ; CHECK: .set P_SpawnPlayer.private_seg_size, 16+(max(G_PlayerReborn.private_seg_size, P_SetThingPosition.private_seg_size, P_SetupPsprites.private_seg_size, HU_Start.private_seg_size))
 ; CHECK: .set P_SpawnPlayer.uses_vcc, or(1, G_PlayerReborn.uses_vcc, P_SetThingPosition.uses_vcc, P_SetupPsprites.uses_vcc, HU_Start.uses_vcc)
 ; CHECK: .set P_SpawnPlayer.uses_flat_scratch, or(0, G_PlayerReborn.uses_flat_scratch, P_SetThingPosition.uses_flat_scratch, P_SetupPsprites.uses_flat_scratch, HU_Start.uses_flat_scratch)
@@ -128,7 +128,7 @@ define void @P_SpawnPlayer() {
 ; CHECK-LABEL: I_Error:
 ; CHECK: .set I_Error.num_vgpr, max(41, amdgpu.max_num_vgpr)
 ; CHECK: .set I_Error.num_agpr, max(0, amdgpu.max_num_agpr)
-; CHECK: .set I_Error.numbered_sgpr, max(48, amdgpu.max_num_sgpr)
+; CHECK: .set I_Error.numbered_sgpr, max(56, amdgpu.max_num_sgpr)
 ; CHECK: .set I_Error.private_seg_size, 16
 ; CHECK: .set I_Error.uses_vcc, 1
 ; CHECK: .set I_Error.uses_flat_scratch, 1
@@ -144,7 +144,7 @@ define void @I_Error(...) {
 ; CHECK-LABEL: G_DoReborn:
 ; CHECK: .set G_DoReborn.num_vgpr, max(44, P_RemoveMobj.num_vgpr, P_SpawnMobj.num_vgpr, P_SpawnPlayer.num_vgpr, I_Error.num_vgpr)
 ; CHECK: .set G_DoReborn.num_agpr, max(0, P_RemoveMobj.num_agpr, P_SpawnMobj.num_agpr, P_SpawnPlayer.num_agpr, I_Error.num_agpr)
-; CHECK: .set G_DoReborn.numbered_sgpr, max(72, P_RemoveMobj.numbered_sgpr, P_SpawnMobj.numbered_sgpr, P_SpawnPlayer.numbered_sgpr, I_Error.numbered_sgpr)
+; CHECK: .set G_DoReborn.numbered_sgpr, max(104, P_RemoveMobj.numbered_sgpr, P_SpawnMobj.numbered_sgpr, P_SpawnPlayer.numbered_sgpr, I_Error.numbered_sgpr)
 ; CHECK: .set G_DoReborn.private_seg_size, 32+(max(P_RemoveMobj.private_seg_size, P_SpawnMobj.private_seg_size, P_SpawnPlayer.private_seg_size, I_Error.private_seg_size))
 ; CHECK: .set G_DoReborn.uses_vcc, or(1, P_RemoveMobj.uses_vcc, P_SpawnMobj.uses_vcc, P_SpawnPlayer.uses_vcc, I_Error.uses_vcc)
 ; CHECK: .set G_DoReborn.uses_flat_scratch, or(0, P_RemoveMobj.uses_flat_scratch, P_SpawnMobj.uses_flat_scratch, P_SpawnPlayer.uses_flat_scratch, I_Error.uses_flat_scratch)
@@ -218,7 +218,7 @@ define void @F_Ticker() {
 ; CHECK-LABEL: G_CheckDemoStatus:
 ; CHECK: .set G_CheckDemoStatus.num_vgpr, max(43, I_Quit.num_vgpr, D_AdvanceDemo.num_vgpr, I_Error.num_vgpr)
 ; CHECK: .set G_CheckDemoStatus.num_agpr, max(0, I_Quit.num_agpr, D_AdvanceDemo.num_agpr, I_Error.num_agpr)
-; CHECK: .set G_CheckDemoStatus.numbered_sgpr, max(60, I_Quit.numbered_sgpr, D_AdvanceDemo.numbered_sgpr, I_Error.numbered_sgpr)
+; CHECK: .set G_CheckDemoStatus.numbered_sgpr, max(84, I_Quit.numbered_sgpr, D_AdvanceDemo.numbered_sgpr, I_Error.numbered_sgpr)
 ; CHECK: .set G_CheckDemoStatus.private_seg_size, 32+(max(I_Quit.private_seg_size, D_AdvanceDemo.private_seg_size, I_Error.private_seg_size))
 ; CHECK: .set G_CheckDemoStatus.uses_vcc, or(1, I_Quit.uses_vcc, D_AdvanceDemo.uses_vcc, I_Error.uses_vcc)
 ; CHECK: .set G_CheckDemoStatus.uses_flat_scratch, or(0, I_Quit.uses_flat_scratch, D_AdvanceDemo.uses_flat_scratch, I_Error.uses_flat_scratch)
@@ -264,7 +264,7 @@ define ptr @P_SaveGameFile() {
 ; CHECK-LABEL: R_FlatNumForName:
 ; CHECK: .set R_FlatNumForName.num_vgpr, max(42, I_Error.num_vgpr)
 ; CHECK: .set R_FlatNumForName.num_agpr, max(0, I_Error.num_agpr)
-; CHECK: .set R_FlatNumForName.numbered_sgpr, max(48, I_Error.numbered_sgpr)
+; CHECK: .set R_FlatNumForName.numbered_sgpr, max(56, I_Error.numbered_sgpr)
 ; CHECK: .set R_FlatNumForName.private_seg_size, 16+(max(I_Error.private_seg_size))
 ; CHECK: .set R_FlatNumForName.uses_vcc, or(1, I_Error.uses_vcc)
 ; CHECK: .set R_FlatNumForName.uses_flat_scratch, or(0, I_Error.uses_flat_scratch)
@@ -279,7 +279,7 @@ define i32 @R_FlatNumForName() {
 ; CHECK-LABEL: R_TextureNumForName:
 ; CHECK: .set R_TextureNumForName.num_vgpr, max(42, R_FlatNumForName.num_vgpr)
 ; CHECK: .set R_TextureNumForName.num_agpr, max(0, R_FlatNumForName.num_agpr)
-; CHECK: .set R_TextureNumForName.numbered_sgpr, max(48, R_FlatNumForName.numbered_sgpr)
+; CHECK: .set R_TextureNumForName.numbered_sgpr, max(56, R_FlatNumForName.numbered_sgpr)
 ; CHECK: .set R_TextureNumForName.private_seg_size, 16+(max(R_FlatNumForName.private_seg_size))
 ; CHECK: .set R_TextureNumForName.uses_vcc, or(1, R_FlatNumForName.uses_vcc)
 ; CHECK: .set R_TextureNumForName.uses_flat_scratch, or(0, R_FlatNumForName.uses_flat_scratch)
@@ -292,10 +292,10 @@ define i32 @R_TextureNumForName() {
 }
 
 ; CHECK-LABEL: G_Ticker:
-; CHECK: .set G_Ticker.num_vgpr, max(46, G_DoReborn.num_vgpr, F_Ticker.num_vgpr, AM_Stop.num_vgpr, F_StartFinale.num_vgpr, D_AdvanceDemo.num_vgpr, R_FlatNumForName.num_vgpr, R_TextureNumForName.num_vgpr, P_TempSaveGameFile.num_vgpr, P_SaveGameFile.num_vgpr, I_Error.num_vgpr)
+; CHECK: .set G_Ticker.num_vgpr, max(47, G_DoReborn.num_vgpr, F_Ticker.num_vgpr, AM_Stop.num_vgpr, F_StartFinale.num_vgpr, D_AdvanceDemo.num_vgpr, R_FlatNumForName.num_vgpr, R_TextureNumForName.num_vgpr, P_TempSaveGameFile.num_vgpr, P_SaveGameFile.num_vgpr, I_Error.num_vgpr)
 ; CHECK: .set G_Ticker.num_agpr, max(0, G_DoReborn.num_agpr, F_Ticker.num_agpr, AM_Stop.num_agpr, F_StartFinale.num_agpr, D_AdvanceDemo.num_agpr, R_FlatNumForName.num_agpr, R_TextureNumForName.num_agpr, P_TempSaveGameFile.num_agpr, P_SaveGameFile.num_agpr, I_Error.num_agpr)
-; CHECK: .set G_Ticker.numbered_sgpr, max(84, G_DoReborn.numbered_sgpr, F_Ticker.numbered_sgpr, AM_Stop.numbered_sgpr, F_StartFinale.numbered_sgpr, D_AdvanceDemo.numbered_sgpr, R_FlatNumForName.numbered_sgpr, R_TextureNumForName.numbered_sgpr, P_TempSaveGameFile.numbered_sgpr, P_SaveGameFile.numbered_sgpr, I_Error.numbered_sgpr)
-; CHECK: .set G_Ticker.private_seg_size, 32+(max(G_DoReborn.private_seg_size, F_Ticker.private_seg_size, AM_Stop.private_seg_size, F_StartFinale.private_seg_size, D_AdvanceDemo.private_seg_size, R_FlatNumForName.private_seg_size, R_TextureNumForName.private_seg_size, P_TempSaveGameFile.private_seg_size, P_SaveGameFile.private_seg_size, I_Error.private_seg_size))
+; CHECK: .set G_Ticker.numbered_sgpr, max(105, G_DoReborn.numbered_sgpr, F_Ticker.numbered_sgpr, AM_Stop.numbered_sgpr, F_StartFinale.numbered_sgpr, D_AdvanceDemo.numbered_sgpr, R_FlatNumForName.numbered_sgpr, R_TextureNumForName.numbered_sgpr, P_TempSaveGameFile.numbered_sgpr, P_SaveGameFile.numbered_sgpr, I_Error.numbered_sgpr)
+; CHECK: .set G_Ticker.private_seg_size, 48+(max(G_DoReborn.private_seg_size, F_Ticker.private_seg_size, AM_Stop.private_seg_size, F_StartFinale.private_seg_size, D_AdvanceDemo.private_seg_size, R_FlatNumForName.private_seg_size, R_TextureNumForName.private_seg_size, P_TempSaveGameFile.private_seg_size, P_SaveGameFile.private_seg_size, I_Error.private_seg_size))
 ; CHECK: .set G_Ticker.uses_vcc, or(1, G_DoReborn.uses_vcc, F_Ticker.uses_vcc, AM_Stop.uses_vcc, F_StartFinale.uses_vcc, D_AdvanceDemo.uses_vcc, R_FlatNumForName.uses_vcc, R_TextureNumForName.uses_vcc, P_TempSaveGameFile.uses_vcc, P_SaveGameFile.uses_vcc, I_Error.uses_vcc)
 ; CHECK: .set G_Ticker.uses_flat_scratch, or(0, G_DoReborn.uses_flat_scratch, F_Ticker.uses_flat_scratch, AM_Stop.uses_flat_scratch, F_StartFinale.uses_flat_scratch, D_AdvanceDemo.uses_flat_scratch, R_FlatNumForName.uses_flat_scratch, R_TextureNumForName.uses_flat_scratch, P_TempSaveGameFile.uses_flat_scratch, P_SaveGameFile.uses_flat_scratch, I_Error.uses_flat_scratch)
 ; CHECK: .set G_Ticker.has_dyn_sized_stack, or(0, G_DoReborn.has_dyn_sized_stack, F_Ticker.has_dyn_sized_stack, AM_Stop.has_dyn_sized_stack, F_StartFinale.has_dyn_sized_stack, D_AdvanceDemo.has_dyn_sized_stack, R_FlatNumForName.has_dyn_sized_stack, R_TextureNumForName.has_dyn_sized_stack, P_TempSaveGameFile.has_dyn_sized_stack, P_SaveGameFile.has_dyn_sized_stack, I_Error.has_dyn_sized_stack)
@@ -316,9 +316,9 @@ define void @G_Ticker() {
 }
 
 ; CHECK-LABEL: RunTic:
-; CHECK: .set RunTic.num_vgpr, max(46, G_CheckDemoStatus.num_vgpr, D_AdvanceDemo.num_vgpr, G_Ticker.num_vgpr)
+; CHECK: .set RunTic.num_vgpr, max(47, G_CheckDemoStatus.num_vgpr, D_AdvanceDemo.num_vgpr, G_Ticker.num_vgpr)
 ; CHECK: .set RunTic.num_agpr, max(0, G_CheckDemoStatus.num_agpr, D_AdvanceDemo.num_agpr, G_Ticker.num_agpr)
-; CHECK: .set RunTic.numbered_sgpr, max(84, G_CheckDemoStatus.numbered_sgpr, D_AdvanceDemo.numbered_sgpr, G_Ticker.numbered_sgpr)
+; CHECK: .set RunTic.numbered_sgpr, max(105, G_CheckDemoStatus.numbered_sgpr, D_AdvanceDemo.numbered_sgpr, G_Ticker.numbered_sgpr)
 ; CHECK: .set RunTic.private_seg_size, 32+(max(G_CheckDemoStatus.private_seg_size, D_AdvanceDemo.private_seg_size, G_Ticker.private_seg_size))
 ; CHECK: .set RunTic.uses_vcc, or(1, G_CheckDemoStatus.uses_vcc, D_AdvanceDemo.uses_vcc, G_Ticker.uses_vcc)
 ; CHECK: .set RunTic.uses_flat_scratch, or(0, G_CheckDemoStatus.uses_flat_scratch, D_AdvanceDemo.uses_flat_scratch, G_Ticker.uses_flat_scratch)

diff  --git a/llvm/test/CodeGen/AMDGPU/pei-scavenge-sgpr-carry-out.mir b/llvm/test/CodeGen/AMDGPU/pei-scavenge-sgpr-carry-out.mir
index ba6524caf668d..05cbd4c2a010d 100644
--- a/llvm/test/CodeGen/AMDGPU/pei-scavenge-sgpr-carry-out.mir
+++ b/llvm/test/CodeGen/AMDGPU/pei-scavenge-sgpr-carry-out.mir
@@ -27,39 +27,25 @@ body:             |
     liveins: $vgpr1
 
     ; CHECK-LABEL: name: scavenge_sgpr_pei_no_sgprs
-    ; CHECK: liveins: $vgpr1, $vgpr2
+    ; CHECK: liveins: $sgpr40, $sgpr41, $vgpr1
     ; CHECK-NEXT: {{  $}}
-    ; CHECK-NEXT: $sgpr4 = COPY $sgpr33
+    ; CHECK-NEXT: $sgpr40 = frame-setup COPY $sgpr33
     ; CHECK-NEXT: $sgpr33 = frame-setup S_ADD_I32 $sgpr32, 524224, implicit-def $scc
     ; CHECK-NEXT: $sgpr33 = frame-setup S_AND_B32 killed $sgpr33, 4294443008, implicit-def dead $scc
-    ; CHECK-NEXT: $sgpr6_sgpr7 = S_XOR_SAVEEXEC_B64 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec
-    ; CHECK-NEXT: $sgpr5 = S_ADD_I32 $sgpr33, 1048832, implicit-def dead $scc
-    ; CHECK-NEXT: BUFFER_STORE_DWORD_OFFSET $vgpr2, $sgpr0_sgpr1_sgpr2_sgpr3, killed $sgpr5, 0, 0, 0, implicit $exec :: (store (s32) into %stack.3, addrspace 5)
-    ; CHECK-NEXT: $exec = S_MOV_B64 killed $sgpr6_sgpr7
-    ; CHECK-NEXT: $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr4, 0, undef $vgpr2
-    ; CHECK-NEXT: $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr34, 1, undef $vgpr2
+    ; CHECK-NEXT: $sgpr41 = frame-setup COPY $sgpr34
     ; CHECK-NEXT: $sgpr34 = frame-setup COPY $sgpr32
     ; CHECK-NEXT: $sgpr32 = frame-setup S_ADD_I32 $sgpr32, 2097152, implicit-def dead $scc
     ; CHECK-NEXT: S_NOP 0, implicit-def $sgpr4, implicit-def $sgpr5, implicit-def $sgpr6, implicit-def $sgpr7, implicit-def $sgpr8, implicit-def $sgpr9, implicit-def $sgpr10, implicit-def $sgpr11, implicit-def $sgpr12, implicit-def $sgpr13, implicit-def $sgpr14, implicit-def $sgpr15, implicit-def $sgpr16, implicit-def $sgpr17, implicit-def $sgpr18, implicit-def $sgpr19, implicit-def $sgpr20, implicit-def $sgpr21, implicit-def $sgpr22, implicit-def $sgpr23, implicit-def $sgpr24, implicit-def $sgpr25, implicit-def $sgpr26, implicit-def $sgpr27, implicit-def $sgpr28, implicit-def $sgpr29, implicit-def $sgpr30, implicit-def $sgpr31, implicit-def $vcc
-    ; CHECK-NEXT: $sgpr33 = S_LSHR_B32 $sgpr33, 6, implicit-def $scc
-    ; CHECK-NEXT: $sgpr33 = S_ADD_I32 killed $sgpr33, 8192, implicit-def $scc
-    ; CHECK-NEXT: $vgpr0 = COPY killed $sgpr33
-    ; CHECK-NEXT: $sgpr33 = S_ADD_I32 killed $sgpr33, -8192, implicit-def $scc
-    ; CHECK-NEXT: $sgpr33 = S_LSHL_B32 $sgpr33, 6, implicit-def $scc
-    ; CHECK-NEXT: $sgpr33 = S_LSHR_B32 $sgpr33, 6, implicit-def $scc
-    ; CHECK-NEXT: $sgpr33 = S_ADD_I32 killed $sgpr33, 16384, implicit-def $scc
-    ; CHECK-NEXT: $vgpr3 = COPY killed $sgpr33
-    ; CHECK-NEXT: $sgpr33 = S_ADD_I32 killed $sgpr33, -16384, implicit-def $scc
-    ; CHECK-NEXT: $sgpr33 = S_LSHL_B32 $sgpr33, 6, implicit-def $scc
-    ; CHECK-NEXT: $vgpr0 = V_OR_B32_e32 killed $vgpr3, $vgpr1, implicit $exec, implicit $sgpr4, implicit $sgpr5, implicit $sgpr6, implicit $sgpr7, implicit $sgpr8, implicit $sgpr9, implicit $sgpr10, implicit $sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $sgpr16, implicit $sgpr17, implicit $sgpr18, implicit $sgpr19, implicit $sgpr20, implicit $sgpr21, implicit $sgpr22, implicit $sgpr23, implicit $sgpr24, implicit $sgpr25, implicit $sgpr26, implicit $sgpr27, implicit $sgpr28, implicit $sgpr29, implicit $sgpr30, implicit $sgpr31
+    ; CHECK-NEXT: $vgpr0 = V_LSHRREV_B32_e64 6, $sgpr33, implicit $exec
+    ; CHECK-NEXT: $sgpr42 = S_MOV_B32 8192
+    ; CHECK-NEXT: $vgpr0, dead $sgpr42_sgpr43 = V_ADD_CO_U32_e64 killed $sgpr42, killed $vgpr0, 0, implicit $exec
+    ; CHECK-NEXT: $vgpr2 = V_LSHRREV_B32_e64 6, $sgpr33, implicit $exec
+    ; CHECK-NEXT: $sgpr42 = S_MOV_B32 16384
+    ; CHECK-NEXT: $vgpr2, dead $sgpr42_sgpr43 = V_ADD_CO_U32_e64 killed $sgpr42, killed $vgpr2, 0, implicit $exec
+    ; CHECK-NEXT: $vgpr0 = V_OR_B32_e32 killed $vgpr2, $vgpr1, implicit $exec, implicit $sgpr4, implicit $sgpr5, implicit $sgpr6, implicit $sgpr7, implicit $sgpr8, implicit $sgpr9, implicit $sgpr10, implicit $sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $sgpr16, implicit $sgpr17, implicit $sgpr18, implicit $sgpr19, implicit $sgpr20, implicit $sgpr21, implicit $sgpr22, implicit $sgpr23, implicit $sgpr24, implicit $sgpr25, implicit $sgpr26, implicit $sgpr27, implicit $sgpr28, implicit $sgpr29, implicit $sgpr30, implicit $sgpr31
     ; CHECK-NEXT: $sgpr32 = frame-destroy COPY $sgpr34
-    ; CHECK-NEXT: $sgpr4 = SI_RESTORE_S32_FROM_VGPR $vgpr2, 0
-    ; CHECK-NEXT: $sgpr34 = SI_RESTORE_S32_FROM_VGPR $vgpr2, 1
-    ; CHECK-NEXT: $sgpr6_sgpr7 = S_XOR_SAVEEXEC_B64 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec
-    ; CHECK-NEXT: $sgpr5 = S_ADD_I32 $sgpr33, 1048832, implicit-def dead $scc
-    ; CHECK-NEXT: $vgpr2 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, killed $sgpr5, 0, 0, 0, implicit $exec :: (load (s32) from %stack.3, addrspace 5)
-    ; CHECK-NEXT: $exec = S_MOV_B64 killed $sgpr6_sgpr7
-    ; CHECK-NEXT: $sgpr33 = COPY $sgpr4
+    ; CHECK-NEXT: $sgpr34 = frame-destroy COPY $sgpr41
+    ; CHECK-NEXT: $sgpr33 = frame-destroy COPY $sgpr40
     ; CHECK-NEXT: S_ENDPGM 0, implicit $vcc
     S_NOP 0, implicit-def $sgpr4, implicit-def $sgpr5, implicit-def $sgpr6, implicit-def $sgpr7, implicit-def $sgpr8, implicit-def $sgpr9, implicit-def $sgpr10, implicit-def $sgpr11, implicit-def $sgpr12, implicit-def $sgpr13, implicit-def $sgpr14, implicit-def $sgpr15, implicit-def $sgpr16, implicit-def $sgpr17, implicit-def $sgpr18, implicit-def $sgpr19, implicit-def $sgpr20, implicit-def $sgpr21, implicit-def $sgpr22, implicit-def $sgpr23, implicit-def $sgpr24, implicit-def $sgpr25, implicit-def $sgpr26, implicit-def $sgpr27, implicit-def $sgpr28, implicit-def $sgpr29, implicit-def $sgpr30, implicit-def $sgpr31, implicit-def $vcc
     $vgpr0 = V_MOV_B32_e32 %stack.0, implicit $exec
@@ -89,36 +75,24 @@ body:             |
     liveins: $vgpr1
 
     ; CHECK-LABEL: name: scavenge_sgpr_pei_one_sgpr
-    ; CHECK: liveins: $sgpr29, $vgpr1, $vgpr2
+    ; CHECK: liveins: $sgpr29, $sgpr40, $vgpr1
     ; CHECK-NEXT: {{  $}}
     ; CHECK-NEXT: $sgpr29 = frame-setup COPY $sgpr33
     ; CHECK-NEXT: $sgpr33 = frame-setup S_ADD_I32 $sgpr32, 524224, implicit-def $scc
     ; CHECK-NEXT: $sgpr33 = frame-setup S_AND_B32 killed $sgpr33, 4294443008, implicit-def dead $scc
-    ; CHECK-NEXT: $sgpr4_sgpr5 = S_XOR_SAVEEXEC_B64 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec
-    ; CHECK-NEXT: $sgpr6 = S_ADD_I32 $sgpr33, 1048832, implicit-def dead $scc
-    ; CHECK-NEXT: BUFFER_STORE_DWORD_OFFSET $vgpr2, $sgpr0_sgpr1_sgpr2_sgpr3, killed $sgpr6, 0, 0, 0, implicit $exec :: (store (s32) into %stack.3, addrspace 5)
-    ; CHECK-NEXT: $exec = S_MOV_B64 killed $sgpr4_sgpr5
-    ; CHECK-NEXT: $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr34, 0, undef $vgpr2
+    ; CHECK-NEXT: $sgpr40 = frame-setup COPY $sgpr34
     ; CHECK-NEXT: $sgpr34 = frame-setup COPY $sgpr32
     ; CHECK-NEXT: $sgpr32 = frame-setup S_ADD_I32 $sgpr32, 2097152, implicit-def dead $scc
     ; CHECK-NEXT: S_NOP 0, implicit-def $sgpr4, implicit-def $sgpr5, implicit-def $sgpr6, implicit-def $sgpr7, implicit-def $sgpr8, implicit-def $sgpr9, implicit-def $sgpr10, implicit-def $sgpr11, implicit-def $sgpr12, implicit-def $sgpr13, implicit-def $sgpr14, implicit-def $sgpr15, implicit-def $sgpr16, implicit-def $sgpr17, implicit-def $sgpr18, implicit-def $sgpr19, implicit-def $sgpr20, implicit-def $sgpr21, implicit-def $sgpr22, implicit-def $sgpr23, implicit-def $sgpr24, implicit-def $sgpr25, implicit-def $sgpr26, implicit-def $sgpr27, implicit-def $sgpr28, implicit-def $sgpr30, implicit-def $sgpr31, implicit-def $vcc
-    ; CHECK-NEXT: $sgpr33 = S_LSHR_B32 $sgpr33, 6, implicit-def $scc
-    ; CHECK-NEXT: $sgpr33 = S_ADD_I32 killed $sgpr33, 8192, implicit-def $scc
-    ; CHECK-NEXT: $vgpr0 = COPY killed $sgpr33
-    ; CHECK-NEXT: $sgpr33 = S_ADD_I32 killed $sgpr33, -8192, implicit-def $scc
-    ; CHECK-NEXT: $sgpr33 = S_LSHL_B32 $sgpr33, 6, implicit-def $scc
-    ; CHECK-NEXT: $sgpr33 = S_LSHR_B32 $sgpr33, 6, implicit-def $scc
-    ; CHECK-NEXT: $sgpr33 = S_ADD_I32 killed $sgpr33, 16384, implicit-def $scc
-    ; CHECK-NEXT: $vgpr3 = COPY killed $sgpr33
-    ; CHECK-NEXT: $sgpr33 = S_ADD_I32 killed $sgpr33, -16384, implicit-def $scc
-    ; CHECK-NEXT: $sgpr33 = S_LSHL_B32 $sgpr33, 6, implicit-def $scc
-    ; CHECK-NEXT: $vgpr0 = V_OR_B32_e32 killed $vgpr3, $vgpr1, implicit $exec, implicit $sgpr4, implicit $sgpr5, implicit $sgpr6, implicit $sgpr7, implicit $sgpr8, implicit $sgpr9, implicit $sgpr10, implicit $sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $sgpr16, implicit $sgpr17, implicit $sgpr18, implicit $sgpr19, implicit $sgpr20, implicit $sgpr21, implicit $sgpr22, implicit $sgpr23, implicit $sgpr24, implicit $sgpr25, implicit $sgpr26, implicit $sgpr27, implicit $sgpr28, implicit $sgpr31
+    ; CHECK-NEXT: $vgpr0 = V_LSHRREV_B32_e64 6, $sgpr33, implicit $exec
+    ; CHECK-NEXT: $sgpr42 = S_MOV_B32 8192
+    ; CHECK-NEXT: $vgpr0, dead $sgpr42_sgpr43 = V_ADD_CO_U32_e64 killed $sgpr42, killed $vgpr0, 0, implicit $exec
+    ; CHECK-NEXT: $vgpr2 = V_LSHRREV_B32_e64 6, $sgpr33, implicit $exec
+    ; CHECK-NEXT: $sgpr42 = S_MOV_B32 16384
+    ; CHECK-NEXT: $vgpr2, dead $sgpr42_sgpr43 = V_ADD_CO_U32_e64 killed $sgpr42, killed $vgpr2, 0, implicit $exec
+    ; CHECK-NEXT: $vgpr0 = V_OR_B32_e32 killed $vgpr2, $vgpr1, implicit $exec, implicit $sgpr4, implicit $sgpr5, implicit $sgpr6, implicit $sgpr7, implicit $sgpr8, implicit $sgpr9, implicit $sgpr10, implicit $sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $sgpr16, implicit $sgpr17, implicit $sgpr18, implicit $sgpr19, implicit $sgpr20, implicit $sgpr21, implicit $sgpr22, implicit $sgpr23, implicit $sgpr24, implicit $sgpr25, implicit $sgpr26, implicit $sgpr27, implicit $sgpr28, implicit $sgpr31
     ; CHECK-NEXT: $sgpr32 = frame-destroy COPY $sgpr34
-    ; CHECK-NEXT: $sgpr34 = SI_RESTORE_S32_FROM_VGPR $vgpr2, 0
-    ; CHECK-NEXT: $sgpr4_sgpr5 = S_XOR_SAVEEXEC_B64 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec
-    ; CHECK-NEXT: $sgpr6 = S_ADD_I32 $sgpr33, 1048832, implicit-def dead $scc
-    ; CHECK-NEXT: $vgpr2 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, killed $sgpr6, 0, 0, 0, implicit $exec :: (load (s32) from %stack.3, addrspace 5)
-    ; CHECK-NEXT: $exec = S_MOV_B64 killed $sgpr4_sgpr5
+    ; CHECK-NEXT: $sgpr34 = frame-destroy COPY $sgpr40
     ; CHECK-NEXT: $sgpr33 = frame-destroy COPY $sgpr29
     ; CHECK-NEXT: S_ENDPGM 0, implicit $vcc
     S_NOP 0, implicit-def $sgpr4, implicit-def $sgpr5, implicit-def $sgpr6, implicit-def $sgpr7, implicit-def $sgpr8, implicit-def $sgpr9, implicit-def $sgpr10, implicit-def $sgpr11, implicit-def $sgpr12, implicit-def $sgpr13, implicit-def $sgpr14, implicit-def $sgpr15, implicit-def $sgpr16, implicit-def $sgpr17, implicit-def $sgpr18, implicit-def $sgpr19, implicit-def $sgpr20, implicit-def $sgpr21, implicit-def $sgpr22, implicit-def $sgpr23, implicit-def $sgpr24, implicit-def $sgpr25, implicit-def $sgpr26, implicit-def $sgpr27, implicit-def $sgpr28, implicit-def $sgpr30, implicit-def $sgpr31, implicit-def $vcc
@@ -158,16 +132,12 @@ body:             |
     ; CHECK-NEXT: $sgpr34 = frame-setup COPY $sgpr32
     ; CHECK-NEXT: $sgpr32 = frame-setup S_ADD_I32 $sgpr32, 2097152, implicit-def dead $scc
     ; CHECK-NEXT: S_NOP 0, implicit-def $sgpr4, implicit-def $sgpr5, implicit-def $sgpr6, implicit-def $sgpr7, implicit-def $sgpr8, implicit-def $sgpr9, implicit-def $sgpr10, implicit-def $sgpr11, implicit-def $sgpr12, implicit-def $sgpr13, implicit-def $sgpr14, implicit-def $sgpr15, implicit-def $sgpr16, implicit-def $sgpr17, implicit-def $sgpr18, implicit-def $sgpr19, implicit-def $sgpr20, implicit-def $sgpr21, implicit-def $sgpr22, implicit-def $sgpr23, implicit-def $sgpr24, implicit-def $sgpr25, implicit-def $sgpr26, implicit-def $sgpr27, implicit-def $sgpr30, implicit-def $sgpr31, implicit-def $vcc
-    ; CHECK-NEXT: $sgpr33 = S_LSHR_B32 $sgpr33, 6, implicit-def $scc
-    ; CHECK-NEXT: $sgpr33 = S_ADD_I32 killed $sgpr33, 8192, implicit-def $scc
-    ; CHECK-NEXT: $vgpr0 = COPY killed $sgpr33
-    ; CHECK-NEXT: $sgpr33 = S_ADD_I32 killed $sgpr33, -8192, implicit-def $scc
-    ; CHECK-NEXT: $sgpr33 = S_LSHL_B32 $sgpr33, 6, implicit-def $scc
-    ; CHECK-NEXT: $sgpr33 = S_LSHR_B32 $sgpr33, 6, implicit-def $scc
-    ; CHECK-NEXT: $sgpr33 = S_ADD_I32 killed $sgpr33, 16384, implicit-def $scc
-    ; CHECK-NEXT: $vgpr2 = COPY killed $sgpr33
-    ; CHECK-NEXT: $sgpr33 = S_ADD_I32 killed $sgpr33, -16384, implicit-def $scc
-    ; CHECK-NEXT: $sgpr33 = S_LSHL_B32 $sgpr33, 6, implicit-def $scc
+    ; CHECK-NEXT: $vgpr0 = V_LSHRREV_B32_e64 6, $sgpr33, implicit $exec
+    ; CHECK-NEXT: $sgpr40 = S_MOV_B32 8192
+    ; CHECK-NEXT: $vgpr0, dead $sgpr40_sgpr41 = V_ADD_CO_U32_e64 killed $sgpr40, killed $vgpr0, 0, implicit $exec
+    ; CHECK-NEXT: $vgpr2 = V_LSHRREV_B32_e64 6, $sgpr33, implicit $exec
+    ; CHECK-NEXT: $sgpr40 = S_MOV_B32 16384
+    ; CHECK-NEXT: $vgpr2, dead $sgpr40_sgpr41 = V_ADD_CO_U32_e64 killed $sgpr40, killed $vgpr2, 0, implicit $exec
     ; CHECK-NEXT: $vgpr0 = V_OR_B32_e32 killed $vgpr2, $vgpr1, implicit $exec, implicit $sgpr4, implicit $sgpr5, implicit $sgpr6, implicit $sgpr7, implicit $sgpr8, implicit $sgpr9, implicit $sgpr10, implicit $sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $sgpr16, implicit $sgpr17, implicit $sgpr18, implicit $sgpr19, implicit $sgpr20, implicit $sgpr21, implicit $sgpr22, implicit $sgpr23, implicit $sgpr24, implicit $sgpr25, implicit $sgpr26, implicit $sgpr27, implicit $sgpr31
     ; CHECK-NEXT: $sgpr32 = frame-destroy COPY $sgpr34
     ; CHECK-NEXT: $sgpr34 = frame-destroy COPY $sgpr29

diff  --git a/llvm/test/CodeGen/AMDGPU/pei-scavenge-sgpr-gfx9.mir b/llvm/test/CodeGen/AMDGPU/pei-scavenge-sgpr-gfx9.mir
index 162d12f651d4a..4f1c9a20fddc3 100644
--- a/llvm/test/CodeGen/AMDGPU/pei-scavenge-sgpr-gfx9.mir
+++ b/llvm/test/CodeGen/AMDGPU/pei-scavenge-sgpr-gfx9.mir
@@ -23,64 +23,42 @@ body:             |
     liveins: $vgpr1
 
     ; MUBUF-LABEL: name: scavenge_sgpr_pei_no_sgprs
-    ; MUBUF: liveins: $vgpr1, $vgpr2
+    ; MUBUF: liveins: $sgpr40, $sgpr41, $vgpr1
     ; MUBUF-NEXT: {{  $}}
-    ; MUBUF-NEXT: $sgpr4 = COPY $sgpr33
+    ; MUBUF-NEXT: $sgpr40 = frame-setup COPY $sgpr33
     ; MUBUF-NEXT: $sgpr33 = frame-setup S_ADD_I32 $sgpr32, 524224, implicit-def $scc
     ; MUBUF-NEXT: $sgpr33 = frame-setup S_AND_B32 killed $sgpr33, 4294443008, implicit-def dead $scc
-    ; MUBUF-NEXT: $sgpr6_sgpr7 = S_XOR_SAVEEXEC_B64 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec
-    ; MUBUF-NEXT: $sgpr5 = S_ADD_I32 $sgpr33, 1048832, implicit-def dead $scc
-    ; MUBUF-NEXT: BUFFER_STORE_DWORD_OFFSET $vgpr2, $sgpr0_sgpr1_sgpr2_sgpr3, killed $sgpr5, 0, 0, 0, implicit $exec :: (store (s32) into %stack.3, addrspace 5)
-    ; MUBUF-NEXT: $exec = S_MOV_B64 killed $sgpr6_sgpr7
-    ; MUBUF-NEXT: $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr4, 0, undef $vgpr2
-    ; MUBUF-NEXT: $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr34, 1, undef $vgpr2
+    ; MUBUF-NEXT: $sgpr41 = frame-setup COPY $sgpr34
     ; MUBUF-NEXT: $sgpr34 = frame-setup COPY $sgpr32
     ; MUBUF-NEXT: $sgpr32 = frame-setup S_ADD_I32 $sgpr32, 2097152, implicit-def dead $scc
     ; MUBUF-NEXT: S_NOP 0, implicit-def $sgpr4, implicit-def $sgpr5, implicit-def $sgpr6, implicit-def $sgpr7, implicit-def $sgpr8, implicit-def $sgpr9, implicit-def $sgpr10, implicit-def $sgpr11, implicit-def $sgpr12, implicit-def $sgpr13, implicit-def $sgpr14, implicit-def $sgpr15, implicit-def $sgpr16, implicit-def $sgpr17, implicit-def $sgpr18, implicit-def $sgpr19, implicit-def $sgpr20, implicit-def $sgpr21, implicit-def $sgpr22, implicit-def $sgpr23, implicit-def $sgpr24, implicit-def $sgpr25, implicit-def $sgpr26, implicit-def $sgpr27, implicit-def $sgpr28, implicit-def $sgpr29, implicit-def $sgpr30, implicit-def $sgpr31, implicit-def $vcc
     ; MUBUF-NEXT: $vgpr0 = V_LSHRREV_B32_e64 6, $sgpr33, implicit $exec
     ; MUBUF-NEXT: $vgpr0 = V_ADD_U32_e32 8192, killed $vgpr0, implicit $exec
-    ; MUBUF-NEXT: $vgpr3 = V_LSHRREV_B32_e64 6, $sgpr33, implicit $exec
-    ; MUBUF-NEXT: $vgpr3 = V_ADD_U32_e32 16384, killed $vgpr3, implicit $exec
-    ; MUBUF-NEXT: $vgpr0 = V_OR_B32_e32 killed $vgpr3, $vgpr1, implicit $exec, implicit $sgpr4, implicit $sgpr5, implicit $sgpr6, implicit $sgpr7, implicit $sgpr8, implicit $sgpr9, implicit $sgpr10, implicit $sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $sgpr16, implicit $sgpr17, implicit $sgpr18, implicit $sgpr19, implicit $sgpr20, implicit $sgpr21, implicit $sgpr22, implicit $sgpr23, implicit $sgpr24, implicit $sgpr25, implicit $sgpr26, implicit $sgpr27, implicit $sgpr28, implicit $sgpr29, implicit $sgpr30, implicit $sgpr31
+    ; MUBUF-NEXT: $vgpr2 = V_LSHRREV_B32_e64 6, $sgpr33, implicit $exec
+    ; MUBUF-NEXT: $vgpr2 = V_ADD_U32_e32 16384, killed $vgpr2, implicit $exec
+    ; MUBUF-NEXT: $vgpr0 = V_OR_B32_e32 killed $vgpr2, $vgpr1, implicit $exec, implicit $sgpr4, implicit $sgpr5, implicit $sgpr6, implicit $sgpr7, implicit $sgpr8, implicit $sgpr9, implicit $sgpr10, implicit $sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $sgpr16, implicit $sgpr17, implicit $sgpr18, implicit $sgpr19, implicit $sgpr20, implicit $sgpr21, implicit $sgpr22, implicit $sgpr23, implicit $sgpr24, implicit $sgpr25, implicit $sgpr26, implicit $sgpr27, implicit $sgpr28, implicit $sgpr29, implicit $sgpr30, implicit $sgpr31
     ; MUBUF-NEXT: $sgpr32 = frame-destroy COPY $sgpr34
-    ; MUBUF-NEXT: $sgpr4 = SI_RESTORE_S32_FROM_VGPR $vgpr2, 0
-    ; MUBUF-NEXT: $sgpr34 = SI_RESTORE_S32_FROM_VGPR $vgpr2, 1
-    ; MUBUF-NEXT: $sgpr6_sgpr7 = S_XOR_SAVEEXEC_B64 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec
-    ; MUBUF-NEXT: $sgpr5 = S_ADD_I32 $sgpr33, 1048832, implicit-def dead $scc
-    ; MUBUF-NEXT: $vgpr2 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, killed $sgpr5, 0, 0, 0, implicit $exec :: (load (s32) from %stack.3, addrspace 5)
-    ; MUBUF-NEXT: $exec = S_MOV_B64 killed $sgpr6_sgpr7
-    ; MUBUF-NEXT: $sgpr33 = COPY $sgpr4
+    ; MUBUF-NEXT: $sgpr34 = frame-destroy COPY $sgpr41
+    ; MUBUF-NEXT: $sgpr33 = frame-destroy COPY $sgpr40
     ; MUBUF-NEXT: S_ENDPGM 0, implicit $vcc
     ;
     ; FLATSCR-LABEL: name: scavenge_sgpr_pei_no_sgprs
-    ; FLATSCR: liveins: $vgpr1, $vgpr2
+    ; FLATSCR: liveins: $sgpr40, $sgpr41, $vgpr1
     ; FLATSCR-NEXT: {{  $}}
-    ; FLATSCR-NEXT: $sgpr4 = COPY $sgpr33
+    ; FLATSCR-NEXT: $sgpr40 = frame-setup COPY $sgpr33
     ; FLATSCR-NEXT: $sgpr33 = frame-setup S_ADD_I32 $sgpr32, 8191, implicit-def $scc
     ; FLATSCR-NEXT: $sgpr33 = frame-setup S_AND_B32 killed $sgpr33, 4294959104, implicit-def dead $scc
-    ; FLATSCR-NEXT: $sgpr6_sgpr7 = S_XOR_SAVEEXEC_B64 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec
-    ; FLATSCR-NEXT: $sgpr5 = S_ADD_I32 $sgpr33, 16388, implicit-def dead $scc
-    ; FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR $vgpr2, killed $sgpr5, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.3, addrspace 5)
-    ; FLATSCR-NEXT: $exec = S_MOV_B64 killed $sgpr6_sgpr7
-    ; FLATSCR-NEXT: $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr4, 0, undef $vgpr2
-    ; FLATSCR-NEXT: $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr34, 1, undef $vgpr2
+    ; FLATSCR-NEXT: $sgpr41 = frame-setup COPY $sgpr34
     ; FLATSCR-NEXT: $sgpr34 = frame-setup COPY $sgpr32
     ; FLATSCR-NEXT: $sgpr32 = frame-setup S_ADD_I32 $sgpr32, 32768, implicit-def dead $scc
     ; FLATSCR-NEXT: S_NOP 0, implicit-def $sgpr4, implicit-def $sgpr5, implicit-def $sgpr6, implicit-def $sgpr7, implicit-def $sgpr8, implicit-def $sgpr9, implicit-def $sgpr10, implicit-def $sgpr11, implicit-def $sgpr12, implicit-def $sgpr13, implicit-def $sgpr14, implicit-def $sgpr15, implicit-def $sgpr16, implicit-def $sgpr17, implicit-def $sgpr18, implicit-def $sgpr19, implicit-def $sgpr20, implicit-def $sgpr21, implicit-def $sgpr22, implicit-def $sgpr23, implicit-def $sgpr24, implicit-def $sgpr25, implicit-def $sgpr26, implicit-def $sgpr27, implicit-def $sgpr28, implicit-def $sgpr29, implicit-def $sgpr30, implicit-def $sgpr31, implicit-def $vcc
-    ; FLATSCR-NEXT: $sgpr33 = S_ADD_I32 $sgpr33, 8192, implicit-def $scc
-    ; FLATSCR-NEXT: $vgpr0 = V_MOV_B32_e32 $sgpr33, implicit $exec
-    ; FLATSCR-NEXT: $sgpr33 = S_ADD_I32 $sgpr33, -8192, implicit-def $scc
-    ; FLATSCR-NEXT: $sgpr33 = S_ADD_I32 $sgpr33, 16384, implicit-def $scc
-    ; FLATSCR-NEXT: $vgpr0 = V_OR_B32_e32 $sgpr33, $vgpr1, implicit $exec, implicit $sgpr4, implicit $sgpr5, implicit $sgpr6, implicit $sgpr7, implicit $sgpr8, implicit $sgpr9, implicit $sgpr10, implicit $sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $sgpr16, implicit $sgpr17, implicit $sgpr18, implicit $sgpr19, implicit $sgpr20, implicit $sgpr21, implicit $sgpr22, implicit $sgpr23, implicit $sgpr24, implicit $sgpr25, implicit $sgpr26, implicit $sgpr27, implicit $sgpr28, implicit $sgpr29, implicit $sgpr30, implicit $sgpr31
-    ; FLATSCR-NEXT: $sgpr33 = S_ADD_I32 $sgpr33, -16384, implicit-def $scc
+    ; FLATSCR-NEXT: $sgpr42 = S_ADD_I32 $sgpr33, 8192, implicit-def $scc
+    ; FLATSCR-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr42, implicit $exec
+    ; FLATSCR-NEXT: $sgpr42 = S_ADD_I32 $sgpr33, 16384, implicit-def $scc
+    ; FLATSCR-NEXT: $vgpr0 = V_OR_B32_e32 killed $sgpr42, $vgpr1, implicit $exec, implicit $sgpr4, implicit $sgpr5, implicit $sgpr6, implicit $sgpr7, implicit $sgpr8, implicit $sgpr9, implicit $sgpr10, implicit $sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $sgpr16, implicit $sgpr17, implicit $sgpr18, implicit $sgpr19, implicit $sgpr20, implicit $sgpr21, implicit $sgpr22, implicit $sgpr23, implicit $sgpr24, implicit $sgpr25, implicit $sgpr26, implicit $sgpr27, implicit $sgpr28, implicit $sgpr29, implicit $sgpr30, implicit $sgpr31
     ; FLATSCR-NEXT: $sgpr32 = frame-destroy COPY $sgpr34
-    ; FLATSCR-NEXT: $sgpr4 = SI_RESTORE_S32_FROM_VGPR $vgpr2, 0
-    ; FLATSCR-NEXT: $sgpr34 = SI_RESTORE_S32_FROM_VGPR $vgpr2, 1
-    ; FLATSCR-NEXT: $sgpr6_sgpr7 = S_XOR_SAVEEXEC_B64 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec
-    ; FLATSCR-NEXT: $sgpr5 = S_ADD_I32 $sgpr33, 16388, implicit-def dead $scc
-    ; FLATSCR-NEXT: $vgpr2 = SCRATCH_LOAD_DWORD_SADDR killed $sgpr5, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.3, addrspace 5)
-    ; FLATSCR-NEXT: $exec = S_MOV_B64 killed $sgpr6_sgpr7
-    ; FLATSCR-NEXT: $sgpr33 = COPY $sgpr4
+    ; FLATSCR-NEXT: $sgpr34 = frame-destroy COPY $sgpr41
+    ; FLATSCR-NEXT: $sgpr33 = frame-destroy COPY $sgpr40
     ; FLATSCR-NEXT: S_ENDPGM 0, implicit $vcc
     S_NOP 0, implicit-def $sgpr4, implicit-def $sgpr5, implicit-def $sgpr6, implicit-def $sgpr7, implicit-def $sgpr8, implicit-def $sgpr9, implicit-def $sgpr10, implicit-def $sgpr11, implicit-def $sgpr12, implicit-def $sgpr13, implicit-def $sgpr14, implicit-def $sgpr15, implicit-def $sgpr16, implicit-def $sgpr17, implicit-def $sgpr18, implicit-def $sgpr19, implicit-def $sgpr20, implicit-def $sgpr21, implicit-def $sgpr22, implicit-def $sgpr23, implicit-def $sgpr24, implicit-def $sgpr25, implicit-def $sgpr26, implicit-def $sgpr27, implicit-def $sgpr28, implicit-def $sgpr29, implicit-def $sgpr30, implicit-def $sgpr31, implicit-def $vcc
     $vgpr0 = V_MOV_B32_e32 %stack.0, implicit $exec

diff  --git a/llvm/test/CodeGen/AMDGPU/pei-scavenge-sgpr.mir b/llvm/test/CodeGen/AMDGPU/pei-scavenge-sgpr.mir
index a4f936a4d705c..480859a09a347 100644
--- a/llvm/test/CodeGen/AMDGPU/pei-scavenge-sgpr.mir
+++ b/llvm/test/CodeGen/AMDGPU/pei-scavenge-sgpr.mir
@@ -22,34 +22,22 @@ body:             |
     liveins: $vgpr1
 
     ; CHECK-LABEL: name: scavenge_sgpr_pei
-    ; CHECK: liveins: $vgpr1, $vgpr2
+    ; CHECK: liveins: $sgpr40, $sgpr41, $vgpr1
     ; CHECK-NEXT: {{  $}}
-    ; CHECK-NEXT: $sgpr4 = COPY $sgpr33
+    ; CHECK-NEXT: $sgpr40 = frame-setup COPY $sgpr33
     ; CHECK-NEXT: $sgpr33 = frame-setup S_ADD_I32 $sgpr32, 262080, implicit-def $scc
     ; CHECK-NEXT: $sgpr33 = frame-setup S_AND_B32 killed $sgpr33, 4294705152, implicit-def dead $scc
-    ; CHECK-NEXT: $sgpr6_sgpr7 = S_XOR_SAVEEXEC_B64 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec
-    ; CHECK-NEXT: $sgpr5 = S_ADD_I32 $sgpr33, 262400, implicit-def dead $scc
-    ; CHECK-NEXT: BUFFER_STORE_DWORD_OFFSET $vgpr2, $sgpr0_sgpr1_sgpr2_sgpr3, killed $sgpr5, 0, 0, 0, implicit $exec :: (store (s32) into %stack.2, addrspace 5)
-    ; CHECK-NEXT: $exec = S_MOV_B64 killed $sgpr6_sgpr7
-    ; CHECK-NEXT: $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr4, 0, undef $vgpr2
-    ; CHECK-NEXT: $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr34, 1, undef $vgpr2
+    ; CHECK-NEXT: $sgpr41 = frame-setup COPY $sgpr34
     ; CHECK-NEXT: $sgpr34 = frame-setup COPY $sgpr32
     ; CHECK-NEXT: $sgpr32 = frame-setup S_ADD_I32 $sgpr32, 786432, implicit-def dead $scc
     ; CHECK-NEXT: S_NOP 0, implicit-def $sgpr4, implicit-def $sgpr5, implicit-def $sgpr6, implicit-def $sgpr7, implicit-def $sgpr8, implicit-def $sgpr9, implicit-def $sgpr10, implicit-def $sgpr11, implicit-def $sgpr12, implicit-def $sgpr13, implicit-def $sgpr14, implicit-def $sgpr15, implicit-def $sgpr16, implicit-def $sgpr17, implicit-def $sgpr18, implicit-def $sgpr19, implicit-def $sgpr20, implicit-def $sgpr21, implicit-def $sgpr22, implicit-def $sgpr23, implicit-def $sgpr24, implicit-def $sgpr25, implicit-def $sgpr26, implicit-def $sgpr27, implicit-def $sgpr28, implicit-def $sgpr29, implicit-def $sgpr30, implicit-def $sgpr31, implicit-def $vcc
-    ; CHECK-NEXT: $sgpr33 = S_LSHR_B32 $sgpr33, 6, implicit-def $scc
-    ; CHECK-NEXT: $sgpr33 = S_ADD_I32 killed $sgpr33, 4096, implicit-def $scc
-    ; CHECK-NEXT: $vgpr3 = COPY killed $sgpr33
-    ; CHECK-NEXT: $sgpr33 = S_ADD_I32 killed $sgpr33, -4096, implicit-def $scc
-    ; CHECK-NEXT: $sgpr33 = S_LSHL_B32 $sgpr33, 6, implicit-def $scc
-    ; CHECK-NEXT: $vgpr0 = V_OR_B32_e32 killed $vgpr3, $vgpr1, implicit $exec, implicit $sgpr4, implicit $sgpr5, implicit $sgpr6, implicit $sgpr7, implicit $sgpr8, implicit $sgpr9, implicit $sgpr10, implicit $sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $sgpr16, implicit $sgpr17, implicit $sgpr18, implicit $sgpr19, implicit $sgpr20, implicit $sgpr21, implicit $sgpr22, implicit $sgpr23, implicit $sgpr24, implicit $sgpr25, implicit $sgpr26, implicit $sgpr27, implicit $sgpr28, implicit $sgpr29, implicit $sgpr30, implicit $sgpr31
+    ; CHECK-NEXT: $vgpr2 = V_LSHRREV_B32_e64 6, $sgpr33, implicit $exec
+    ; CHECK-NEXT: $sgpr42 = S_MOV_B32 4096
+    ; CHECK-NEXT: $vgpr2, dead $sgpr42_sgpr43 = V_ADD_CO_U32_e64 killed $sgpr42, killed $vgpr2, 0, implicit $exec
+    ; CHECK-NEXT: $vgpr0 = V_OR_B32_e32 killed $vgpr2, $vgpr1, implicit $exec, implicit $sgpr4, implicit $sgpr5, implicit $sgpr6, implicit $sgpr7, implicit $sgpr8, implicit $sgpr9, implicit $sgpr10, implicit $sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $sgpr16, implicit $sgpr17, implicit $sgpr18, implicit $sgpr19, implicit $sgpr20, implicit $sgpr21, implicit $sgpr22, implicit $sgpr23, implicit $sgpr24, implicit $sgpr25, implicit $sgpr26, implicit $sgpr27, implicit $sgpr28, implicit $sgpr29, implicit $sgpr30, implicit $sgpr31
     ; CHECK-NEXT: $sgpr32 = frame-destroy COPY $sgpr34
-    ; CHECK-NEXT: $sgpr4 = SI_RESTORE_S32_FROM_VGPR $vgpr2, 0
-    ; CHECK-NEXT: $sgpr34 = SI_RESTORE_S32_FROM_VGPR $vgpr2, 1
-    ; CHECK-NEXT: $sgpr6_sgpr7 = S_XOR_SAVEEXEC_B64 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec
-    ; CHECK-NEXT: $sgpr5 = S_ADD_I32 $sgpr33, 262400, implicit-def dead $scc
-    ; CHECK-NEXT: $vgpr2 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, killed $sgpr5, 0, 0, 0, implicit $exec :: (load (s32) from %stack.2, addrspace 5)
-    ; CHECK-NEXT: $exec = S_MOV_B64 killed $sgpr6_sgpr7
-    ; CHECK-NEXT: $sgpr33 = COPY $sgpr4
+    ; CHECK-NEXT: $sgpr34 = frame-destroy COPY $sgpr41
+    ; CHECK-NEXT: $sgpr33 = frame-destroy COPY $sgpr40
     ; CHECK-NEXT: S_ENDPGM 0, implicit $vcc
     S_NOP 0, implicit-def $sgpr4, implicit-def $sgpr5, implicit-def $sgpr6, implicit-def $sgpr7, implicit-def $sgpr8, implicit-def $sgpr9, implicit-def $sgpr10, implicit-def $sgpr11, implicit-def $sgpr12, implicit-def $sgpr13, implicit-def $sgpr14, implicit-def $sgpr15, implicit-def $sgpr16, implicit-def $sgpr17, implicit-def $sgpr18, implicit-def $sgpr19, implicit-def $sgpr20, implicit-def $sgpr21, implicit-def $sgpr22, implicit-def $sgpr23, implicit-def $sgpr24, implicit-def $sgpr25, implicit-def $sgpr26, implicit-def $sgpr27, implicit-def $sgpr28, implicit-def $sgpr29, implicit-def $sgpr30, implicit-def $sgpr31, implicit-def $vcc
     $vgpr0 = V_OR_B32_e32 %stack.0, $vgpr1, implicit $exec, implicit $sgpr4, implicit $sgpr5, implicit $sgpr6, implicit $sgpr7, implicit $sgpr8, implicit $sgpr9, implicit $sgpr10, implicit $sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $sgpr16, implicit $sgpr17, implicit $sgpr18, implicit $sgpr19, implicit $sgpr20, implicit $sgpr21, implicit $sgpr22, implicit $sgpr23, implicit $sgpr24, implicit $sgpr25, implicit $sgpr26, implicit $sgpr27, implicit $sgpr28, implicit $sgpr29, implicit $sgpr30, implicit $sgpr31

diff  --git a/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll b/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll
index 33720ea9b28e6..ee89bf406c2a3 100644
--- a/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll
+++ b/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll
@@ -1776,20 +1776,20 @@ entry:
 define amdgpu_kernel void @DiffBase(ptr addrspace(1) %buffer1,
 ; GFX8-LABEL: DiffBase:
 ; GFX8:       ; %bb.0: ; %entry
-; GFX8-NEXT:    s_mov_b32 s40, SCRATCH_RSRC_DWORD0
-; GFX8-NEXT:    s_mov_b32 s41, SCRATCH_RSRC_DWORD1
-; GFX8-NEXT:    s_mov_b32 s42, -1
-; GFX8-NEXT:    s_mov_b32 s43, 0xe80000
-; GFX8-NEXT:    s_add_u32 s40, s40, s11
-; GFX8-NEXT:    s_addc_u32 s41, s41, 0
+; GFX8-NEXT:    s_mov_b32 s48, SCRATCH_RSRC_DWORD0
+; GFX8-NEXT:    s_mov_b32 s49, SCRATCH_RSRC_DWORD1
+; GFX8-NEXT:    s_mov_b32 s50, -1
+; GFX8-NEXT:    s_mov_b32 s51, 0xe80000
+; GFX8-NEXT:    s_add_u32 s48, s48, s11
+; GFX8-NEXT:    s_addc_u32 s49, s49, 0
 ; GFX8-NEXT:    s_getpc_b64 s[0:1]
 ; GFX8-NEXT:    s_add_u32 s0, s0, _Z13get_global_idj at gotpcrel32@lo+4
 ; GFX8-NEXT:    s_addc_u32 s1, s1, _Z13get_global_idj at gotpcrel32@hi+12
 ; GFX8-NEXT:    s_load_dwordx4 s[36:39], s[4:5], 0x24
 ; GFX8-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
-; GFX8-NEXT:    s_mov_b64 s[0:1], s[40:41]
+; GFX8-NEXT:    s_mov_b64 s[0:1], s[48:49]
 ; GFX8-NEXT:    v_mov_b32_e32 v31, v0
-; GFX8-NEXT:    s_mov_b64 s[2:3], s[42:43]
+; GFX8-NEXT:    s_mov_b64 s[2:3], s[50:51]
 ; GFX8-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX8-NEXT:    s_mov_b32 s32, 0
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
@@ -1839,20 +1839,20 @@ define amdgpu_kernel void @DiffBase(ptr addrspace(1) %buffer1,
 ;
 ; GFX9-LABEL: DiffBase:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_mov_b32 s40, SCRATCH_RSRC_DWORD0
-; GFX9-NEXT:    s_mov_b32 s41, SCRATCH_RSRC_DWORD1
-; GFX9-NEXT:    s_mov_b32 s42, -1
-; GFX9-NEXT:    s_mov_b32 s43, 0xe00000
-; GFX9-NEXT:    s_add_u32 s40, s40, s11
-; GFX9-NEXT:    s_addc_u32 s41, s41, 0
+; GFX9-NEXT:    s_mov_b32 s48, SCRATCH_RSRC_DWORD0
+; GFX9-NEXT:    s_mov_b32 s49, SCRATCH_RSRC_DWORD1
+; GFX9-NEXT:    s_mov_b32 s50, -1
+; GFX9-NEXT:    s_mov_b32 s51, 0xe00000
+; GFX9-NEXT:    s_add_u32 s48, s48, s11
+; GFX9-NEXT:    s_addc_u32 s49, s49, 0
 ; GFX9-NEXT:    s_getpc_b64 s[0:1]
 ; GFX9-NEXT:    s_add_u32 s0, s0, _Z13get_global_idj at gotpcrel32@lo+4
 ; GFX9-NEXT:    s_addc_u32 s1, s1, _Z13get_global_idj at gotpcrel32@hi+12
 ; GFX9-NEXT:    s_load_dwordx4 s[36:39], s[4:5], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v31, v0
 ; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
-; GFX9-NEXT:    s_mov_b64 s[0:1], s[40:41]
-; GFX9-NEXT:    s_mov_b64 s[2:3], s[42:43]
+; GFX9-NEXT:    s_mov_b64 s[0:1], s[48:49]
+; GFX9-NEXT:    s_mov_b64 s[2:3], s[50:51]
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    s_mov_b32 s32, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
@@ -1898,12 +1898,12 @@ define amdgpu_kernel void @DiffBase(ptr addrspace(1) %buffer1,
 ;
 ; GFX10-LABEL: DiffBase:
 ; GFX10:       ; %bb.0: ; %entry
-; GFX10-NEXT:    s_mov_b32 s40, SCRATCH_RSRC_DWORD0
-; GFX10-NEXT:    s_mov_b32 s41, SCRATCH_RSRC_DWORD1
-; GFX10-NEXT:    s_mov_b32 s42, -1
-; GFX10-NEXT:    s_mov_b32 s43, 0x31c16000
-; GFX10-NEXT:    s_add_u32 s40, s40, s11
-; GFX10-NEXT:    s_addc_u32 s41, s41, 0
+; GFX10-NEXT:    s_mov_b32 s48, SCRATCH_RSRC_DWORD0
+; GFX10-NEXT:    s_mov_b32 s49, SCRATCH_RSRC_DWORD1
+; GFX10-NEXT:    s_mov_b32 s50, -1
+; GFX10-NEXT:    s_mov_b32 s51, 0x31c16000
+; GFX10-NEXT:    s_add_u32 s48, s48, s11
+; GFX10-NEXT:    s_addc_u32 s49, s49, 0
 ; GFX10-NEXT:    s_getpc_b64 s[0:1]
 ; GFX10-NEXT:    s_add_u32 s0, s0, _Z13get_global_idj at gotpcrel32@lo+4
 ; GFX10-NEXT:    s_addc_u32 s1, s1, _Z13get_global_idj at gotpcrel32@hi+12
@@ -1911,8 +1911,8 @@ define amdgpu_kernel void @DiffBase(ptr addrspace(1) %buffer1,
 ; GFX10-NEXT:    s_load_dwordx2 s[6:7], s[0:1], 0x0
 ; GFX10-NEXT:    s_load_dwordx4 s[36:39], s[4:5], 0x24
 ; GFX10-NEXT:    v_mov_b32_e32 v0, 0
-; GFX10-NEXT:    s_mov_b64 s[0:1], s[40:41]
-; GFX10-NEXT:    s_mov_b64 s[2:3], s[42:43]
+; GFX10-NEXT:    s_mov_b64 s[0:1], s[48:49]
+; GFX10-NEXT:    s_mov_b64 s[2:3], s[50:51]
 ; GFX10-NEXT:    s_mov_b32 s32, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[6:7]

diff  --git a/llvm/test/CodeGen/AMDGPU/ran-out-of-sgprs-allocation-failure.mir b/llvm/test/CodeGen/AMDGPU/ran-out-of-sgprs-allocation-failure.mir
index c6ee557d970cd..4a0bb6ceccd3f 100644
--- a/llvm/test/CodeGen/AMDGPU/ran-out-of-sgprs-allocation-failure.mir
+++ b/llvm/test/CodeGen/AMDGPU/ran-out-of-sgprs-allocation-failure.mir
@@ -41,63 +41,102 @@ body:             |
   ; CHECK-NEXT:   successors: %bb.1(0x80000000)
   ; CHECK-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr0, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   renamable $sgpr34_sgpr35 = V_CMP_GT_I32_e64 1, undef %18:vgpr_32, implicit $exec
-  ; CHECK-NEXT:   renamable $sgpr36_sgpr37 = V_CMP_EQ_U32_e64 0, undef %18:vgpr_32, implicit $exec
-  ; CHECK-NEXT:   renamable $sgpr38_sgpr39 = V_CMP_NE_U32_e64 0, undef %18:vgpr_32, implicit $exec
-  ; CHECK-NEXT:   renamable $sgpr40_sgpr41 = V_CMP_GT_I32_e64 0, undef %18:vgpr_32, implicit $exec
-  ; CHECK-NEXT:   renamable $sgpr60 = S_MOV_B32 0
-  ; CHECK-NEXT:   renamable $sgpr42_sgpr43 = V_CMP_EQ_U32_e64 undef $sgpr4, undef %18:vgpr_32, implicit $exec
-  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:vreg_1024_align2 = COPY renamable $sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55_sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63_sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75, implicit $exec
-  ; CHECK-NEXT:   renamable $sgpr44_sgpr45 = V_CMP_NE_U32_e64 1, undef %18:vgpr_32, implicit $exec
-  ; CHECK-NEXT:   renamable $sgpr61 = S_MOV_B32 1083786240
+  ; CHECK-NEXT:   renamable $sgpr18_sgpr19 = V_CMP_GT_I32_e64 1, undef %18:vgpr_32, implicit $exec
+  ; CHECK-NEXT:   renamable $sgpr12_sgpr13 = V_CMP_EQ_U32_e64 0, undef %18:vgpr_32, implicit $exec
+  ; CHECK-NEXT:   SI_SPILL_S64_SAVE killed renamable $sgpr12_sgpr13, %stack.2, implicit $exec, implicit $sgpr32 :: (store (s64) into %stack.2, align 4, addrspace 5)
+  ; CHECK-NEXT:   renamable $sgpr12_sgpr13 = V_CMP_NE_U32_e64 0, undef %18:vgpr_32, implicit $exec
+  ; CHECK-NEXT:   SI_SPILL_S64_SAVE killed renamable $sgpr12_sgpr13, %stack.3, implicit $exec, implicit $sgpr32 :: (store (s64) into %stack.3, align 4, addrspace 5)
+  ; CHECK-NEXT:   renamable $sgpr34_sgpr35 = V_CMP_GT_I32_e64 0, undef %18:vgpr_32, implicit $exec
+  ; CHECK-NEXT:   renamable $sgpr56 = S_MOV_B32 0
+  ; CHECK-NEXT:   renamable $sgpr12_sgpr13 = V_CMP_EQ_U32_e64 undef $sgpr4, undef %18:vgpr_32, implicit $exec
+  ; CHECK-NEXT:   SI_SPILL_S64_SAVE killed renamable $sgpr12_sgpr13, %stack.4, implicit $exec, implicit $sgpr32 :: (store (s64) into %stack.4, align 4, addrspace 5)
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:vreg_1024_align2 = COPY renamable $sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55_sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63_sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71, implicit $exec
+  ; CHECK-NEXT:   renamable $sgpr100_sgpr101 = V_CMP_NE_U32_e64 1, undef %18:vgpr_32, implicit $exec
+  ; CHECK-NEXT:   renamable $sgpr57 = S_MOV_B32 1083786240
+  ; CHECK-NEXT:   SI_SPILL_S1024_SAVE renamable $sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55_sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63_sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71, %stack.1, implicit $exec, implicit $sgpr32 :: (store (s1024) into %stack.1, align 4, addrspace 5)
   ; CHECK-NEXT:   S_BRANCH %bb.1
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.1:
   ; CHECK-NEXT:   successors: %bb.2(0x40000000), %bb.17(0x40000000)
-  ; CHECK-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x0000000000000003, $sgpr10_sgpr11, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55_sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63_sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75:0x0000000F00000000
+  ; CHECK-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x0000000000000003, $sgpr10_sgpr11, $sgpr18_sgpr19, $sgpr34_sgpr35, $sgpr100_sgpr101
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   $vcc = S_AND_B64 $exec, renamable $sgpr44_sgpr45, implicit-def dead $scc
+  ; CHECK-NEXT:   $vcc = S_AND_B64 $exec, renamable $sgpr100_sgpr101, implicit-def dead $scc
   ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:vreg_1024_align2 = COPY [[COPY]]
   ; CHECK-NEXT:   S_CBRANCH_VCCNZ %bb.2, implicit $vcc
   ; CHECK-NEXT:   S_BRANCH %bb.17
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.2:
   ; CHECK-NEXT:   successors: %bb.11(0x40000000), %bb.5(0x40000000)
-  ; CHECK-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x0000000000000003, $sgpr10_sgpr11, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55_sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63_sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75:0x0000000F00000000
-  ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   renamable $sgpr64 = COPY renamable $sgpr60
-  ; CHECK-NEXT:   renamable $sgpr65 = COPY renamable $sgpr60
-  ; CHECK-NEXT:   renamable $sgpr66 = COPY renamable $sgpr60
-  ; CHECK-NEXT:   renamable $sgpr67 = COPY renamable $sgpr60
-  ; CHECK-NEXT:   renamable $sgpr68 = COPY renamable $sgpr60
-  ; CHECK-NEXT:   renamable $sgpr69 = COPY renamable $sgpr60
-  ; CHECK-NEXT:   renamable $sgpr70 = COPY renamable $sgpr60
-  ; CHECK-NEXT:   renamable $sgpr71 = COPY renamable $sgpr60
-  ; CHECK-NEXT:   renamable $sgpr72 = COPY renamable $sgpr60
-  ; CHECK-NEXT:   renamable $sgpr73 = COPY renamable $sgpr60
-  ; CHECK-NEXT:   renamable $sgpr74 = COPY renamable $sgpr60
-  ; CHECK-NEXT:   renamable $sgpr75 = COPY renamable $sgpr60
-  ; CHECK-NEXT:   renamable $sgpr76 = COPY renamable $sgpr60
-  ; CHECK-NEXT:   renamable $sgpr77 = COPY renamable $sgpr60
-  ; CHECK-NEXT:   renamable $sgpr78 = COPY renamable $sgpr60
-  ; CHECK-NEXT:   renamable $sgpr79 = COPY renamable $sgpr60
-  ; CHECK-NEXT:   renamable $sgpr80 = COPY renamable $sgpr60
-  ; CHECK-NEXT:   renamable $sgpr81 = COPY renamable $sgpr60
-  ; CHECK-NEXT:   renamable $sgpr82 = COPY renamable $sgpr60
-  ; CHECK-NEXT:   renamable $sgpr83 = COPY renamable $sgpr60
-  ; CHECK-NEXT:   renamable $sgpr84 = COPY renamable $sgpr60
-  ; CHECK-NEXT:   renamable $sgpr85 = COPY renamable $sgpr60
-  ; CHECK-NEXT:   renamable $sgpr86 = COPY renamable $sgpr60
-  ; CHECK-NEXT:   renamable $sgpr87 = COPY renamable $sgpr60
-  ; CHECK-NEXT:   renamable $sgpr88 = COPY renamable $sgpr60
-  ; CHECK-NEXT:   renamable $sgpr89 = COPY renamable $sgpr60
-  ; CHECK-NEXT:   renamable $sgpr90 = COPY renamable $sgpr60
-  ; CHECK-NEXT:   renamable $sgpr91 = COPY renamable $sgpr60
-  ; CHECK-NEXT:   renamable $sgpr92 = COPY renamable $sgpr60
-  ; CHECK-NEXT:   renamable $sgpr93 = COPY renamable $sgpr60
-  ; CHECK-NEXT:   renamable $sgpr94 = COPY renamable $sgpr60
-  ; CHECK-NEXT:   renamable $sgpr95 = COPY renamable $sgpr60
-  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:vreg_1024_align2 = COPY killed renamable $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95, implicit $exec
+  ; CHECK-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x0000000000000003, $sgpr10_sgpr11, $sgpr18_sgpr19, $sgpr34_sgpr35, $sgpr100_sgpr101
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   renamable $sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63_sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87 = SI_SPILL_S1024_RESTORE %stack.1, implicit $exec, implicit $sgpr32 :: (load (s1024) from %stack.1, align 4, addrspace 5)
+  ; CHECK-NEXT:   renamable $sgpr40 = COPY renamable $sgpr72
+  ; CHECK-NEXT:   renamable $sgpr41 = COPY renamable $sgpr72
+  ; CHECK-NEXT:   renamable $sgpr42 = COPY renamable $sgpr72
+  ; CHECK-NEXT:   renamable $sgpr43 = COPY renamable $sgpr72
+  ; CHECK-NEXT:   renamable $sgpr44 = COPY renamable $sgpr72
+  ; CHECK-NEXT:   renamable $sgpr45 = COPY renamable $sgpr72
+  ; CHECK-NEXT:   renamable $sgpr46 = COPY renamable $sgpr72
+  ; CHECK-NEXT:   renamable $sgpr47 = COPY renamable $sgpr72
+  ; CHECK-NEXT:   renamable $sgpr48 = COPY renamable $sgpr72
+  ; CHECK-NEXT:   renamable $sgpr49 = COPY renamable $sgpr72
+  ; CHECK-NEXT:   renamable $sgpr50 = COPY renamable $sgpr72
+  ; CHECK-NEXT:   renamable $sgpr51 = COPY killed renamable $sgpr72
+  ; CHECK-NEXT:   renamable $sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55 = COPY killed renamable $sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51
+  ; CHECK-NEXT:   renamable $sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63_sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87 = SI_SPILL_S1024_RESTORE %stack.1, implicit $exec, implicit $sgpr32 :: (load (s1024) from %stack.1, align 4, addrspace 5)
+  ; CHECK-NEXT:   renamable $sgpr56 = COPY killed renamable $sgpr72
+  ; CHECK-NEXT:   renamable $sgpr60_sgpr61_sgpr62_sgpr63_sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91 = SI_SPILL_S1024_RESTORE %stack.1, implicit $exec, implicit $sgpr32 :: (load (s1024) from %stack.1, align 4, addrspace 5)
+  ; CHECK-NEXT:   renamable $sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51 = COPY killed renamable $sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55
+  ; CHECK-NEXT:   renamable $sgpr52 = COPY renamable $sgpr56
+  ; CHECK-NEXT:   renamable $sgpr53 = COPY killed renamable $sgpr76
+  ; CHECK-NEXT:   renamable $sgpr56_sgpr57 = COPY renamable $sgpr52_sgpr53
+  ; CHECK-NEXT:   renamable $sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55 = COPY killed renamable $sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51
+  ; CHECK-NEXT:   renamable $sgpr60_sgpr61_sgpr62_sgpr63_sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91 = SI_SPILL_S1024_RESTORE %stack.1, implicit $exec, implicit $sgpr32 :: (load (s1024) from %stack.1, align 4, addrspace 5)
+  ; CHECK-NEXT:   renamable $sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51 = COPY killed renamable $sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55
+  ; CHECK-NEXT:   renamable $sgpr52_sgpr53 = COPY renamable $sgpr56_sgpr57
+  ; CHECK-NEXT:   renamable $sgpr54 = COPY killed renamable $sgpr76
+  ; CHECK-NEXT:   renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47 = COPY killed renamable $sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51
+  ; CHECK-NEXT:   renamable $sgpr48_sgpr49_sgpr50 = COPY renamable $sgpr52_sgpr53_sgpr54
+  ; CHECK-NEXT:   renamable $sgpr52_sgpr53_sgpr54_sgpr55_sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63_sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83 = SI_SPILL_S1024_RESTORE %stack.1, implicit $exec, implicit $sgpr32 :: (load (s1024) from %stack.1, align 4, addrspace 5)
+  ; CHECK-NEXT:   renamable $sgpr52_sgpr53_sgpr54 = COPY renamable $sgpr48_sgpr49_sgpr50
+  ; CHECK-NEXT:   renamable $sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51 = COPY killed renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47
+  ; CHECK-NEXT:   renamable $sgpr55 = COPY killed renamable $sgpr68
+  ; CHECK-NEXT:   renamable $sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63_sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87 = SI_SPILL_S1024_RESTORE %stack.1, implicit $exec, implicit $sgpr32 :: (load (s1024) from %stack.1, align 4, addrspace 5)
+  ; CHECK-NEXT:   renamable $sgpr56 = COPY killed renamable $sgpr72
+  ; CHECK-NEXT:   renamable $sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95_sgpr96_sgpr97_sgpr98_sgpr99 = SI_SPILL_S1024_RESTORE %stack.1, implicit $exec, implicit $sgpr32 :: (load (s1024) from %stack.1, align 4, addrspace 5)
+  ; CHECK-NEXT:   renamable $sgpr57 = COPY killed renamable $sgpr84
+  ; CHECK-NEXT:   renamable $sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95_sgpr96_sgpr97_sgpr98_sgpr99 = SI_SPILL_S1024_RESTORE %stack.1, implicit $exec, implicit $sgpr32 :: (load (s1024) from %stack.1, align 4, addrspace 5)
+  ; CHECK-NEXT:   renamable $sgpr58 = COPY killed renamable $sgpr84
+  ; CHECK-NEXT:   renamable $sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95_sgpr96_sgpr97_sgpr98_sgpr99 = SI_SPILL_S1024_RESTORE %stack.1, implicit $exec, implicit $sgpr32 :: (load (s1024) from %stack.1, align 4, addrspace 5)
+  ; CHECK-NEXT:   renamable $sgpr59 = COPY killed renamable $sgpr84
+  ; CHECK-NEXT:   renamable $sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95_sgpr96_sgpr97_sgpr98_sgpr99 = SI_SPILL_S1024_RESTORE %stack.1, implicit $exec, implicit $sgpr32 :: (load (s1024) from %stack.1, align 4, addrspace 5)
+  ; CHECK-NEXT:   renamable $sgpr60 = COPY killed renamable $sgpr84
+  ; CHECK-NEXT:   renamable $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95 = SI_SPILL_S1024_RESTORE %stack.1, implicit $exec, implicit $sgpr32 :: (load (s1024) from %stack.1, align 4, addrspace 5)
+  ; CHECK-NEXT:   renamable $sgpr61 = COPY killed renamable $sgpr80
+  ; CHECK-NEXT:   renamable $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95 = SI_SPILL_S1024_RESTORE %stack.1, implicit $exec, implicit $sgpr32 :: (load (s1024) from %stack.1, align 4, addrspace 5)
+  ; CHECK-NEXT:   renamable $sgpr62 = COPY killed renamable $sgpr80
+  ; CHECK-NEXT:   renamable $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95 = SI_SPILL_S1024_RESTORE %stack.1, implicit $exec, implicit $sgpr32 :: (load (s1024) from %stack.1, align 4, addrspace 5)
+  ; CHECK-NEXT:   renamable $sgpr63 = COPY killed renamable $sgpr80
+  ; CHECK-NEXT:   renamable $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95 = SI_SPILL_S1024_RESTORE %stack.1, implicit $exec, implicit $sgpr32 :: (load (s1024) from %stack.1, align 4, addrspace 5)
+  ; CHECK-NEXT:   renamable $sgpr64 = COPY killed renamable $sgpr80
+  ; CHECK-NEXT:   renamable $sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95_sgpr96_sgpr97_sgpr98_sgpr99 = SI_SPILL_S1024_RESTORE %stack.1, implicit $exec, implicit $sgpr32 :: (load (s1024) from %stack.1, align 4, addrspace 5)
+  ; CHECK-NEXT:   renamable $sgpr65 = COPY killed renamable $sgpr84
+  ; CHECK-NEXT:   renamable $sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95_sgpr96_sgpr97_sgpr98_sgpr99 = SI_SPILL_S1024_RESTORE %stack.1, implicit $exec, implicit $sgpr32 :: (load (s1024) from %stack.1, align 4, addrspace 5)
+  ; CHECK-NEXT:   renamable $sgpr66 = COPY killed renamable $sgpr84
+  ; CHECK-NEXT:   renamable $sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95_sgpr96_sgpr97_sgpr98_sgpr99 = SI_SPILL_S1024_RESTORE %stack.1, implicit $exec, implicit $sgpr32 :: (load (s1024) from %stack.1, align 4, addrspace 5)
+  ; CHECK-NEXT:   renamable $sgpr67 = COPY killed renamable $sgpr84
+  ; CHECK-NEXT:   renamable $sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95_sgpr96_sgpr97_sgpr98_sgpr99 = SI_SPILL_S1024_RESTORE %stack.1, implicit $exec, implicit $sgpr32 :: (load (s1024) from %stack.1, align 4, addrspace 5)
+  ; CHECK-NEXT:   renamable $sgpr68 = COPY killed renamable $sgpr84
+  ; CHECK-NEXT:   renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51 = COPY killed renamable $sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55
+  ; CHECK-NEXT:   renamable $sgpr52_sgpr53_sgpr54_sgpr55_sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63 = COPY renamable $sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63_sgpr64_sgpr65_sgpr66_sgpr67
+  ; CHECK-NEXT:   renamable $sgpr64 = COPY renamable $sgpr68
+  ; CHECK-NEXT:   renamable $sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95_sgpr96_sgpr97_sgpr98_sgpr99 = SI_SPILL_S1024_RESTORE %stack.1, implicit $exec, implicit $sgpr32 :: (load (s1024) from %stack.1, align 4, addrspace 5)
+  ; CHECK-NEXT:   renamable $sgpr65 = COPY killed renamable $sgpr84
+  ; CHECK-NEXT:   renamable $sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95_sgpr96_sgpr97_sgpr98_sgpr99 = SI_SPILL_S1024_RESTORE %stack.1, implicit $exec, implicit $sgpr32 :: (load (s1024) from %stack.1, align 4, addrspace 5)
+  ; CHECK-NEXT:   renamable $sgpr66 = COPY killed renamable $sgpr84
+  ; CHECK-NEXT:   renamable $sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95_sgpr96_sgpr97_sgpr98_sgpr99 = SI_SPILL_S1024_RESTORE %stack.1, implicit $exec, implicit $sgpr32 :: (load (s1024) from %stack.1, align 4, addrspace 5)
+  ; CHECK-NEXT:   renamable $sgpr67 = COPY killed renamable $sgpr84
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:vreg_1024_align2 = COPY killed renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55_sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63_sgpr64_sgpr65_sgpr66_sgpr67, implicit $exec
   ; CHECK-NEXT:   S_CBRANCH_EXECZ %bb.11, implicit $exec
   ; CHECK-NEXT:   S_BRANCH %bb.5
   ; CHECK-NEXT: {{  $}}
@@ -126,111 +165,117 @@ body:             |
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.5:
   ; CHECK-NEXT:   successors: %bb.12(0x40000000), %bb.6(0x40000000)
-  ; CHECK-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x0000000000000003, $sgpr10_sgpr11, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55_sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63_sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75:0x0000000F00000000
+  ; CHECK-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x0000000000000003, $sgpr10_sgpr11, $sgpr18_sgpr19, $sgpr34_sgpr35, $sgpr100_sgpr101
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   renamable $sgpr12_sgpr13 = S_AND_B64 renamable $sgpr38_sgpr39, undef renamable $sgpr46_sgpr47, implicit-def dead $scc
-  ; CHECK-NEXT:   renamable $sgpr46_sgpr47 = V_CMP_GT_I32_e64 0, undef %18:vgpr_32, implicit $exec
+  ; CHECK-NEXT:   renamable $sgpr12_sgpr13 = SI_SPILL_S64_RESTORE %stack.3, implicit $exec, implicit $sgpr32 :: (load (s64) from %stack.3, align 4, addrspace 5)
+  ; CHECK-NEXT:   renamable $sgpr12_sgpr13 = S_AND_B64 killed renamable $sgpr12_sgpr13, undef renamable $sgpr54_sgpr55, implicit-def dead $scc
+  ; CHECK-NEXT:   renamable $sgpr54_sgpr55 = V_CMP_GT_I32_e64 0, undef %18:vgpr_32, implicit $exec
   ; CHECK-NEXT:   $exec = S_MOV_B64_term killed renamable $sgpr12_sgpr13
   ; CHECK-NEXT:   S_CBRANCH_EXECZ %bb.12, implicit $exec
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.6:
   ; CHECK-NEXT:   successors: %bb.7(0x80000000)
-  ; CHECK-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x0000000000000003, $sgpr10_sgpr11, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55_sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63_sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75:0x0000000F00000000
+  ; CHECK-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x0000000000000003, $sgpr10_sgpr11, $sgpr18_sgpr19, $sgpr34_sgpr35, $sgpr54_sgpr55, $sgpr100_sgpr101
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   dead [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, 1, $sgpr40_sgpr41, implicit $exec
+  ; CHECK-NEXT:   dead [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, 1, $sgpr34_sgpr35, implicit $exec
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.7:
   ; CHECK-NEXT:   successors: %bb.8(0x80000000)
-  ; CHECK-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x0000000000000003, $sgpr10_sgpr11, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55_sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63_sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75:0x0000000F00000000
+  ; CHECK-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x0000000000000003, $sgpr10_sgpr11, $sgpr18_sgpr19, $sgpr34_sgpr35, $sgpr54_sgpr55, $sgpr100_sgpr101
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   renamable $sgpr48_sgpr49 = nofpexcept V_CMP_NLT_F64_e64 0, undef $sgpr4_sgpr5, 0, undef %29:vreg_64_align2, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   renamable $sgpr50_sgpr51 = nofpexcept V_CMP_NLT_F64_e64 0, 4607182418800017408, 0, undef %29:vreg_64_align2, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   renamable $sgpr64_sgpr65 = nofpexcept V_CMP_NLT_F64_e64 0, undef $sgpr4_sgpr5, 0, undef %29:vreg_64_align2, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   renamable $sgpr66_sgpr67 = nofpexcept V_CMP_NLT_F64_e64 0, 4607182418800017408, 0, undef %29:vreg_64_align2, 0, implicit $mode, implicit $exec
   ; CHECK-NEXT:   dead [[V_INDIRECT_REG_READ_GPR_IDX_B32_V32_:%[0-9]+]]:vgpr_32 = V_INDIRECT_REG_READ_GPR_IDX_B32_V32 [[COPY1]], undef $sgpr14, 11, implicit-def $m0, implicit $m0, implicit $exec
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.8:
   ; CHECK-NEXT:   successors: %bb.10(0x40000000), %bb.9(0x40000000)
-  ; CHECK-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x0000000000000003, $sgpr10_sgpr11, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55_sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63_sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75:0x0000000F00000000
+  ; CHECK-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x0000000000000003, $sgpr10_sgpr11, $sgpr18_sgpr19, $sgpr34_sgpr35, $sgpr54_sgpr55, $sgpr64_sgpr65, $sgpr66_sgpr67, $sgpr100_sgpr101
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   $vcc = S_AND_B64 $exec, renamable $sgpr48_sgpr49, implicit-def dead $scc
+  ; CHECK-NEXT:   $vcc = S_AND_B64 $exec, renamable $sgpr64_sgpr65, implicit-def dead $scc
   ; CHECK-NEXT:   S_CBRANCH_VCCNZ %bb.10, implicit $vcc
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.9:
   ; CHECK-NEXT:   successors: %bb.10(0x40000000), %bb.17(0x40000000)
-  ; CHECK-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x0000000000000003, $sgpr10_sgpr11, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55_sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63_sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75:0x0000000F00000000
+  ; CHECK-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x0000000000000003, $sgpr10_sgpr11, $sgpr18_sgpr19, $sgpr34_sgpr35, $sgpr54_sgpr55, $sgpr64_sgpr65, $sgpr66_sgpr67, $sgpr100_sgpr101
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:vreg_64_align2 = COPY renamable $sgpr60_sgpr61, implicit $exec
+  ; CHECK-NEXT:   renamable $sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95_sgpr96_sgpr97_sgpr98_sgpr99 = SI_SPILL_S1024_RESTORE %stack.1, implicit $exec, implicit $sgpr32 :: (load (s1024) from %stack.1, align 4, addrspace 5)
+  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:vreg_64_align2 = COPY killed renamable $sgpr84_sgpr85, implicit $exec
   ; CHECK-NEXT:   GLOBAL_STORE_DWORDX2_SADDR undef %18:vgpr_32, [[COPY2]], undef renamable $sgpr4_sgpr5, 0, 0, implicit $exec :: (store (s64), addrspace 1)
-  ; CHECK-NEXT:   [[V_CNDMASK_B32_e64_1:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, 1, $sgpr34_sgpr35, implicit $exec
+  ; CHECK-NEXT:   [[V_CNDMASK_B32_e64_1:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, 1, $sgpr18_sgpr19, implicit $exec
   ; CHECK-NEXT:   dead renamable $sgpr12_sgpr13 = V_CMP_NE_U32_e64 1, [[V_CNDMASK_B32_e64_1]], implicit $exec
-  ; CHECK-NEXT:   renamable $sgpr58 = S_ADD_U32 renamable $sgpr8, 32, implicit-def dead $scc
+  ; CHECK-NEXT:   renamable $sgpr82 = S_ADD_U32 renamable $sgpr8, 32, implicit-def dead $scc
   ; CHECK-NEXT:   ADJCALLSTACKUP 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32
-  ; CHECK-NEXT:   renamable $sgpr52_sgpr53 = COPY killed renamable $sgpr4_sgpr5
-  ; CHECK-NEXT:   $sgpr4_sgpr5 = COPY renamable $sgpr52_sgpr53
-  ; CHECK-NEXT:   renamable $sgpr54_sgpr55 = COPY killed renamable $sgpr6_sgpr7
-  ; CHECK-NEXT:   $sgpr6_sgpr7 = COPY renamable $sgpr54_sgpr55
-  ; CHECK-NEXT:   renamable $sgpr56_sgpr57 = COPY killed renamable $sgpr10_sgpr11
-  ; CHECK-NEXT:   $sgpr10_sgpr11 = COPY renamable $sgpr56_sgpr57
+  ; CHECK-NEXT:   renamable $sgpr68_sgpr69 = COPY killed renamable $sgpr4_sgpr5
+  ; CHECK-NEXT:   $sgpr4_sgpr5 = COPY renamable $sgpr68_sgpr69
+  ; CHECK-NEXT:   renamable $sgpr70_sgpr71 = COPY killed renamable $sgpr6_sgpr7
+  ; CHECK-NEXT:   $sgpr6_sgpr7 = COPY renamable $sgpr70_sgpr71
+  ; CHECK-NEXT:   renamable $sgpr80_sgpr81 = COPY killed renamable $sgpr10_sgpr11
+  ; CHECK-NEXT:   $sgpr10_sgpr11 = COPY renamable $sgpr80_sgpr81
   ; CHECK-NEXT:   $sgpr12 = COPY renamable $sgpr14
   ; CHECK-NEXT:   $sgpr13 = COPY renamable $sgpr15
-  ; CHECK-NEXT:   renamable $sgpr62 = COPY killed renamable $sgpr8
+  ; CHECK-NEXT:   renamable $sgpr84 = COPY killed renamable $sgpr8
   ; CHECK-NEXT:   renamable $sgpr33 = COPY killed renamable $sgpr16
-  ; CHECK-NEXT:   renamable $sgpr59 = COPY killed renamable $sgpr15
-  ; CHECK-NEXT:   renamable $sgpr63 = COPY killed renamable $sgpr14
+  ; CHECK-NEXT:   renamable $sgpr83 = COPY killed renamable $sgpr15
+  ; CHECK-NEXT:   renamable $sgpr85 = COPY killed renamable $sgpr14
+  ; CHECK-NEXT:   renamable $sgpr48_sgpr49 = COPY killed renamable $sgpr18_sgpr19
   ; CHECK-NEXT:   dead $sgpr30_sgpr31 = SI_CALL undef renamable $sgpr12_sgpr13, 0, csr_amdgpu_gfx90ainsts, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13
   ; CHECK-NEXT:   ADJCALLSTACKDOWN 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32
   ; CHECK-NEXT:   ADJCALLSTACKUP 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32
-  ; CHECK-NEXT:   $sgpr8_sgpr9 = COPY renamable $sgpr58_sgpr59
+  ; CHECK-NEXT:   $sgpr8_sgpr9 = COPY renamable $sgpr82_sgpr83
   ; CHECK-NEXT:   dead $sgpr30_sgpr31 = SI_CALL undef renamable $sgpr12_sgpr13, 0, csr_amdgpu_gfx90ainsts, implicit $sgpr8_sgpr9
-  ; CHECK-NEXT:   renamable $sgpr14 = COPY killed renamable $sgpr63
-  ; CHECK-NEXT:   renamable $sgpr15 = COPY killed renamable $sgpr59
+  ; CHECK-NEXT:   renamable $sgpr18_sgpr19 = COPY killed renamable $sgpr48_sgpr49
+  ; CHECK-NEXT:   renamable $sgpr14 = COPY killed renamable $sgpr85
+  ; CHECK-NEXT:   renamable $sgpr15 = COPY killed renamable $sgpr83
   ; CHECK-NEXT:   renamable $sgpr16 = COPY killed renamable $sgpr33
-  ; CHECK-NEXT:   renamable $sgpr4_sgpr5 = COPY killed renamable $sgpr52_sgpr53
-  ; CHECK-NEXT:   renamable $sgpr6_sgpr7 = COPY killed renamable $sgpr54_sgpr55
-  ; CHECK-NEXT:   renamable $sgpr8 = COPY killed renamable $sgpr62
-  ; CHECK-NEXT:   renamable $sgpr10_sgpr11 = COPY killed renamable $sgpr56_sgpr57
+  ; CHECK-NEXT:   renamable $sgpr4_sgpr5 = COPY killed renamable $sgpr68_sgpr69
+  ; CHECK-NEXT:   renamable $sgpr6_sgpr7 = COPY killed renamable $sgpr70_sgpr71
+  ; CHECK-NEXT:   renamable $sgpr8 = COPY killed renamable $sgpr84
+  ; CHECK-NEXT:   renamable $sgpr10_sgpr11 = COPY killed renamable $sgpr80_sgpr81
   ; CHECK-NEXT:   ADJCALLSTACKDOWN 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32
-  ; CHECK-NEXT:   $exec = S_MOV_B64_term renamable $sgpr50_sgpr51
+  ; CHECK-NEXT:   $exec = S_MOV_B64_term renamable $sgpr66_sgpr67
   ; CHECK-NEXT:   S_CBRANCH_EXECZ %bb.10, implicit $exec
   ; CHECK-NEXT:   S_BRANCH %bb.17
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.10:
   ; CHECK-NEXT:   successors: %bb.8(0x40000000), %bb.12(0x40000000)
-  ; CHECK-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x0000000000000003, $sgpr10_sgpr11, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55_sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63_sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75:0x0000000F00000000
+  ; CHECK-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x0000000000000003, $sgpr10_sgpr11, $sgpr18_sgpr19, $sgpr34_sgpr35, $sgpr54_sgpr55, $sgpr64_sgpr65, $sgpr66_sgpr67, $sgpr100_sgpr101
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   S_CBRANCH_EXECZ %bb.8, implicit $exec
   ; CHECK-NEXT:   S_BRANCH %bb.12
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.11:
   ; CHECK-NEXT:   successors: %bb.1(0x40000000), %bb.17(0x40000000)
-  ; CHECK-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x0000000000000003, $sgpr10_sgpr11, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55_sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63_sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75:0x0000000F00000000
+  ; CHECK-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x0000000000000003, $sgpr10_sgpr11, $sgpr18_sgpr19, $sgpr34_sgpr35, $sgpr100_sgpr101
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   S_CBRANCH_EXECZ %bb.1, implicit $exec
   ; CHECK-NEXT:   S_BRANCH %bb.17
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.12:
   ; CHECK-NEXT:   successors: %bb.11(0x40000000), %bb.13(0x40000000)
-  ; CHECK-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x0000000000000003, $sgpr10_sgpr11, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55_sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63_sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75:0x0000000F00000000
+  ; CHECK-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x0000000000000003, $sgpr10_sgpr11, $sgpr18_sgpr19, $sgpr34_sgpr35, $sgpr54_sgpr55, $sgpr100_sgpr101
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   $exec = S_MOV_B64_term killed renamable $sgpr46_sgpr47
+  ; CHECK-NEXT:   $exec = S_MOV_B64_term killed renamable $sgpr54_sgpr55
   ; CHECK-NEXT:   S_CBRANCH_EXECZ %bb.11, implicit $exec
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.13:
   ; CHECK-NEXT:   successors: %bb.15(0x40000000), %bb.14(0x40000000)
-  ; CHECK-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x0000000000000003, $sgpr10_sgpr11, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55_sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63_sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75:0x0000000F00000000
+  ; CHECK-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x0000000000000003, $sgpr10_sgpr11, $sgpr18_sgpr19, $sgpr34_sgpr35, $sgpr100_sgpr101
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   $vcc = S_AND_B64 $exec, renamable $sgpr42_sgpr43, implicit-def dead $scc
+  ; CHECK-NEXT:   renamable $sgpr12_sgpr13 = SI_SPILL_S64_RESTORE %stack.4, implicit $exec, implicit $sgpr32 :: (load (s64) from %stack.4, align 4, addrspace 5)
+  ; CHECK-NEXT:   $vcc = S_AND_B64 $exec, killed renamable $sgpr12_sgpr13, implicit-def dead $scc
   ; CHECK-NEXT:   S_CBRANCH_VCCNZ %bb.15, implicit $vcc
   ; CHECK-NEXT:   S_BRANCH %bb.14
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.14:
   ; CHECK-NEXT:   successors: %bb.15(0x80000000)
-  ; CHECK-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x0000000000000003, $sgpr10_sgpr11, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55_sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63_sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75:0x0000000F00000000
+  ; CHECK-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x0000000000000003, $sgpr10_sgpr11, $sgpr18_sgpr19, $sgpr34_sgpr35, $sgpr100_sgpr101
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.15:
   ; CHECK-NEXT:   successors: %bb.11(0x40000000), %bb.16(0x40000000)
-  ; CHECK-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x0000000000000003, $sgpr10_sgpr11, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55_sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63_sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75:0x0000000F00000000
+  ; CHECK-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x0000000000000003, $sgpr10_sgpr11, $sgpr18_sgpr19, $sgpr34_sgpr35, $sgpr100_sgpr101
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   $vcc = S_AND_B64 $exec, renamable $sgpr36_sgpr37, implicit-def dead $scc
+  ; CHECK-NEXT:   renamable $sgpr12_sgpr13 = SI_SPILL_S64_RESTORE %stack.2, implicit $exec, implicit $sgpr32 :: (load (s64) from %stack.2, align 4, addrspace 5)
+  ; CHECK-NEXT:   $vcc = S_AND_B64 $exec, killed renamable $sgpr12_sgpr13, implicit-def dead $scc
   ; CHECK-NEXT:   S_CBRANCH_VCCNZ %bb.11, implicit $vcc
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.16:

diff  --git a/llvm/test/CodeGen/AMDGPU/schedule-amdgpu-tracker-physreg.ll b/llvm/test/CodeGen/AMDGPU/schedule-amdgpu-tracker-physreg.ll
index 570ea4b7132aa..0d25bc97ff775 100644
--- a/llvm/test/CodeGen/AMDGPU/schedule-amdgpu-tracker-physreg.ll
+++ b/llvm/test/CodeGen/AMDGPU/schedule-amdgpu-tracker-physreg.ll
@@ -246,10 +246,10 @@ bb3:
 ; CHECK-LABEL: {{^}}spill_func:
 ; GCN:    NumSgprs: 104
 ; GCN-GCNTRACKERS:    NumSgprs: 104
-; GCN:    NumVgprs: 3
-; GCN-GCNTRACKERS:    NumVgprs: 4
-; GCN:    ScratchSize: 12
-; GCN-GCNTRACKERS:    ScratchSize: 16
+; GCN:    NumVgprs: 2
+; GCN-GCNTRACKERS:    NumVgprs: 3
+; GCN:    ScratchSize: 8
+; GCN-GCNTRACKERS:    ScratchSize: 12
 
 define void @spill_func(ptr addrspace(1) %arg) #0 {
 entry:

diff  --git a/llvm/test/CodeGen/AMDGPU/select.f16.ll b/llvm/test/CodeGen/AMDGPU/select.f16.ll
index d345b57d3d08b..e920fdee51815 100644
--- a/llvm/test/CodeGen/AMDGPU/select.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/select.f16.ll
@@ -1903,19 +1903,14 @@ define <16 x half> @v_vselect_v16f16(<16 x half> %a, <16 x half> %b, <16 x i32>
 ; VI-LABEL: v_vselect_v16f16:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
-; VI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
-; VI-NEXT:    s_mov_b64 exec, s[4:5]
-; VI-NEXT:    v_writelane_b32 v31, s30, 0
-; VI-NEXT:    v_writelane_b32 v31, s31, 1
 ; VI-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v16
 ; VI-NEXT:    v_cmp_eq_u32_e64 s[18:19], 0, v17
-; VI-NEXT:    v_cmp_eq_u32_e64 s[30:31], 0, v29
+; VI-NEXT:    v_cmp_eq_u32_e64 s[40:41], 0, v29
 ; VI-NEXT:    v_lshrrev_b32_e32 v16, 16, v6
 ; VI-NEXT:    v_lshrrev_b32_e32 v17, 16, v14
 ; VI-NEXT:    v_cmp_eq_u32_e64 s[6:7], 0, v18
 ; VI-NEXT:    v_cmp_eq_u32_e64 s[28:29], 0, v27
-; VI-NEXT:    v_cndmask_b32_e64 v16, v17, v16, s[30:31]
+; VI-NEXT:    v_cndmask_b32_e64 v16, v17, v16, s[40:41]
 ; VI-NEXT:    v_lshrrev_b32_e32 v17, 16, v5
 ; VI-NEXT:    v_lshrrev_b32_e32 v18, 16, v13
 ; VI-NEXT:    v_cmp_eq_u32_e64 s[20:21], 0, v19
@@ -1957,8 +1952,6 @@ define <16 x half> @v_vselect_v16f16(<16 x half> %a, <16 x half> %b, <16 x i32>
 ; VI-NEXT:    v_cmp_eq_u32_e64 s[16:17], 0, v28
 ; VI-NEXT:    v_cndmask_b32_e64 v5, v13, v5, s[14:15]
 ; VI-NEXT:    v_cndmask_b32_e64 v6, v14, v6, s[16:17]
-; VI-NEXT:    v_readlane_b32 s31, v31, 1
-; VI-NEXT:    v_readlane_b32 s30, v31, 0
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v8
 ; VI-NEXT:    v_cndmask_b32_e32 v8, v10, v9, vcc
@@ -1976,10 +1969,6 @@ define <16 x half> @v_vselect_v16f16(<16 x half> %a, <16 x half> %b, <16 x i32>
 ; VI-NEXT:    v_lshlrev_b32_e32 v8, 16, v8
 ; VI-NEXT:    v_or_b32_sdwa v6, v6, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
-; VI-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
-; VI-NEXT:    s_mov_b64 exec, s[4:5]
-; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_vselect_v16f16:

diff  --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v2i64.v8i64.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v2i64.v8i64.ll
index dc227f745aa9a..7ee7c83e0122d 100644
--- a/llvm/test/CodeGen/AMDGPU/shufflevector.v2i64.v8i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v2i64.v8i64.ll
@@ -15008,18 +15008,10 @@ define void @s_shuffle_v2i64_v8i64__15_6() {
 ; GFX900-NEXT:    v_writelane_b32 v0, s37, 1
 ; GFX900-NEXT:    v_writelane_b32 v0, s38, 2
 ; GFX900-NEXT:    v_writelane_b32 v0, s39, 3
-; GFX900-NEXT:    v_writelane_b32 v0, s40, 4
-; GFX900-NEXT:    v_writelane_b32 v0, s41, 5
-; GFX900-NEXT:    v_writelane_b32 v0, s42, 6
-; GFX900-NEXT:    v_writelane_b32 v0, s43, 7
-; GFX900-NEXT:    v_writelane_b32 v0, s44, 8
-; GFX900-NEXT:    v_writelane_b32 v0, s45, 9
-; GFX900-NEXT:    v_writelane_b32 v0, s46, 10
-; GFX900-NEXT:    v_writelane_b32 v0, s47, 11
-; GFX900-NEXT:    v_writelane_b32 v0, s48, 12
-; GFX900-NEXT:    v_writelane_b32 v0, s49, 13
-; GFX900-NEXT:    v_writelane_b32 v0, s50, 14
-; GFX900-NEXT:    v_writelane_b32 v0, s51, 15
+; GFX900-NEXT:    v_writelane_b32 v0, s48, 4
+; GFX900-NEXT:    v_writelane_b32 v0, s49, 5
+; GFX900-NEXT:    v_writelane_b32 v0, s50, 6
+; GFX900-NEXT:    v_writelane_b32 v0, s51, 7
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[4:19]
 ; GFX900-NEXT:    ;;#ASMEND
@@ -15033,18 +15025,10 @@ define void @s_shuffle_v2i64_v8i64__15_6() {
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:11]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    v_readlane_b32 s51, v0, 15
-; GFX900-NEXT:    v_readlane_b32 s50, v0, 14
-; GFX900-NEXT:    v_readlane_b32 s49, v0, 13
-; GFX900-NEXT:    v_readlane_b32 s48, v0, 12
-; GFX900-NEXT:    v_readlane_b32 s47, v0, 11
-; GFX900-NEXT:    v_readlane_b32 s46, v0, 10
-; GFX900-NEXT:    v_readlane_b32 s45, v0, 9
-; GFX900-NEXT:    v_readlane_b32 s44, v0, 8
-; GFX900-NEXT:    v_readlane_b32 s43, v0, 7
-; GFX900-NEXT:    v_readlane_b32 s42, v0, 6
-; GFX900-NEXT:    v_readlane_b32 s41, v0, 5
-; GFX900-NEXT:    v_readlane_b32 s40, v0, 4
+; GFX900-NEXT:    v_readlane_b32 s51, v0, 7
+; GFX900-NEXT:    v_readlane_b32 s50, v0, 6
+; GFX900-NEXT:    v_readlane_b32 s49, v0, 5
+; GFX900-NEXT:    v_readlane_b32 s48, v0, 4
 ; GFX900-NEXT:    v_readlane_b32 s39, v0, 3
 ; GFX900-NEXT:    v_readlane_b32 s38, v0, 2
 ; GFX900-NEXT:    v_readlane_b32 s37, v0, 1
@@ -15065,18 +15049,10 @@ define void @s_shuffle_v2i64_v8i64__15_6() {
 ; GFX90A-NEXT:    v_writelane_b32 v0, s37, 1
 ; GFX90A-NEXT:    v_writelane_b32 v0, s38, 2
 ; GFX90A-NEXT:    v_writelane_b32 v0, s39, 3
-; GFX90A-NEXT:    v_writelane_b32 v0, s40, 4
-; GFX90A-NEXT:    v_writelane_b32 v0, s41, 5
-; GFX90A-NEXT:    v_writelane_b32 v0, s42, 6
-; GFX90A-NEXT:    v_writelane_b32 v0, s43, 7
-; GFX90A-NEXT:    v_writelane_b32 v0, s44, 8
-; GFX90A-NEXT:    v_writelane_b32 v0, s45, 9
-; GFX90A-NEXT:    v_writelane_b32 v0, s46, 10
-; GFX90A-NEXT:    v_writelane_b32 v0, s47, 11
-; GFX90A-NEXT:    v_writelane_b32 v0, s48, 12
-; GFX90A-NEXT:    v_writelane_b32 v0, s49, 13
-; GFX90A-NEXT:    v_writelane_b32 v0, s50, 14
-; GFX90A-NEXT:    v_writelane_b32 v0, s51, 15
+; GFX90A-NEXT:    v_writelane_b32 v0, s48, 4
+; GFX90A-NEXT:    v_writelane_b32 v0, s49, 5
+; GFX90A-NEXT:    v_writelane_b32 v0, s50, 6
+; GFX90A-NEXT:    v_writelane_b32 v0, s51, 7
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[4:19]
 ; GFX90A-NEXT:    ;;#ASMEND
@@ -15090,18 +15066,10 @@ define void @s_shuffle_v2i64_v8i64__15_6() {
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:11]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    v_readlane_b32 s51, v0, 15
-; GFX90A-NEXT:    v_readlane_b32 s50, v0, 14
-; GFX90A-NEXT:    v_readlane_b32 s49, v0, 13
-; GFX90A-NEXT:    v_readlane_b32 s48, v0, 12
-; GFX90A-NEXT:    v_readlane_b32 s47, v0, 11
-; GFX90A-NEXT:    v_readlane_b32 s46, v0, 10
-; GFX90A-NEXT:    v_readlane_b32 s45, v0, 9
-; GFX90A-NEXT:    v_readlane_b32 s44, v0, 8
-; GFX90A-NEXT:    v_readlane_b32 s43, v0, 7
-; GFX90A-NEXT:    v_readlane_b32 s42, v0, 6
-; GFX90A-NEXT:    v_readlane_b32 s41, v0, 5
-; GFX90A-NEXT:    v_readlane_b32 s40, v0, 4
+; GFX90A-NEXT:    v_readlane_b32 s51, v0, 7
+; GFX90A-NEXT:    v_readlane_b32 s50, v0, 6
+; GFX90A-NEXT:    v_readlane_b32 s49, v0, 5
+; GFX90A-NEXT:    v_readlane_b32 s48, v0, 4
 ; GFX90A-NEXT:    v_readlane_b32 s39, v0, 3
 ; GFX90A-NEXT:    v_readlane_b32 s38, v0, 2
 ; GFX90A-NEXT:    v_readlane_b32 s37, v0, 1
@@ -15158,21 +15126,13 @@ define void @s_shuffle_v2i64_v8i64__15_7() {
 ; GFX900-NEXT:    v_writelane_b32 v0, s37, 1
 ; GFX900-NEXT:    v_writelane_b32 v0, s38, 2
 ; GFX900-NEXT:    v_writelane_b32 v0, s39, 3
-; GFX900-NEXT:    v_writelane_b32 v0, s40, 4
-; GFX900-NEXT:    v_writelane_b32 v0, s41, 5
-; GFX900-NEXT:    v_writelane_b32 v0, s42, 6
-; GFX900-NEXT:    v_writelane_b32 v0, s43, 7
-; GFX900-NEXT:    v_writelane_b32 v0, s44, 8
-; GFX900-NEXT:    v_writelane_b32 v0, s45, 9
-; GFX900-NEXT:    v_writelane_b32 v0, s46, 10
-; GFX900-NEXT:    v_writelane_b32 v0, s47, 11
-; GFX900-NEXT:    v_writelane_b32 v0, s48, 12
-; GFX900-NEXT:    v_writelane_b32 v0, s49, 13
-; GFX900-NEXT:    v_writelane_b32 v0, s50, 14
+; GFX900-NEXT:    v_writelane_b32 v0, s48, 4
+; GFX900-NEXT:    v_writelane_b32 v0, s49, 5
+; GFX900-NEXT:    v_writelane_b32 v0, s50, 6
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[4:19]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    v_writelane_b32 v0, s51, 15
+; GFX900-NEXT:    v_writelane_b32 v0, s51, 7
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[36:51]
 ; GFX900-NEXT:    ;;#ASMEND
@@ -15183,18 +15143,10 @@ define void @s_shuffle_v2i64_v8i64__15_7() {
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:11]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    v_readlane_b32 s51, v0, 15
-; GFX900-NEXT:    v_readlane_b32 s50, v0, 14
-; GFX900-NEXT:    v_readlane_b32 s49, v0, 13
-; GFX900-NEXT:    v_readlane_b32 s48, v0, 12
-; GFX900-NEXT:    v_readlane_b32 s47, v0, 11
-; GFX900-NEXT:    v_readlane_b32 s46, v0, 10
-; GFX900-NEXT:    v_readlane_b32 s45, v0, 9
-; GFX900-NEXT:    v_readlane_b32 s44, v0, 8
-; GFX900-NEXT:    v_readlane_b32 s43, v0, 7
-; GFX900-NEXT:    v_readlane_b32 s42, v0, 6
-; GFX900-NEXT:    v_readlane_b32 s41, v0, 5
-; GFX900-NEXT:    v_readlane_b32 s40, v0, 4
+; GFX900-NEXT:    v_readlane_b32 s51, v0, 7
+; GFX900-NEXT:    v_readlane_b32 s50, v0, 6
+; GFX900-NEXT:    v_readlane_b32 s49, v0, 5
+; GFX900-NEXT:    v_readlane_b32 s48, v0, 4
 ; GFX900-NEXT:    v_readlane_b32 s39, v0, 3
 ; GFX900-NEXT:    v_readlane_b32 s38, v0, 2
 ; GFX900-NEXT:    v_readlane_b32 s37, v0, 1
@@ -15215,21 +15167,13 @@ define void @s_shuffle_v2i64_v8i64__15_7() {
 ; GFX90A-NEXT:    v_writelane_b32 v0, s37, 1
 ; GFX90A-NEXT:    v_writelane_b32 v0, s38, 2
 ; GFX90A-NEXT:    v_writelane_b32 v0, s39, 3
-; GFX90A-NEXT:    v_writelane_b32 v0, s40, 4
-; GFX90A-NEXT:    v_writelane_b32 v0, s41, 5
-; GFX90A-NEXT:    v_writelane_b32 v0, s42, 6
-; GFX90A-NEXT:    v_writelane_b32 v0, s43, 7
-; GFX90A-NEXT:    v_writelane_b32 v0, s44, 8
-; GFX90A-NEXT:    v_writelane_b32 v0, s45, 9
-; GFX90A-NEXT:    v_writelane_b32 v0, s46, 10
-; GFX90A-NEXT:    v_writelane_b32 v0, s47, 11
-; GFX90A-NEXT:    v_writelane_b32 v0, s48, 12
-; GFX90A-NEXT:    v_writelane_b32 v0, s49, 13
-; GFX90A-NEXT:    v_writelane_b32 v0, s50, 14
+; GFX90A-NEXT:    v_writelane_b32 v0, s48, 4
+; GFX90A-NEXT:    v_writelane_b32 v0, s49, 5
+; GFX90A-NEXT:    v_writelane_b32 v0, s50, 6
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[4:19]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    v_writelane_b32 v0, s51, 15
+; GFX90A-NEXT:    v_writelane_b32 v0, s51, 7
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[36:51]
 ; GFX90A-NEXT:    ;;#ASMEND
@@ -15240,18 +15184,10 @@ define void @s_shuffle_v2i64_v8i64__15_7() {
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:11]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    v_readlane_b32 s51, v0, 15
-; GFX90A-NEXT:    v_readlane_b32 s50, v0, 14
-; GFX90A-NEXT:    v_readlane_b32 s49, v0, 13
-; GFX90A-NEXT:    v_readlane_b32 s48, v0, 12
-; GFX90A-NEXT:    v_readlane_b32 s47, v0, 11
-; GFX90A-NEXT:    v_readlane_b32 s46, v0, 10
-; GFX90A-NEXT:    v_readlane_b32 s45, v0, 9
-; GFX90A-NEXT:    v_readlane_b32 s44, v0, 8
-; GFX90A-NEXT:    v_readlane_b32 s43, v0, 7
-; GFX90A-NEXT:    v_readlane_b32 s42, v0, 6
-; GFX90A-NEXT:    v_readlane_b32 s41, v0, 5
-; GFX90A-NEXT:    v_readlane_b32 s40, v0, 4
+; GFX90A-NEXT:    v_readlane_b32 s51, v0, 7
+; GFX90A-NEXT:    v_readlane_b32 s50, v0, 6
+; GFX90A-NEXT:    v_readlane_b32 s49, v0, 5
+; GFX90A-NEXT:    v_readlane_b32 s48, v0, 4
 ; GFX90A-NEXT:    v_readlane_b32 s39, v0, 3
 ; GFX90A-NEXT:    v_readlane_b32 s38, v0, 2
 ; GFX90A-NEXT:    v_readlane_b32 s37, v0, 1
@@ -16122,18 +16058,10 @@ define void @s_shuffle_v2i64_v8i64__10_0() {
 ; GFX900-NEXT:    v_writelane_b32 v0, s37, 1
 ; GFX900-NEXT:    v_writelane_b32 v0, s38, 2
 ; GFX900-NEXT:    v_writelane_b32 v0, s39, 3
-; GFX900-NEXT:    v_writelane_b32 v0, s40, 4
-; GFX900-NEXT:    v_writelane_b32 v0, s41, 5
-; GFX900-NEXT:    v_writelane_b32 v0, s42, 6
-; GFX900-NEXT:    v_writelane_b32 v0, s43, 7
-; GFX900-NEXT:    v_writelane_b32 v0, s44, 8
-; GFX900-NEXT:    v_writelane_b32 v0, s45, 9
-; GFX900-NEXT:    v_writelane_b32 v0, s46, 10
-; GFX900-NEXT:    v_writelane_b32 v0, s47, 11
-; GFX900-NEXT:    v_writelane_b32 v0, s48, 12
-; GFX900-NEXT:    v_writelane_b32 v0, s49, 13
-; GFX900-NEXT:    v_writelane_b32 v0, s50, 14
-; GFX900-NEXT:    v_writelane_b32 v0, s51, 15
+; GFX900-NEXT:    v_writelane_b32 v0, s48, 4
+; GFX900-NEXT:    v_writelane_b32 v0, s49, 5
+; GFX900-NEXT:    v_writelane_b32 v0, s50, 6
+; GFX900-NEXT:    v_writelane_b32 v0, s51, 7
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[36:51]
 ; GFX900-NEXT:    ;;#ASMEND
@@ -16145,18 +16073,10 @@ define void @s_shuffle_v2i64_v8i64__10_0() {
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:11]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    v_readlane_b32 s51, v0, 15
-; GFX900-NEXT:    v_readlane_b32 s50, v0, 14
-; GFX900-NEXT:    v_readlane_b32 s49, v0, 13
-; GFX900-NEXT:    v_readlane_b32 s48, v0, 12
-; GFX900-NEXT:    v_readlane_b32 s47, v0, 11
-; GFX900-NEXT:    v_readlane_b32 s46, v0, 10
-; GFX900-NEXT:    v_readlane_b32 s45, v0, 9
-; GFX900-NEXT:    v_readlane_b32 s44, v0, 8
-; GFX900-NEXT:    v_readlane_b32 s43, v0, 7
-; GFX900-NEXT:    v_readlane_b32 s42, v0, 6
-; GFX900-NEXT:    v_readlane_b32 s41, v0, 5
-; GFX900-NEXT:    v_readlane_b32 s40, v0, 4
+; GFX900-NEXT:    v_readlane_b32 s51, v0, 7
+; GFX900-NEXT:    v_readlane_b32 s50, v0, 6
+; GFX900-NEXT:    v_readlane_b32 s49, v0, 5
+; GFX900-NEXT:    v_readlane_b32 s48, v0, 4
 ; GFX900-NEXT:    v_readlane_b32 s39, v0, 3
 ; GFX900-NEXT:    v_readlane_b32 s38, v0, 2
 ; GFX900-NEXT:    v_readlane_b32 s37, v0, 1
@@ -16177,18 +16097,10 @@ define void @s_shuffle_v2i64_v8i64__10_0() {
 ; GFX90A-NEXT:    v_writelane_b32 v0, s37, 1
 ; GFX90A-NEXT:    v_writelane_b32 v0, s38, 2
 ; GFX90A-NEXT:    v_writelane_b32 v0, s39, 3
-; GFX90A-NEXT:    v_writelane_b32 v0, s40, 4
-; GFX90A-NEXT:    v_writelane_b32 v0, s41, 5
-; GFX90A-NEXT:    v_writelane_b32 v0, s42, 6
-; GFX90A-NEXT:    v_writelane_b32 v0, s43, 7
-; GFX90A-NEXT:    v_writelane_b32 v0, s44, 8
-; GFX90A-NEXT:    v_writelane_b32 v0, s45, 9
-; GFX90A-NEXT:    v_writelane_b32 v0, s46, 10
-; GFX90A-NEXT:    v_writelane_b32 v0, s47, 11
-; GFX90A-NEXT:    v_writelane_b32 v0, s48, 12
-; GFX90A-NEXT:    v_writelane_b32 v0, s49, 13
-; GFX90A-NEXT:    v_writelane_b32 v0, s50, 14
-; GFX90A-NEXT:    v_writelane_b32 v0, s51, 15
+; GFX90A-NEXT:    v_writelane_b32 v0, s48, 4
+; GFX90A-NEXT:    v_writelane_b32 v0, s49, 5
+; GFX90A-NEXT:    v_writelane_b32 v0, s50, 6
+; GFX90A-NEXT:    v_writelane_b32 v0, s51, 7
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[36:51]
 ; GFX90A-NEXT:    ;;#ASMEND
@@ -16200,18 +16112,10 @@ define void @s_shuffle_v2i64_v8i64__10_0() {
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:11]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    v_readlane_b32 s51, v0, 15
-; GFX90A-NEXT:    v_readlane_b32 s50, v0, 14
-; GFX90A-NEXT:    v_readlane_b32 s49, v0, 13
-; GFX90A-NEXT:    v_readlane_b32 s48, v0, 12
-; GFX90A-NEXT:    v_readlane_b32 s47, v0, 11
-; GFX90A-NEXT:    v_readlane_b32 s46, v0, 10
-; GFX90A-NEXT:    v_readlane_b32 s45, v0, 9
-; GFX90A-NEXT:    v_readlane_b32 s44, v0, 8
-; GFX90A-NEXT:    v_readlane_b32 s43, v0, 7
-; GFX90A-NEXT:    v_readlane_b32 s42, v0, 6
-; GFX90A-NEXT:    v_readlane_b32 s41, v0, 5
-; GFX90A-NEXT:    v_readlane_b32 s40, v0, 4
+; GFX90A-NEXT:    v_readlane_b32 s51, v0, 7
+; GFX90A-NEXT:    v_readlane_b32 s50, v0, 6
+; GFX90A-NEXT:    v_readlane_b32 s49, v0, 5
+; GFX90A-NEXT:    v_readlane_b32 s48, v0, 4
 ; GFX90A-NEXT:    v_readlane_b32 s39, v0, 3
 ; GFX90A-NEXT:    v_readlane_b32 s38, v0, 2
 ; GFX90A-NEXT:    v_readlane_b32 s37, v0, 1
@@ -16883,18 +16787,10 @@ define void @s_shuffle_v2i64_v8i64__10_1() {
 ; GFX900-NEXT:    v_writelane_b32 v0, s37, 1
 ; GFX900-NEXT:    v_writelane_b32 v0, s38, 2
 ; GFX900-NEXT:    v_writelane_b32 v0, s39, 3
-; GFX900-NEXT:    v_writelane_b32 v0, s40, 4
-; GFX900-NEXT:    v_writelane_b32 v0, s41, 5
-; GFX900-NEXT:    v_writelane_b32 v0, s42, 6
-; GFX900-NEXT:    v_writelane_b32 v0, s43, 7
-; GFX900-NEXT:    v_writelane_b32 v0, s44, 8
-; GFX900-NEXT:    v_writelane_b32 v0, s45, 9
-; GFX900-NEXT:    v_writelane_b32 v0, s46, 10
-; GFX900-NEXT:    v_writelane_b32 v0, s47, 11
-; GFX900-NEXT:    v_writelane_b32 v0, s48, 12
-; GFX900-NEXT:    v_writelane_b32 v0, s49, 13
-; GFX900-NEXT:    v_writelane_b32 v0, s50, 14
-; GFX900-NEXT:    v_writelane_b32 v0, s51, 15
+; GFX900-NEXT:    v_writelane_b32 v0, s48, 4
+; GFX900-NEXT:    v_writelane_b32 v0, s49, 5
+; GFX900-NEXT:    v_writelane_b32 v0, s50, 6
+; GFX900-NEXT:    v_writelane_b32 v0, s51, 7
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[36:51]
 ; GFX900-NEXT:    ;;#ASMEND
@@ -16906,18 +16802,10 @@ define void @s_shuffle_v2i64_v8i64__10_1() {
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:11]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    v_readlane_b32 s51, v0, 15
-; GFX900-NEXT:    v_readlane_b32 s50, v0, 14
-; GFX900-NEXT:    v_readlane_b32 s49, v0, 13
-; GFX900-NEXT:    v_readlane_b32 s48, v0, 12
-; GFX900-NEXT:    v_readlane_b32 s47, v0, 11
-; GFX900-NEXT:    v_readlane_b32 s46, v0, 10
-; GFX900-NEXT:    v_readlane_b32 s45, v0, 9
-; GFX900-NEXT:    v_readlane_b32 s44, v0, 8
-; GFX900-NEXT:    v_readlane_b32 s43, v0, 7
-; GFX900-NEXT:    v_readlane_b32 s42, v0, 6
-; GFX900-NEXT:    v_readlane_b32 s41, v0, 5
-; GFX900-NEXT:    v_readlane_b32 s40, v0, 4
+; GFX900-NEXT:    v_readlane_b32 s51, v0, 7
+; GFX900-NEXT:    v_readlane_b32 s50, v0, 6
+; GFX900-NEXT:    v_readlane_b32 s49, v0, 5
+; GFX900-NEXT:    v_readlane_b32 s48, v0, 4
 ; GFX900-NEXT:    v_readlane_b32 s39, v0, 3
 ; GFX900-NEXT:    v_readlane_b32 s38, v0, 2
 ; GFX900-NEXT:    v_readlane_b32 s37, v0, 1
@@ -16938,18 +16826,10 @@ define void @s_shuffle_v2i64_v8i64__10_1() {
 ; GFX90A-NEXT:    v_writelane_b32 v0, s37, 1
 ; GFX90A-NEXT:    v_writelane_b32 v0, s38, 2
 ; GFX90A-NEXT:    v_writelane_b32 v0, s39, 3
-; GFX90A-NEXT:    v_writelane_b32 v0, s40, 4
-; GFX90A-NEXT:    v_writelane_b32 v0, s41, 5
-; GFX90A-NEXT:    v_writelane_b32 v0, s42, 6
-; GFX90A-NEXT:    v_writelane_b32 v0, s43, 7
-; GFX90A-NEXT:    v_writelane_b32 v0, s44, 8
-; GFX90A-NEXT:    v_writelane_b32 v0, s45, 9
-; GFX90A-NEXT:    v_writelane_b32 v0, s46, 10
-; GFX90A-NEXT:    v_writelane_b32 v0, s47, 11
-; GFX90A-NEXT:    v_writelane_b32 v0, s48, 12
-; GFX90A-NEXT:    v_writelane_b32 v0, s49, 13
-; GFX90A-NEXT:    v_writelane_b32 v0, s50, 14
-; GFX90A-NEXT:    v_writelane_b32 v0, s51, 15
+; GFX90A-NEXT:    v_writelane_b32 v0, s48, 4
+; GFX90A-NEXT:    v_writelane_b32 v0, s49, 5
+; GFX90A-NEXT:    v_writelane_b32 v0, s50, 6
+; GFX90A-NEXT:    v_writelane_b32 v0, s51, 7
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[36:51]
 ; GFX90A-NEXT:    ;;#ASMEND
@@ -16961,18 +16841,10 @@ define void @s_shuffle_v2i64_v8i64__10_1() {
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:11]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    v_readlane_b32 s51, v0, 15
-; GFX90A-NEXT:    v_readlane_b32 s50, v0, 14
-; GFX90A-NEXT:    v_readlane_b32 s49, v0, 13
-; GFX90A-NEXT:    v_readlane_b32 s48, v0, 12
-; GFX90A-NEXT:    v_readlane_b32 s47, v0, 11
-; GFX90A-NEXT:    v_readlane_b32 s46, v0, 10
-; GFX90A-NEXT:    v_readlane_b32 s45, v0, 9
-; GFX90A-NEXT:    v_readlane_b32 s44, v0, 8
-; GFX90A-NEXT:    v_readlane_b32 s43, v0, 7
-; GFX90A-NEXT:    v_readlane_b32 s42, v0, 6
-; GFX90A-NEXT:    v_readlane_b32 s41, v0, 5
-; GFX90A-NEXT:    v_readlane_b32 s40, v0, 4
+; GFX90A-NEXT:    v_readlane_b32 s51, v0, 7
+; GFX90A-NEXT:    v_readlane_b32 s50, v0, 6
+; GFX90A-NEXT:    v_readlane_b32 s49, v0, 5
+; GFX90A-NEXT:    v_readlane_b32 s48, v0, 4
 ; GFX90A-NEXT:    v_readlane_b32 s39, v0, 3
 ; GFX90A-NEXT:    v_readlane_b32 s38, v0, 2
 ; GFX90A-NEXT:    v_readlane_b32 s37, v0, 1
@@ -20983,18 +20855,10 @@ define void @s_shuffle_v2i64_v8i64__9_6() {
 ; GFX900-NEXT:    v_writelane_b32 v0, s37, 1
 ; GFX900-NEXT:    v_writelane_b32 v0, s38, 2
 ; GFX900-NEXT:    v_writelane_b32 v0, s39, 3
-; GFX900-NEXT:    v_writelane_b32 v0, s40, 4
-; GFX900-NEXT:    v_writelane_b32 v0, s41, 5
-; GFX900-NEXT:    v_writelane_b32 v0, s42, 6
-; GFX900-NEXT:    v_writelane_b32 v0, s43, 7
-; GFX900-NEXT:    v_writelane_b32 v0, s44, 8
-; GFX900-NEXT:    v_writelane_b32 v0, s45, 9
-; GFX900-NEXT:    v_writelane_b32 v0, s46, 10
-; GFX900-NEXT:    v_writelane_b32 v0, s47, 11
-; GFX900-NEXT:    v_writelane_b32 v0, s48, 12
-; GFX900-NEXT:    v_writelane_b32 v0, s49, 13
-; GFX900-NEXT:    v_writelane_b32 v0, s50, 14
-; GFX900-NEXT:    v_writelane_b32 v0, s51, 15
+; GFX900-NEXT:    v_writelane_b32 v0, s48, 4
+; GFX900-NEXT:    v_writelane_b32 v0, s49, 5
+; GFX900-NEXT:    v_writelane_b32 v0, s50, 6
+; GFX900-NEXT:    v_writelane_b32 v0, s51, 7
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[4:19]
 ; GFX900-NEXT:    ;;#ASMEND
@@ -21008,18 +20872,10 @@ define void @s_shuffle_v2i64_v8i64__9_6() {
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:11]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    v_readlane_b32 s51, v0, 15
-; GFX900-NEXT:    v_readlane_b32 s50, v0, 14
-; GFX900-NEXT:    v_readlane_b32 s49, v0, 13
-; GFX900-NEXT:    v_readlane_b32 s48, v0, 12
-; GFX900-NEXT:    v_readlane_b32 s47, v0, 11
-; GFX900-NEXT:    v_readlane_b32 s46, v0, 10
-; GFX900-NEXT:    v_readlane_b32 s45, v0, 9
-; GFX900-NEXT:    v_readlane_b32 s44, v0, 8
-; GFX900-NEXT:    v_readlane_b32 s43, v0, 7
-; GFX900-NEXT:    v_readlane_b32 s42, v0, 6
-; GFX900-NEXT:    v_readlane_b32 s41, v0, 5
-; GFX900-NEXT:    v_readlane_b32 s40, v0, 4
+; GFX900-NEXT:    v_readlane_b32 s51, v0, 7
+; GFX900-NEXT:    v_readlane_b32 s50, v0, 6
+; GFX900-NEXT:    v_readlane_b32 s49, v0, 5
+; GFX900-NEXT:    v_readlane_b32 s48, v0, 4
 ; GFX900-NEXT:    v_readlane_b32 s39, v0, 3
 ; GFX900-NEXT:    v_readlane_b32 s38, v0, 2
 ; GFX900-NEXT:    v_readlane_b32 s37, v0, 1
@@ -21040,18 +20896,10 @@ define void @s_shuffle_v2i64_v8i64__9_6() {
 ; GFX90A-NEXT:    v_writelane_b32 v0, s37, 1
 ; GFX90A-NEXT:    v_writelane_b32 v0, s38, 2
 ; GFX90A-NEXT:    v_writelane_b32 v0, s39, 3
-; GFX90A-NEXT:    v_writelane_b32 v0, s40, 4
-; GFX90A-NEXT:    v_writelane_b32 v0, s41, 5
-; GFX90A-NEXT:    v_writelane_b32 v0, s42, 6
-; GFX90A-NEXT:    v_writelane_b32 v0, s43, 7
-; GFX90A-NEXT:    v_writelane_b32 v0, s44, 8
-; GFX90A-NEXT:    v_writelane_b32 v0, s45, 9
-; GFX90A-NEXT:    v_writelane_b32 v0, s46, 10
-; GFX90A-NEXT:    v_writelane_b32 v0, s47, 11
-; GFX90A-NEXT:    v_writelane_b32 v0, s48, 12
-; GFX90A-NEXT:    v_writelane_b32 v0, s49, 13
-; GFX90A-NEXT:    v_writelane_b32 v0, s50, 14
-; GFX90A-NEXT:    v_writelane_b32 v0, s51, 15
+; GFX90A-NEXT:    v_writelane_b32 v0, s48, 4
+; GFX90A-NEXT:    v_writelane_b32 v0, s49, 5
+; GFX90A-NEXT:    v_writelane_b32 v0, s50, 6
+; GFX90A-NEXT:    v_writelane_b32 v0, s51, 7
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[4:19]
 ; GFX90A-NEXT:    ;;#ASMEND
@@ -21065,18 +20913,10 @@ define void @s_shuffle_v2i64_v8i64__9_6() {
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:11]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    v_readlane_b32 s51, v0, 15
-; GFX90A-NEXT:    v_readlane_b32 s50, v0, 14
-; GFX90A-NEXT:    v_readlane_b32 s49, v0, 13
-; GFX90A-NEXT:    v_readlane_b32 s48, v0, 12
-; GFX90A-NEXT:    v_readlane_b32 s47, v0, 11
-; GFX90A-NEXT:    v_readlane_b32 s46, v0, 10
-; GFX90A-NEXT:    v_readlane_b32 s45, v0, 9
-; GFX90A-NEXT:    v_readlane_b32 s44, v0, 8
-; GFX90A-NEXT:    v_readlane_b32 s43, v0, 7
-; GFX90A-NEXT:    v_readlane_b32 s42, v0, 6
-; GFX90A-NEXT:    v_readlane_b32 s41, v0, 5
-; GFX90A-NEXT:    v_readlane_b32 s40, v0, 4
+; GFX90A-NEXT:    v_readlane_b32 s51, v0, 7
+; GFX90A-NEXT:    v_readlane_b32 s50, v0, 6
+; GFX90A-NEXT:    v_readlane_b32 s49, v0, 5
+; GFX90A-NEXT:    v_readlane_b32 s48, v0, 4
 ; GFX90A-NEXT:    v_readlane_b32 s39, v0, 3
 ; GFX90A-NEXT:    v_readlane_b32 s38, v0, 2
 ; GFX90A-NEXT:    v_readlane_b32 s37, v0, 1
@@ -21189,18 +21029,10 @@ define void @s_shuffle_v2i64_v8i64__11_6() {
 ; GFX900-NEXT:    v_writelane_b32 v0, s37, 1
 ; GFX900-NEXT:    v_writelane_b32 v0, s38, 2
 ; GFX900-NEXT:    v_writelane_b32 v0, s39, 3
-; GFX900-NEXT:    v_writelane_b32 v0, s40, 4
-; GFX900-NEXT:    v_writelane_b32 v0, s41, 5
-; GFX900-NEXT:    v_writelane_b32 v0, s42, 6
-; GFX900-NEXT:    v_writelane_b32 v0, s43, 7
-; GFX900-NEXT:    v_writelane_b32 v0, s44, 8
-; GFX900-NEXT:    v_writelane_b32 v0, s45, 9
-; GFX900-NEXT:    v_writelane_b32 v0, s46, 10
-; GFX900-NEXT:    v_writelane_b32 v0, s47, 11
-; GFX900-NEXT:    v_writelane_b32 v0, s48, 12
-; GFX900-NEXT:    v_writelane_b32 v0, s49, 13
-; GFX900-NEXT:    v_writelane_b32 v0, s50, 14
-; GFX900-NEXT:    v_writelane_b32 v0, s51, 15
+; GFX900-NEXT:    v_writelane_b32 v0, s48, 4
+; GFX900-NEXT:    v_writelane_b32 v0, s49, 5
+; GFX900-NEXT:    v_writelane_b32 v0, s50, 6
+; GFX900-NEXT:    v_writelane_b32 v0, s51, 7
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[4:19]
 ; GFX900-NEXT:    ;;#ASMEND
@@ -21214,18 +21046,10 @@ define void @s_shuffle_v2i64_v8i64__11_6() {
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:11]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    v_readlane_b32 s51, v0, 15
-; GFX900-NEXT:    v_readlane_b32 s50, v0, 14
-; GFX900-NEXT:    v_readlane_b32 s49, v0, 13
-; GFX900-NEXT:    v_readlane_b32 s48, v0, 12
-; GFX900-NEXT:    v_readlane_b32 s47, v0, 11
-; GFX900-NEXT:    v_readlane_b32 s46, v0, 10
-; GFX900-NEXT:    v_readlane_b32 s45, v0, 9
-; GFX900-NEXT:    v_readlane_b32 s44, v0, 8
-; GFX900-NEXT:    v_readlane_b32 s43, v0, 7
-; GFX900-NEXT:    v_readlane_b32 s42, v0, 6
-; GFX900-NEXT:    v_readlane_b32 s41, v0, 5
-; GFX900-NEXT:    v_readlane_b32 s40, v0, 4
+; GFX900-NEXT:    v_readlane_b32 s51, v0, 7
+; GFX900-NEXT:    v_readlane_b32 s50, v0, 6
+; GFX900-NEXT:    v_readlane_b32 s49, v0, 5
+; GFX900-NEXT:    v_readlane_b32 s48, v0, 4
 ; GFX900-NEXT:    v_readlane_b32 s39, v0, 3
 ; GFX900-NEXT:    v_readlane_b32 s38, v0, 2
 ; GFX900-NEXT:    v_readlane_b32 s37, v0, 1
@@ -21246,18 +21070,10 @@ define void @s_shuffle_v2i64_v8i64__11_6() {
 ; GFX90A-NEXT:    v_writelane_b32 v0, s37, 1
 ; GFX90A-NEXT:    v_writelane_b32 v0, s38, 2
 ; GFX90A-NEXT:    v_writelane_b32 v0, s39, 3
-; GFX90A-NEXT:    v_writelane_b32 v0, s40, 4
-; GFX90A-NEXT:    v_writelane_b32 v0, s41, 5
-; GFX90A-NEXT:    v_writelane_b32 v0, s42, 6
-; GFX90A-NEXT:    v_writelane_b32 v0, s43, 7
-; GFX90A-NEXT:    v_writelane_b32 v0, s44, 8
-; GFX90A-NEXT:    v_writelane_b32 v0, s45, 9
-; GFX90A-NEXT:    v_writelane_b32 v0, s46, 10
-; GFX90A-NEXT:    v_writelane_b32 v0, s47, 11
-; GFX90A-NEXT:    v_writelane_b32 v0, s48, 12
-; GFX90A-NEXT:    v_writelane_b32 v0, s49, 13
-; GFX90A-NEXT:    v_writelane_b32 v0, s50, 14
-; GFX90A-NEXT:    v_writelane_b32 v0, s51, 15
+; GFX90A-NEXT:    v_writelane_b32 v0, s48, 4
+; GFX90A-NEXT:    v_writelane_b32 v0, s49, 5
+; GFX90A-NEXT:    v_writelane_b32 v0, s50, 6
+; GFX90A-NEXT:    v_writelane_b32 v0, s51, 7
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[4:19]
 ; GFX90A-NEXT:    ;;#ASMEND
@@ -21271,18 +21087,10 @@ define void @s_shuffle_v2i64_v8i64__11_6() {
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:11]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    v_readlane_b32 s51, v0, 15
-; GFX90A-NEXT:    v_readlane_b32 s50, v0, 14
-; GFX90A-NEXT:    v_readlane_b32 s49, v0, 13
-; GFX90A-NEXT:    v_readlane_b32 s48, v0, 12
-; GFX90A-NEXT:    v_readlane_b32 s47, v0, 11
-; GFX90A-NEXT:    v_readlane_b32 s46, v0, 10
-; GFX90A-NEXT:    v_readlane_b32 s45, v0, 9
-; GFX90A-NEXT:    v_readlane_b32 s44, v0, 8
-; GFX90A-NEXT:    v_readlane_b32 s43, v0, 7
-; GFX90A-NEXT:    v_readlane_b32 s42, v0, 6
-; GFX90A-NEXT:    v_readlane_b32 s41, v0, 5
-; GFX90A-NEXT:    v_readlane_b32 s40, v0, 4
+; GFX90A-NEXT:    v_readlane_b32 s51, v0, 7
+; GFX90A-NEXT:    v_readlane_b32 s50, v0, 6
+; GFX90A-NEXT:    v_readlane_b32 s49, v0, 5
+; GFX90A-NEXT:    v_readlane_b32 s48, v0, 4
 ; GFX90A-NEXT:    v_readlane_b32 s39, v0, 3
 ; GFX90A-NEXT:    v_readlane_b32 s38, v0, 2
 ; GFX90A-NEXT:    v_readlane_b32 s37, v0, 1
@@ -21339,18 +21147,10 @@ define void @s_shuffle_v2i64_v8i64__12_6() {
 ; GFX900-NEXT:    v_writelane_b32 v0, s37, 1
 ; GFX900-NEXT:    v_writelane_b32 v0, s38, 2
 ; GFX900-NEXT:    v_writelane_b32 v0, s39, 3
-; GFX900-NEXT:    v_writelane_b32 v0, s40, 4
-; GFX900-NEXT:    v_writelane_b32 v0, s41, 5
-; GFX900-NEXT:    v_writelane_b32 v0, s42, 6
-; GFX900-NEXT:    v_writelane_b32 v0, s43, 7
-; GFX900-NEXT:    v_writelane_b32 v0, s44, 8
-; GFX900-NEXT:    v_writelane_b32 v0, s45, 9
-; GFX900-NEXT:    v_writelane_b32 v0, s46, 10
-; GFX900-NEXT:    v_writelane_b32 v0, s47, 11
-; GFX900-NEXT:    v_writelane_b32 v0, s48, 12
-; GFX900-NEXT:    v_writelane_b32 v0, s49, 13
-; GFX900-NEXT:    v_writelane_b32 v0, s50, 14
-; GFX900-NEXT:    v_writelane_b32 v0, s51, 15
+; GFX900-NEXT:    v_writelane_b32 v0, s48, 4
+; GFX900-NEXT:    v_writelane_b32 v0, s49, 5
+; GFX900-NEXT:    v_writelane_b32 v0, s50, 6
+; GFX900-NEXT:    v_writelane_b32 v0, s51, 7
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[4:19]
 ; GFX900-NEXT:    ;;#ASMEND
@@ -21364,18 +21164,10 @@ define void @s_shuffle_v2i64_v8i64__12_6() {
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:11]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    v_readlane_b32 s51, v0, 15
-; GFX900-NEXT:    v_readlane_b32 s50, v0, 14
-; GFX900-NEXT:    v_readlane_b32 s49, v0, 13
-; GFX900-NEXT:    v_readlane_b32 s48, v0, 12
-; GFX900-NEXT:    v_readlane_b32 s47, v0, 11
-; GFX900-NEXT:    v_readlane_b32 s46, v0, 10
-; GFX900-NEXT:    v_readlane_b32 s45, v0, 9
-; GFX900-NEXT:    v_readlane_b32 s44, v0, 8
-; GFX900-NEXT:    v_readlane_b32 s43, v0, 7
-; GFX900-NEXT:    v_readlane_b32 s42, v0, 6
-; GFX900-NEXT:    v_readlane_b32 s41, v0, 5
-; GFX900-NEXT:    v_readlane_b32 s40, v0, 4
+; GFX900-NEXT:    v_readlane_b32 s51, v0, 7
+; GFX900-NEXT:    v_readlane_b32 s50, v0, 6
+; GFX900-NEXT:    v_readlane_b32 s49, v0, 5
+; GFX900-NEXT:    v_readlane_b32 s48, v0, 4
 ; GFX900-NEXT:    v_readlane_b32 s39, v0, 3
 ; GFX900-NEXT:    v_readlane_b32 s38, v0, 2
 ; GFX900-NEXT:    v_readlane_b32 s37, v0, 1
@@ -21396,18 +21188,10 @@ define void @s_shuffle_v2i64_v8i64__12_6() {
 ; GFX90A-NEXT:    v_writelane_b32 v0, s37, 1
 ; GFX90A-NEXT:    v_writelane_b32 v0, s38, 2
 ; GFX90A-NEXT:    v_writelane_b32 v0, s39, 3
-; GFX90A-NEXT:    v_writelane_b32 v0, s40, 4
-; GFX90A-NEXT:    v_writelane_b32 v0, s41, 5
-; GFX90A-NEXT:    v_writelane_b32 v0, s42, 6
-; GFX90A-NEXT:    v_writelane_b32 v0, s43, 7
-; GFX90A-NEXT:    v_writelane_b32 v0, s44, 8
-; GFX90A-NEXT:    v_writelane_b32 v0, s45, 9
-; GFX90A-NEXT:    v_writelane_b32 v0, s46, 10
-; GFX90A-NEXT:    v_writelane_b32 v0, s47, 11
-; GFX90A-NEXT:    v_writelane_b32 v0, s48, 12
-; GFX90A-NEXT:    v_writelane_b32 v0, s49, 13
-; GFX90A-NEXT:    v_writelane_b32 v0, s50, 14
-; GFX90A-NEXT:    v_writelane_b32 v0, s51, 15
+; GFX90A-NEXT:    v_writelane_b32 v0, s48, 4
+; GFX90A-NEXT:    v_writelane_b32 v0, s49, 5
+; GFX90A-NEXT:    v_writelane_b32 v0, s50, 6
+; GFX90A-NEXT:    v_writelane_b32 v0, s51, 7
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[4:19]
 ; GFX90A-NEXT:    ;;#ASMEND
@@ -21421,18 +21205,10 @@ define void @s_shuffle_v2i64_v8i64__12_6() {
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:11]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    v_readlane_b32 s51, v0, 15
-; GFX90A-NEXT:    v_readlane_b32 s50, v0, 14
-; GFX90A-NEXT:    v_readlane_b32 s49, v0, 13
-; GFX90A-NEXT:    v_readlane_b32 s48, v0, 12
-; GFX90A-NEXT:    v_readlane_b32 s47, v0, 11
-; GFX90A-NEXT:    v_readlane_b32 s46, v0, 10
-; GFX90A-NEXT:    v_readlane_b32 s45, v0, 9
-; GFX90A-NEXT:    v_readlane_b32 s44, v0, 8
-; GFX90A-NEXT:    v_readlane_b32 s43, v0, 7
-; GFX90A-NEXT:    v_readlane_b32 s42, v0, 6
-; GFX90A-NEXT:    v_readlane_b32 s41, v0, 5
-; GFX90A-NEXT:    v_readlane_b32 s40, v0, 4
+; GFX90A-NEXT:    v_readlane_b32 s51, v0, 7
+; GFX90A-NEXT:    v_readlane_b32 s50, v0, 6
+; GFX90A-NEXT:    v_readlane_b32 s49, v0, 5
+; GFX90A-NEXT:    v_readlane_b32 s48, v0, 4
 ; GFX90A-NEXT:    v_readlane_b32 s39, v0, 3
 ; GFX90A-NEXT:    v_readlane_b32 s38, v0, 2
 ; GFX90A-NEXT:    v_readlane_b32 s37, v0, 1
@@ -21477,18 +21253,10 @@ define void @s_shuffle_v2i64_v8i64__13_6() {
 ; GFX900-NEXT:    v_writelane_b32 v0, s37, 1
 ; GFX900-NEXT:    v_writelane_b32 v0, s38, 2
 ; GFX900-NEXT:    v_writelane_b32 v0, s39, 3
-; GFX900-NEXT:    v_writelane_b32 v0, s40, 4
-; GFX900-NEXT:    v_writelane_b32 v0, s41, 5
-; GFX900-NEXT:    v_writelane_b32 v0, s42, 6
-; GFX900-NEXT:    v_writelane_b32 v0, s43, 7
-; GFX900-NEXT:    v_writelane_b32 v0, s44, 8
-; GFX900-NEXT:    v_writelane_b32 v0, s45, 9
-; GFX900-NEXT:    v_writelane_b32 v0, s46, 10
-; GFX900-NEXT:    v_writelane_b32 v0, s47, 11
-; GFX900-NEXT:    v_writelane_b32 v0, s48, 12
-; GFX900-NEXT:    v_writelane_b32 v0, s49, 13
-; GFX900-NEXT:    v_writelane_b32 v0, s50, 14
-; GFX900-NEXT:    v_writelane_b32 v0, s51, 15
+; GFX900-NEXT:    v_writelane_b32 v0, s48, 4
+; GFX900-NEXT:    v_writelane_b32 v0, s49, 5
+; GFX900-NEXT:    v_writelane_b32 v0, s50, 6
+; GFX900-NEXT:    v_writelane_b32 v0, s51, 7
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[4:19]
 ; GFX900-NEXT:    ;;#ASMEND
@@ -21502,18 +21270,10 @@ define void @s_shuffle_v2i64_v8i64__13_6() {
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:11]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    v_readlane_b32 s51, v0, 15
-; GFX900-NEXT:    v_readlane_b32 s50, v0, 14
-; GFX900-NEXT:    v_readlane_b32 s49, v0, 13
-; GFX900-NEXT:    v_readlane_b32 s48, v0, 12
-; GFX900-NEXT:    v_readlane_b32 s47, v0, 11
-; GFX900-NEXT:    v_readlane_b32 s46, v0, 10
-; GFX900-NEXT:    v_readlane_b32 s45, v0, 9
-; GFX900-NEXT:    v_readlane_b32 s44, v0, 8
-; GFX900-NEXT:    v_readlane_b32 s43, v0, 7
-; GFX900-NEXT:    v_readlane_b32 s42, v0, 6
-; GFX900-NEXT:    v_readlane_b32 s41, v0, 5
-; GFX900-NEXT:    v_readlane_b32 s40, v0, 4
+; GFX900-NEXT:    v_readlane_b32 s51, v0, 7
+; GFX900-NEXT:    v_readlane_b32 s50, v0, 6
+; GFX900-NEXT:    v_readlane_b32 s49, v0, 5
+; GFX900-NEXT:    v_readlane_b32 s48, v0, 4
 ; GFX900-NEXT:    v_readlane_b32 s39, v0, 3
 ; GFX900-NEXT:    v_readlane_b32 s38, v0, 2
 ; GFX900-NEXT:    v_readlane_b32 s37, v0, 1
@@ -21534,18 +21294,10 @@ define void @s_shuffle_v2i64_v8i64__13_6() {
 ; GFX90A-NEXT:    v_writelane_b32 v0, s37, 1
 ; GFX90A-NEXT:    v_writelane_b32 v0, s38, 2
 ; GFX90A-NEXT:    v_writelane_b32 v0, s39, 3
-; GFX90A-NEXT:    v_writelane_b32 v0, s40, 4
-; GFX90A-NEXT:    v_writelane_b32 v0, s41, 5
-; GFX90A-NEXT:    v_writelane_b32 v0, s42, 6
-; GFX90A-NEXT:    v_writelane_b32 v0, s43, 7
-; GFX90A-NEXT:    v_writelane_b32 v0, s44, 8
-; GFX90A-NEXT:    v_writelane_b32 v0, s45, 9
-; GFX90A-NEXT:    v_writelane_b32 v0, s46, 10
-; GFX90A-NEXT:    v_writelane_b32 v0, s47, 11
-; GFX90A-NEXT:    v_writelane_b32 v0, s48, 12
-; GFX90A-NEXT:    v_writelane_b32 v0, s49, 13
-; GFX90A-NEXT:    v_writelane_b32 v0, s50, 14
-; GFX90A-NEXT:    v_writelane_b32 v0, s51, 15
+; GFX90A-NEXT:    v_writelane_b32 v0, s48, 4
+; GFX90A-NEXT:    v_writelane_b32 v0, s49, 5
+; GFX90A-NEXT:    v_writelane_b32 v0, s50, 6
+; GFX90A-NEXT:    v_writelane_b32 v0, s51, 7
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[4:19]
 ; GFX90A-NEXT:    ;;#ASMEND
@@ -21559,18 +21311,10 @@ define void @s_shuffle_v2i64_v8i64__13_6() {
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:11]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    v_readlane_b32 s51, v0, 15
-; GFX90A-NEXT:    v_readlane_b32 s50, v0, 14
-; GFX90A-NEXT:    v_readlane_b32 s49, v0, 13
-; GFX90A-NEXT:    v_readlane_b32 s48, v0, 12
-; GFX90A-NEXT:    v_readlane_b32 s47, v0, 11
-; GFX90A-NEXT:    v_readlane_b32 s46, v0, 10
-; GFX90A-NEXT:    v_readlane_b32 s45, v0, 9
-; GFX90A-NEXT:    v_readlane_b32 s44, v0, 8
-; GFX90A-NEXT:    v_readlane_b32 s43, v0, 7
-; GFX90A-NEXT:    v_readlane_b32 s42, v0, 6
-; GFX90A-NEXT:    v_readlane_b32 s41, v0, 5
-; GFX90A-NEXT:    v_readlane_b32 s40, v0, 4
+; GFX90A-NEXT:    v_readlane_b32 s51, v0, 7
+; GFX90A-NEXT:    v_readlane_b32 s50, v0, 6
+; GFX90A-NEXT:    v_readlane_b32 s49, v0, 5
+; GFX90A-NEXT:    v_readlane_b32 s48, v0, 4
 ; GFX90A-NEXT:    v_readlane_b32 s39, v0, 3
 ; GFX90A-NEXT:    v_readlane_b32 s38, v0, 2
 ; GFX90A-NEXT:    v_readlane_b32 s37, v0, 1
@@ -21627,18 +21371,10 @@ define void @s_shuffle_v2i64_v8i64__14_6() {
 ; GFX900-NEXT:    v_writelane_b32 v0, s37, 1
 ; GFX900-NEXT:    v_writelane_b32 v0, s38, 2
 ; GFX900-NEXT:    v_writelane_b32 v0, s39, 3
-; GFX900-NEXT:    v_writelane_b32 v0, s40, 4
-; GFX900-NEXT:    v_writelane_b32 v0, s41, 5
-; GFX900-NEXT:    v_writelane_b32 v0, s42, 6
-; GFX900-NEXT:    v_writelane_b32 v0, s43, 7
-; GFX900-NEXT:    v_writelane_b32 v0, s44, 8
-; GFX900-NEXT:    v_writelane_b32 v0, s45, 9
-; GFX900-NEXT:    v_writelane_b32 v0, s46, 10
-; GFX900-NEXT:    v_writelane_b32 v0, s47, 11
-; GFX900-NEXT:    v_writelane_b32 v0, s48, 12
-; GFX900-NEXT:    v_writelane_b32 v0, s49, 13
-; GFX900-NEXT:    v_writelane_b32 v0, s50, 14
-; GFX900-NEXT:    v_writelane_b32 v0, s51, 15
+; GFX900-NEXT:    v_writelane_b32 v0, s48, 4
+; GFX900-NEXT:    v_writelane_b32 v0, s49, 5
+; GFX900-NEXT:    v_writelane_b32 v0, s50, 6
+; GFX900-NEXT:    v_writelane_b32 v0, s51, 7
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[4:19]
 ; GFX900-NEXT:    ;;#ASMEND
@@ -21652,18 +21388,10 @@ define void @s_shuffle_v2i64_v8i64__14_6() {
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:11]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    v_readlane_b32 s51, v0, 15
-; GFX900-NEXT:    v_readlane_b32 s50, v0, 14
-; GFX900-NEXT:    v_readlane_b32 s49, v0, 13
-; GFX900-NEXT:    v_readlane_b32 s48, v0, 12
-; GFX900-NEXT:    v_readlane_b32 s47, v0, 11
-; GFX900-NEXT:    v_readlane_b32 s46, v0, 10
-; GFX900-NEXT:    v_readlane_b32 s45, v0, 9
-; GFX900-NEXT:    v_readlane_b32 s44, v0, 8
-; GFX900-NEXT:    v_readlane_b32 s43, v0, 7
-; GFX900-NEXT:    v_readlane_b32 s42, v0, 6
-; GFX900-NEXT:    v_readlane_b32 s41, v0, 5
-; GFX900-NEXT:    v_readlane_b32 s40, v0, 4
+; GFX900-NEXT:    v_readlane_b32 s51, v0, 7
+; GFX900-NEXT:    v_readlane_b32 s50, v0, 6
+; GFX900-NEXT:    v_readlane_b32 s49, v0, 5
+; GFX900-NEXT:    v_readlane_b32 s48, v0, 4
 ; GFX900-NEXT:    v_readlane_b32 s39, v0, 3
 ; GFX900-NEXT:    v_readlane_b32 s38, v0, 2
 ; GFX900-NEXT:    v_readlane_b32 s37, v0, 1
@@ -21684,18 +21412,10 @@ define void @s_shuffle_v2i64_v8i64__14_6() {
 ; GFX90A-NEXT:    v_writelane_b32 v0, s37, 1
 ; GFX90A-NEXT:    v_writelane_b32 v0, s38, 2
 ; GFX90A-NEXT:    v_writelane_b32 v0, s39, 3
-; GFX90A-NEXT:    v_writelane_b32 v0, s40, 4
-; GFX90A-NEXT:    v_writelane_b32 v0, s41, 5
-; GFX90A-NEXT:    v_writelane_b32 v0, s42, 6
-; GFX90A-NEXT:    v_writelane_b32 v0, s43, 7
-; GFX90A-NEXT:    v_writelane_b32 v0, s44, 8
-; GFX90A-NEXT:    v_writelane_b32 v0, s45, 9
-; GFX90A-NEXT:    v_writelane_b32 v0, s46, 10
-; GFX90A-NEXT:    v_writelane_b32 v0, s47, 11
-; GFX90A-NEXT:    v_writelane_b32 v0, s48, 12
-; GFX90A-NEXT:    v_writelane_b32 v0, s49, 13
-; GFX90A-NEXT:    v_writelane_b32 v0, s50, 14
-; GFX90A-NEXT:    v_writelane_b32 v0, s51, 15
+; GFX90A-NEXT:    v_writelane_b32 v0, s48, 4
+; GFX90A-NEXT:    v_writelane_b32 v0, s49, 5
+; GFX90A-NEXT:    v_writelane_b32 v0, s50, 6
+; GFX90A-NEXT:    v_writelane_b32 v0, s51, 7
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[4:19]
 ; GFX90A-NEXT:    ;;#ASMEND
@@ -21709,18 +21429,10 @@ define void @s_shuffle_v2i64_v8i64__14_6() {
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:11]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    v_readlane_b32 s51, v0, 15
-; GFX90A-NEXT:    v_readlane_b32 s50, v0, 14
-; GFX90A-NEXT:    v_readlane_b32 s49, v0, 13
-; GFX90A-NEXT:    v_readlane_b32 s48, v0, 12
-; GFX90A-NEXT:    v_readlane_b32 s47, v0, 11
-; GFX90A-NEXT:    v_readlane_b32 s46, v0, 10
-; GFX90A-NEXT:    v_readlane_b32 s45, v0, 9
-; GFX90A-NEXT:    v_readlane_b32 s44, v0, 8
-; GFX90A-NEXT:    v_readlane_b32 s43, v0, 7
-; GFX90A-NEXT:    v_readlane_b32 s42, v0, 6
-; GFX90A-NEXT:    v_readlane_b32 s41, v0, 5
-; GFX90A-NEXT:    v_readlane_b32 s40, v0, 4
+; GFX90A-NEXT:    v_readlane_b32 s51, v0, 7
+; GFX90A-NEXT:    v_readlane_b32 s50, v0, 6
+; GFX90A-NEXT:    v_readlane_b32 s49, v0, 5
+; GFX90A-NEXT:    v_readlane_b32 s48, v0, 4
 ; GFX90A-NEXT:    v_readlane_b32 s39, v0, 3
 ; GFX90A-NEXT:    v_readlane_b32 s38, v0, 2
 ; GFX90A-NEXT:    v_readlane_b32 s37, v0, 1
@@ -22203,21 +21915,13 @@ define void @s_shuffle_v2i64_v8i64__9_7() {
 ; GFX900-NEXT:    v_writelane_b32 v0, s37, 1
 ; GFX900-NEXT:    v_writelane_b32 v0, s38, 2
 ; GFX900-NEXT:    v_writelane_b32 v0, s39, 3
-; GFX900-NEXT:    v_writelane_b32 v0, s40, 4
-; GFX900-NEXT:    v_writelane_b32 v0, s41, 5
-; GFX900-NEXT:    v_writelane_b32 v0, s42, 6
-; GFX900-NEXT:    v_writelane_b32 v0, s43, 7
-; GFX900-NEXT:    v_writelane_b32 v0, s44, 8
-; GFX900-NEXT:    v_writelane_b32 v0, s45, 9
-; GFX900-NEXT:    v_writelane_b32 v0, s46, 10
-; GFX900-NEXT:    v_writelane_b32 v0, s47, 11
-; GFX900-NEXT:    v_writelane_b32 v0, s48, 12
-; GFX900-NEXT:    v_writelane_b32 v0, s49, 13
-; GFX900-NEXT:    v_writelane_b32 v0, s50, 14
+; GFX900-NEXT:    v_writelane_b32 v0, s48, 4
+; GFX900-NEXT:    v_writelane_b32 v0, s49, 5
+; GFX900-NEXT:    v_writelane_b32 v0, s50, 6
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[4:19]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    v_writelane_b32 v0, s51, 15
+; GFX900-NEXT:    v_writelane_b32 v0, s51, 7
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[36:51]
 ; GFX900-NEXT:    ;;#ASMEND
@@ -22228,18 +21932,10 @@ define void @s_shuffle_v2i64_v8i64__9_7() {
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:11]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    v_readlane_b32 s51, v0, 15
-; GFX900-NEXT:    v_readlane_b32 s50, v0, 14
-; GFX900-NEXT:    v_readlane_b32 s49, v0, 13
-; GFX900-NEXT:    v_readlane_b32 s48, v0, 12
-; GFX900-NEXT:    v_readlane_b32 s47, v0, 11
-; GFX900-NEXT:    v_readlane_b32 s46, v0, 10
-; GFX900-NEXT:    v_readlane_b32 s45, v0, 9
-; GFX900-NEXT:    v_readlane_b32 s44, v0, 8
-; GFX900-NEXT:    v_readlane_b32 s43, v0, 7
-; GFX900-NEXT:    v_readlane_b32 s42, v0, 6
-; GFX900-NEXT:    v_readlane_b32 s41, v0, 5
-; GFX900-NEXT:    v_readlane_b32 s40, v0, 4
+; GFX900-NEXT:    v_readlane_b32 s51, v0, 7
+; GFX900-NEXT:    v_readlane_b32 s50, v0, 6
+; GFX900-NEXT:    v_readlane_b32 s49, v0, 5
+; GFX900-NEXT:    v_readlane_b32 s48, v0, 4
 ; GFX900-NEXT:    v_readlane_b32 s39, v0, 3
 ; GFX900-NEXT:    v_readlane_b32 s38, v0, 2
 ; GFX900-NEXT:    v_readlane_b32 s37, v0, 1
@@ -22260,21 +21956,13 @@ define void @s_shuffle_v2i64_v8i64__9_7() {
 ; GFX90A-NEXT:    v_writelane_b32 v0, s37, 1
 ; GFX90A-NEXT:    v_writelane_b32 v0, s38, 2
 ; GFX90A-NEXT:    v_writelane_b32 v0, s39, 3
-; GFX90A-NEXT:    v_writelane_b32 v0, s40, 4
-; GFX90A-NEXT:    v_writelane_b32 v0, s41, 5
-; GFX90A-NEXT:    v_writelane_b32 v0, s42, 6
-; GFX90A-NEXT:    v_writelane_b32 v0, s43, 7
-; GFX90A-NEXT:    v_writelane_b32 v0, s44, 8
-; GFX90A-NEXT:    v_writelane_b32 v0, s45, 9
-; GFX90A-NEXT:    v_writelane_b32 v0, s46, 10
-; GFX90A-NEXT:    v_writelane_b32 v0, s47, 11
-; GFX90A-NEXT:    v_writelane_b32 v0, s48, 12
-; GFX90A-NEXT:    v_writelane_b32 v0, s49, 13
-; GFX90A-NEXT:    v_writelane_b32 v0, s50, 14
+; GFX90A-NEXT:    v_writelane_b32 v0, s48, 4
+; GFX90A-NEXT:    v_writelane_b32 v0, s49, 5
+; GFX90A-NEXT:    v_writelane_b32 v0, s50, 6
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[4:19]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    v_writelane_b32 v0, s51, 15
+; GFX90A-NEXT:    v_writelane_b32 v0, s51, 7
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[36:51]
 ; GFX90A-NEXT:    ;;#ASMEND
@@ -22285,18 +21973,10 @@ define void @s_shuffle_v2i64_v8i64__9_7() {
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:11]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    v_readlane_b32 s51, v0, 15
-; GFX90A-NEXT:    v_readlane_b32 s50, v0, 14
-; GFX90A-NEXT:    v_readlane_b32 s49, v0, 13
-; GFX90A-NEXT:    v_readlane_b32 s48, v0, 12
-; GFX90A-NEXT:    v_readlane_b32 s47, v0, 11
-; GFX90A-NEXT:    v_readlane_b32 s46, v0, 10
-; GFX90A-NEXT:    v_readlane_b32 s45, v0, 9
-; GFX90A-NEXT:    v_readlane_b32 s44, v0, 8
-; GFX90A-NEXT:    v_readlane_b32 s43, v0, 7
-; GFX90A-NEXT:    v_readlane_b32 s42, v0, 6
-; GFX90A-NEXT:    v_readlane_b32 s41, v0, 5
-; GFX90A-NEXT:    v_readlane_b32 s40, v0, 4
+; GFX90A-NEXT:    v_readlane_b32 s51, v0, 7
+; GFX90A-NEXT:    v_readlane_b32 s50, v0, 6
+; GFX90A-NEXT:    v_readlane_b32 s49, v0, 5
+; GFX90A-NEXT:    v_readlane_b32 s48, v0, 4
 ; GFX90A-NEXT:    v_readlane_b32 s39, v0, 3
 ; GFX90A-NEXT:    v_readlane_b32 s38, v0, 2
 ; GFX90A-NEXT:    v_readlane_b32 s37, v0, 1
@@ -22409,21 +22089,13 @@ define void @s_shuffle_v2i64_v8i64__11_7() {
 ; GFX900-NEXT:    v_writelane_b32 v0, s37, 1
 ; GFX900-NEXT:    v_writelane_b32 v0, s38, 2
 ; GFX900-NEXT:    v_writelane_b32 v0, s39, 3
-; GFX900-NEXT:    v_writelane_b32 v0, s40, 4
-; GFX900-NEXT:    v_writelane_b32 v0, s41, 5
-; GFX900-NEXT:    v_writelane_b32 v0, s42, 6
-; GFX900-NEXT:    v_writelane_b32 v0, s43, 7
-; GFX900-NEXT:    v_writelane_b32 v0, s44, 8
-; GFX900-NEXT:    v_writelane_b32 v0, s45, 9
-; GFX900-NEXT:    v_writelane_b32 v0, s46, 10
-; GFX900-NEXT:    v_writelane_b32 v0, s47, 11
-; GFX900-NEXT:    v_writelane_b32 v0, s48, 12
-; GFX900-NEXT:    v_writelane_b32 v0, s49, 13
-; GFX900-NEXT:    v_writelane_b32 v0, s50, 14
+; GFX900-NEXT:    v_writelane_b32 v0, s48, 4
+; GFX900-NEXT:    v_writelane_b32 v0, s49, 5
+; GFX900-NEXT:    v_writelane_b32 v0, s50, 6
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[4:19]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    v_writelane_b32 v0, s51, 15
+; GFX900-NEXT:    v_writelane_b32 v0, s51, 7
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[36:51]
 ; GFX900-NEXT:    ;;#ASMEND
@@ -22434,18 +22106,10 @@ define void @s_shuffle_v2i64_v8i64__11_7() {
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:11]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    v_readlane_b32 s51, v0, 15
-; GFX900-NEXT:    v_readlane_b32 s50, v0, 14
-; GFX900-NEXT:    v_readlane_b32 s49, v0, 13
-; GFX900-NEXT:    v_readlane_b32 s48, v0, 12
-; GFX900-NEXT:    v_readlane_b32 s47, v0, 11
-; GFX900-NEXT:    v_readlane_b32 s46, v0, 10
-; GFX900-NEXT:    v_readlane_b32 s45, v0, 9
-; GFX900-NEXT:    v_readlane_b32 s44, v0, 8
-; GFX900-NEXT:    v_readlane_b32 s43, v0, 7
-; GFX900-NEXT:    v_readlane_b32 s42, v0, 6
-; GFX900-NEXT:    v_readlane_b32 s41, v0, 5
-; GFX900-NEXT:    v_readlane_b32 s40, v0, 4
+; GFX900-NEXT:    v_readlane_b32 s51, v0, 7
+; GFX900-NEXT:    v_readlane_b32 s50, v0, 6
+; GFX900-NEXT:    v_readlane_b32 s49, v0, 5
+; GFX900-NEXT:    v_readlane_b32 s48, v0, 4
 ; GFX900-NEXT:    v_readlane_b32 s39, v0, 3
 ; GFX900-NEXT:    v_readlane_b32 s38, v0, 2
 ; GFX900-NEXT:    v_readlane_b32 s37, v0, 1
@@ -22466,21 +22130,13 @@ define void @s_shuffle_v2i64_v8i64__11_7() {
 ; GFX90A-NEXT:    v_writelane_b32 v0, s37, 1
 ; GFX90A-NEXT:    v_writelane_b32 v0, s38, 2
 ; GFX90A-NEXT:    v_writelane_b32 v0, s39, 3
-; GFX90A-NEXT:    v_writelane_b32 v0, s40, 4
-; GFX90A-NEXT:    v_writelane_b32 v0, s41, 5
-; GFX90A-NEXT:    v_writelane_b32 v0, s42, 6
-; GFX90A-NEXT:    v_writelane_b32 v0, s43, 7
-; GFX90A-NEXT:    v_writelane_b32 v0, s44, 8
-; GFX90A-NEXT:    v_writelane_b32 v0, s45, 9
-; GFX90A-NEXT:    v_writelane_b32 v0, s46, 10
-; GFX90A-NEXT:    v_writelane_b32 v0, s47, 11
-; GFX90A-NEXT:    v_writelane_b32 v0, s48, 12
-; GFX90A-NEXT:    v_writelane_b32 v0, s49, 13
-; GFX90A-NEXT:    v_writelane_b32 v0, s50, 14
+; GFX90A-NEXT:    v_writelane_b32 v0, s48, 4
+; GFX90A-NEXT:    v_writelane_b32 v0, s49, 5
+; GFX90A-NEXT:    v_writelane_b32 v0, s50, 6
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[4:19]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    v_writelane_b32 v0, s51, 15
+; GFX90A-NEXT:    v_writelane_b32 v0, s51, 7
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[36:51]
 ; GFX90A-NEXT:    ;;#ASMEND
@@ -22491,18 +22147,10 @@ define void @s_shuffle_v2i64_v8i64__11_7() {
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:11]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    v_readlane_b32 s51, v0, 15
-; GFX90A-NEXT:    v_readlane_b32 s50, v0, 14
-; GFX90A-NEXT:    v_readlane_b32 s49, v0, 13
-; GFX90A-NEXT:    v_readlane_b32 s48, v0, 12
-; GFX90A-NEXT:    v_readlane_b32 s47, v0, 11
-; GFX90A-NEXT:    v_readlane_b32 s46, v0, 10
-; GFX90A-NEXT:    v_readlane_b32 s45, v0, 9
-; GFX90A-NEXT:    v_readlane_b32 s44, v0, 8
-; GFX90A-NEXT:    v_readlane_b32 s43, v0, 7
-; GFX90A-NEXT:    v_readlane_b32 s42, v0, 6
-; GFX90A-NEXT:    v_readlane_b32 s41, v0, 5
-; GFX90A-NEXT:    v_readlane_b32 s40, v0, 4
+; GFX90A-NEXT:    v_readlane_b32 s51, v0, 7
+; GFX90A-NEXT:    v_readlane_b32 s50, v0, 6
+; GFX90A-NEXT:    v_readlane_b32 s49, v0, 5
+; GFX90A-NEXT:    v_readlane_b32 s48, v0, 4
 ; GFX90A-NEXT:    v_readlane_b32 s39, v0, 3
 ; GFX90A-NEXT:    v_readlane_b32 s38, v0, 2
 ; GFX90A-NEXT:    v_readlane_b32 s37, v0, 1
@@ -22559,18 +22207,10 @@ define void @s_shuffle_v2i64_v8i64__12_7() {
 ; GFX900-NEXT:    v_writelane_b32 v0, s37, 1
 ; GFX900-NEXT:    v_writelane_b32 v0, s38, 2
 ; GFX900-NEXT:    v_writelane_b32 v0, s39, 3
-; GFX900-NEXT:    v_writelane_b32 v0, s40, 4
-; GFX900-NEXT:    v_writelane_b32 v0, s41, 5
-; GFX900-NEXT:    v_writelane_b32 v0, s42, 6
-; GFX900-NEXT:    v_writelane_b32 v0, s43, 7
-; GFX900-NEXT:    v_writelane_b32 v0, s44, 8
-; GFX900-NEXT:    v_writelane_b32 v0, s45, 9
-; GFX900-NEXT:    v_writelane_b32 v0, s46, 10
-; GFX900-NEXT:    v_writelane_b32 v0, s47, 11
-; GFX900-NEXT:    v_writelane_b32 v0, s48, 12
-; GFX900-NEXT:    v_writelane_b32 v0, s49, 13
-; GFX900-NEXT:    v_writelane_b32 v0, s50, 14
-; GFX900-NEXT:    v_writelane_b32 v0, s51, 15
+; GFX900-NEXT:    v_writelane_b32 v0, s48, 4
+; GFX900-NEXT:    v_writelane_b32 v0, s49, 5
+; GFX900-NEXT:    v_writelane_b32 v0, s50, 6
+; GFX900-NEXT:    v_writelane_b32 v0, s51, 7
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[4:19]
 ; GFX900-NEXT:    ;;#ASMEND
@@ -22584,18 +22224,10 @@ define void @s_shuffle_v2i64_v8i64__12_7() {
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:11]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    v_readlane_b32 s51, v0, 15
-; GFX900-NEXT:    v_readlane_b32 s50, v0, 14
-; GFX900-NEXT:    v_readlane_b32 s49, v0, 13
-; GFX900-NEXT:    v_readlane_b32 s48, v0, 12
-; GFX900-NEXT:    v_readlane_b32 s47, v0, 11
-; GFX900-NEXT:    v_readlane_b32 s46, v0, 10
-; GFX900-NEXT:    v_readlane_b32 s45, v0, 9
-; GFX900-NEXT:    v_readlane_b32 s44, v0, 8
-; GFX900-NEXT:    v_readlane_b32 s43, v0, 7
-; GFX900-NEXT:    v_readlane_b32 s42, v0, 6
-; GFX900-NEXT:    v_readlane_b32 s41, v0, 5
-; GFX900-NEXT:    v_readlane_b32 s40, v0, 4
+; GFX900-NEXT:    v_readlane_b32 s51, v0, 7
+; GFX900-NEXT:    v_readlane_b32 s50, v0, 6
+; GFX900-NEXT:    v_readlane_b32 s49, v0, 5
+; GFX900-NEXT:    v_readlane_b32 s48, v0, 4
 ; GFX900-NEXT:    v_readlane_b32 s39, v0, 3
 ; GFX900-NEXT:    v_readlane_b32 s38, v0, 2
 ; GFX900-NEXT:    v_readlane_b32 s37, v0, 1
@@ -22616,18 +22248,10 @@ define void @s_shuffle_v2i64_v8i64__12_7() {
 ; GFX90A-NEXT:    v_writelane_b32 v0, s37, 1
 ; GFX90A-NEXT:    v_writelane_b32 v0, s38, 2
 ; GFX90A-NEXT:    v_writelane_b32 v0, s39, 3
-; GFX90A-NEXT:    v_writelane_b32 v0, s40, 4
-; GFX90A-NEXT:    v_writelane_b32 v0, s41, 5
-; GFX90A-NEXT:    v_writelane_b32 v0, s42, 6
-; GFX90A-NEXT:    v_writelane_b32 v0, s43, 7
-; GFX90A-NEXT:    v_writelane_b32 v0, s44, 8
-; GFX90A-NEXT:    v_writelane_b32 v0, s45, 9
-; GFX90A-NEXT:    v_writelane_b32 v0, s46, 10
-; GFX90A-NEXT:    v_writelane_b32 v0, s47, 11
-; GFX90A-NEXT:    v_writelane_b32 v0, s48, 12
-; GFX90A-NEXT:    v_writelane_b32 v0, s49, 13
-; GFX90A-NEXT:    v_writelane_b32 v0, s50, 14
-; GFX90A-NEXT:    v_writelane_b32 v0, s51, 15
+; GFX90A-NEXT:    v_writelane_b32 v0, s48, 4
+; GFX90A-NEXT:    v_writelane_b32 v0, s49, 5
+; GFX90A-NEXT:    v_writelane_b32 v0, s50, 6
+; GFX90A-NEXT:    v_writelane_b32 v0, s51, 7
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[4:19]
 ; GFX90A-NEXT:    ;;#ASMEND
@@ -22641,18 +22265,10 @@ define void @s_shuffle_v2i64_v8i64__12_7() {
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:11]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    v_readlane_b32 s51, v0, 15
-; GFX90A-NEXT:    v_readlane_b32 s50, v0, 14
-; GFX90A-NEXT:    v_readlane_b32 s49, v0, 13
-; GFX90A-NEXT:    v_readlane_b32 s48, v0, 12
-; GFX90A-NEXT:    v_readlane_b32 s47, v0, 11
-; GFX90A-NEXT:    v_readlane_b32 s46, v0, 10
-; GFX90A-NEXT:    v_readlane_b32 s45, v0, 9
-; GFX90A-NEXT:    v_readlane_b32 s44, v0, 8
-; GFX90A-NEXT:    v_readlane_b32 s43, v0, 7
-; GFX90A-NEXT:    v_readlane_b32 s42, v0, 6
-; GFX90A-NEXT:    v_readlane_b32 s41, v0, 5
-; GFX90A-NEXT:    v_readlane_b32 s40, v0, 4
+; GFX90A-NEXT:    v_readlane_b32 s51, v0, 7
+; GFX90A-NEXT:    v_readlane_b32 s50, v0, 6
+; GFX90A-NEXT:    v_readlane_b32 s49, v0, 5
+; GFX90A-NEXT:    v_readlane_b32 s48, v0, 4
 ; GFX90A-NEXT:    v_readlane_b32 s39, v0, 3
 ; GFX90A-NEXT:    v_readlane_b32 s38, v0, 2
 ; GFX90A-NEXT:    v_readlane_b32 s37, v0, 1
@@ -22697,21 +22313,13 @@ define void @s_shuffle_v2i64_v8i64__13_7() {
 ; GFX900-NEXT:    v_writelane_b32 v0, s37, 1
 ; GFX900-NEXT:    v_writelane_b32 v0, s38, 2
 ; GFX900-NEXT:    v_writelane_b32 v0, s39, 3
-; GFX900-NEXT:    v_writelane_b32 v0, s40, 4
-; GFX900-NEXT:    v_writelane_b32 v0, s41, 5
-; GFX900-NEXT:    v_writelane_b32 v0, s42, 6
-; GFX900-NEXT:    v_writelane_b32 v0, s43, 7
-; GFX900-NEXT:    v_writelane_b32 v0, s44, 8
-; GFX900-NEXT:    v_writelane_b32 v0, s45, 9
-; GFX900-NEXT:    v_writelane_b32 v0, s46, 10
-; GFX900-NEXT:    v_writelane_b32 v0, s47, 11
-; GFX900-NEXT:    v_writelane_b32 v0, s48, 12
-; GFX900-NEXT:    v_writelane_b32 v0, s49, 13
-; GFX900-NEXT:    v_writelane_b32 v0, s50, 14
+; GFX900-NEXT:    v_writelane_b32 v0, s48, 4
+; GFX900-NEXT:    v_writelane_b32 v0, s49, 5
+; GFX900-NEXT:    v_writelane_b32 v0, s50, 6
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[4:19]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    v_writelane_b32 v0, s51, 15
+; GFX900-NEXT:    v_writelane_b32 v0, s51, 7
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[36:51]
 ; GFX900-NEXT:    ;;#ASMEND
@@ -22722,18 +22330,10 @@ define void @s_shuffle_v2i64_v8i64__13_7() {
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:11]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    v_readlane_b32 s51, v0, 15
-; GFX900-NEXT:    v_readlane_b32 s50, v0, 14
-; GFX900-NEXT:    v_readlane_b32 s49, v0, 13
-; GFX900-NEXT:    v_readlane_b32 s48, v0, 12
-; GFX900-NEXT:    v_readlane_b32 s47, v0, 11
-; GFX900-NEXT:    v_readlane_b32 s46, v0, 10
-; GFX900-NEXT:    v_readlane_b32 s45, v0, 9
-; GFX900-NEXT:    v_readlane_b32 s44, v0, 8
-; GFX900-NEXT:    v_readlane_b32 s43, v0, 7
-; GFX900-NEXT:    v_readlane_b32 s42, v0, 6
-; GFX900-NEXT:    v_readlane_b32 s41, v0, 5
-; GFX900-NEXT:    v_readlane_b32 s40, v0, 4
+; GFX900-NEXT:    v_readlane_b32 s51, v0, 7
+; GFX900-NEXT:    v_readlane_b32 s50, v0, 6
+; GFX900-NEXT:    v_readlane_b32 s49, v0, 5
+; GFX900-NEXT:    v_readlane_b32 s48, v0, 4
 ; GFX900-NEXT:    v_readlane_b32 s39, v0, 3
 ; GFX900-NEXT:    v_readlane_b32 s38, v0, 2
 ; GFX900-NEXT:    v_readlane_b32 s37, v0, 1
@@ -22754,21 +22354,13 @@ define void @s_shuffle_v2i64_v8i64__13_7() {
 ; GFX90A-NEXT:    v_writelane_b32 v0, s37, 1
 ; GFX90A-NEXT:    v_writelane_b32 v0, s38, 2
 ; GFX90A-NEXT:    v_writelane_b32 v0, s39, 3
-; GFX90A-NEXT:    v_writelane_b32 v0, s40, 4
-; GFX90A-NEXT:    v_writelane_b32 v0, s41, 5
-; GFX90A-NEXT:    v_writelane_b32 v0, s42, 6
-; GFX90A-NEXT:    v_writelane_b32 v0, s43, 7
-; GFX90A-NEXT:    v_writelane_b32 v0, s44, 8
-; GFX90A-NEXT:    v_writelane_b32 v0, s45, 9
-; GFX90A-NEXT:    v_writelane_b32 v0, s46, 10
-; GFX90A-NEXT:    v_writelane_b32 v0, s47, 11
-; GFX90A-NEXT:    v_writelane_b32 v0, s48, 12
-; GFX90A-NEXT:    v_writelane_b32 v0, s49, 13
-; GFX90A-NEXT:    v_writelane_b32 v0, s50, 14
+; GFX90A-NEXT:    v_writelane_b32 v0, s48, 4
+; GFX90A-NEXT:    v_writelane_b32 v0, s49, 5
+; GFX90A-NEXT:    v_writelane_b32 v0, s50, 6
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[4:19]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    v_writelane_b32 v0, s51, 15
+; GFX90A-NEXT:    v_writelane_b32 v0, s51, 7
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[36:51]
 ; GFX90A-NEXT:    ;;#ASMEND
@@ -22779,18 +22371,10 @@ define void @s_shuffle_v2i64_v8i64__13_7() {
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:11]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    v_readlane_b32 s51, v0, 15
-; GFX90A-NEXT:    v_readlane_b32 s50, v0, 14
-; GFX90A-NEXT:    v_readlane_b32 s49, v0, 13
-; GFX90A-NEXT:    v_readlane_b32 s48, v0, 12
-; GFX90A-NEXT:    v_readlane_b32 s47, v0, 11
-; GFX90A-NEXT:    v_readlane_b32 s46, v0, 10
-; GFX90A-NEXT:    v_readlane_b32 s45, v0, 9
-; GFX90A-NEXT:    v_readlane_b32 s44, v0, 8
-; GFX90A-NEXT:    v_readlane_b32 s43, v0, 7
-; GFX90A-NEXT:    v_readlane_b32 s42, v0, 6
-; GFX90A-NEXT:    v_readlane_b32 s41, v0, 5
-; GFX90A-NEXT:    v_readlane_b32 s40, v0, 4
+; GFX90A-NEXT:    v_readlane_b32 s51, v0, 7
+; GFX90A-NEXT:    v_readlane_b32 s50, v0, 6
+; GFX90A-NEXT:    v_readlane_b32 s49, v0, 5
+; GFX90A-NEXT:    v_readlane_b32 s48, v0, 4
 ; GFX90A-NEXT:    v_readlane_b32 s39, v0, 3
 ; GFX90A-NEXT:    v_readlane_b32 s38, v0, 2
 ; GFX90A-NEXT:    v_readlane_b32 s37, v0, 1
@@ -22847,18 +22431,10 @@ define void @s_shuffle_v2i64_v8i64__14_7() {
 ; GFX900-NEXT:    v_writelane_b32 v0, s37, 1
 ; GFX900-NEXT:    v_writelane_b32 v0, s38, 2
 ; GFX900-NEXT:    v_writelane_b32 v0, s39, 3
-; GFX900-NEXT:    v_writelane_b32 v0, s40, 4
-; GFX900-NEXT:    v_writelane_b32 v0, s41, 5
-; GFX900-NEXT:    v_writelane_b32 v0, s42, 6
-; GFX900-NEXT:    v_writelane_b32 v0, s43, 7
-; GFX900-NEXT:    v_writelane_b32 v0, s44, 8
-; GFX900-NEXT:    v_writelane_b32 v0, s45, 9
-; GFX900-NEXT:    v_writelane_b32 v0, s46, 10
-; GFX900-NEXT:    v_writelane_b32 v0, s47, 11
-; GFX900-NEXT:    v_writelane_b32 v0, s48, 12
-; GFX900-NEXT:    v_writelane_b32 v0, s49, 13
-; GFX900-NEXT:    v_writelane_b32 v0, s50, 14
-; GFX900-NEXT:    v_writelane_b32 v0, s51, 15
+; GFX900-NEXT:    v_writelane_b32 v0, s48, 4
+; GFX900-NEXT:    v_writelane_b32 v0, s49, 5
+; GFX900-NEXT:    v_writelane_b32 v0, s50, 6
+; GFX900-NEXT:    v_writelane_b32 v0, s51, 7
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[4:19]
 ; GFX900-NEXT:    ;;#ASMEND
@@ -22872,18 +22448,10 @@ define void @s_shuffle_v2i64_v8i64__14_7() {
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:11]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    v_readlane_b32 s51, v0, 15
-; GFX900-NEXT:    v_readlane_b32 s50, v0, 14
-; GFX900-NEXT:    v_readlane_b32 s49, v0, 13
-; GFX900-NEXT:    v_readlane_b32 s48, v0, 12
-; GFX900-NEXT:    v_readlane_b32 s47, v0, 11
-; GFX900-NEXT:    v_readlane_b32 s46, v0, 10
-; GFX900-NEXT:    v_readlane_b32 s45, v0, 9
-; GFX900-NEXT:    v_readlane_b32 s44, v0, 8
-; GFX900-NEXT:    v_readlane_b32 s43, v0, 7
-; GFX900-NEXT:    v_readlane_b32 s42, v0, 6
-; GFX900-NEXT:    v_readlane_b32 s41, v0, 5
-; GFX900-NEXT:    v_readlane_b32 s40, v0, 4
+; GFX900-NEXT:    v_readlane_b32 s51, v0, 7
+; GFX900-NEXT:    v_readlane_b32 s50, v0, 6
+; GFX900-NEXT:    v_readlane_b32 s49, v0, 5
+; GFX900-NEXT:    v_readlane_b32 s48, v0, 4
 ; GFX900-NEXT:    v_readlane_b32 s39, v0, 3
 ; GFX900-NEXT:    v_readlane_b32 s38, v0, 2
 ; GFX900-NEXT:    v_readlane_b32 s37, v0, 1
@@ -22904,18 +22472,10 @@ define void @s_shuffle_v2i64_v8i64__14_7() {
 ; GFX90A-NEXT:    v_writelane_b32 v0, s37, 1
 ; GFX90A-NEXT:    v_writelane_b32 v0, s38, 2
 ; GFX90A-NEXT:    v_writelane_b32 v0, s39, 3
-; GFX90A-NEXT:    v_writelane_b32 v0, s40, 4
-; GFX90A-NEXT:    v_writelane_b32 v0, s41, 5
-; GFX90A-NEXT:    v_writelane_b32 v0, s42, 6
-; GFX90A-NEXT:    v_writelane_b32 v0, s43, 7
-; GFX90A-NEXT:    v_writelane_b32 v0, s44, 8
-; GFX90A-NEXT:    v_writelane_b32 v0, s45, 9
-; GFX90A-NEXT:    v_writelane_b32 v0, s46, 10
-; GFX90A-NEXT:    v_writelane_b32 v0, s47, 11
-; GFX90A-NEXT:    v_writelane_b32 v0, s48, 12
-; GFX90A-NEXT:    v_writelane_b32 v0, s49, 13
-; GFX90A-NEXT:    v_writelane_b32 v0, s50, 14
-; GFX90A-NEXT:    v_writelane_b32 v0, s51, 15
+; GFX90A-NEXT:    v_writelane_b32 v0, s48, 4
+; GFX90A-NEXT:    v_writelane_b32 v0, s49, 5
+; GFX90A-NEXT:    v_writelane_b32 v0, s50, 6
+; GFX90A-NEXT:    v_writelane_b32 v0, s51, 7
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[4:19]
 ; GFX90A-NEXT:    ;;#ASMEND
@@ -22929,18 +22489,10 @@ define void @s_shuffle_v2i64_v8i64__14_7() {
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:11]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    v_readlane_b32 s51, v0, 15
-; GFX90A-NEXT:    v_readlane_b32 s50, v0, 14
-; GFX90A-NEXT:    v_readlane_b32 s49, v0, 13
-; GFX90A-NEXT:    v_readlane_b32 s48, v0, 12
-; GFX90A-NEXT:    v_readlane_b32 s47, v0, 11
-; GFX90A-NEXT:    v_readlane_b32 s46, v0, 10
-; GFX90A-NEXT:    v_readlane_b32 s45, v0, 9
-; GFX90A-NEXT:    v_readlane_b32 s44, v0, 8
-; GFX90A-NEXT:    v_readlane_b32 s43, v0, 7
-; GFX90A-NEXT:    v_readlane_b32 s42, v0, 6
-; GFX90A-NEXT:    v_readlane_b32 s41, v0, 5
-; GFX90A-NEXT:    v_readlane_b32 s40, v0, 4
+; GFX90A-NEXT:    v_readlane_b32 s51, v0, 7
+; GFX90A-NEXT:    v_readlane_b32 s50, v0, 6
+; GFX90A-NEXT:    v_readlane_b32 s49, v0, 5
+; GFX90A-NEXT:    v_readlane_b32 s48, v0, 4
 ; GFX90A-NEXT:    v_readlane_b32 s39, v0, 3
 ; GFX90A-NEXT:    v_readlane_b32 s38, v0, 2
 ; GFX90A-NEXT:    v_readlane_b32 s37, v0, 1
@@ -23861,18 +23413,10 @@ define void @s_shuffle_v2i64_v8i64__3_9() {
 ; GFX900-NEXT:    v_writelane_b32 v0, s37, 1
 ; GFX900-NEXT:    v_writelane_b32 v0, s38, 2
 ; GFX900-NEXT:    v_writelane_b32 v0, s39, 3
-; GFX900-NEXT:    v_writelane_b32 v0, s40, 4
-; GFX900-NEXT:    v_writelane_b32 v0, s41, 5
-; GFX900-NEXT:    v_writelane_b32 v0, s42, 6
-; GFX900-NEXT:    v_writelane_b32 v0, s43, 7
-; GFX900-NEXT:    v_writelane_b32 v0, s44, 8
-; GFX900-NEXT:    v_writelane_b32 v0, s45, 9
-; GFX900-NEXT:    v_writelane_b32 v0, s46, 10
-; GFX900-NEXT:    v_writelane_b32 v0, s47, 11
-; GFX900-NEXT:    v_writelane_b32 v0, s48, 12
-; GFX900-NEXT:    v_writelane_b32 v0, s49, 13
-; GFX900-NEXT:    v_writelane_b32 v0, s50, 14
-; GFX900-NEXT:    v_writelane_b32 v0, s51, 15
+; GFX900-NEXT:    v_writelane_b32 v0, s48, 4
+; GFX900-NEXT:    v_writelane_b32 v0, s49, 5
+; GFX900-NEXT:    v_writelane_b32 v0, s50, 6
+; GFX900-NEXT:    v_writelane_b32 v0, s51, 7
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[36:51]
 ; GFX900-NEXT:    ;;#ASMEND
@@ -23884,18 +23428,10 @@ define void @s_shuffle_v2i64_v8i64__3_9() {
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:11]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    v_readlane_b32 s51, v0, 15
-; GFX900-NEXT:    v_readlane_b32 s50, v0, 14
-; GFX900-NEXT:    v_readlane_b32 s49, v0, 13
-; GFX900-NEXT:    v_readlane_b32 s48, v0, 12
-; GFX900-NEXT:    v_readlane_b32 s47, v0, 11
-; GFX900-NEXT:    v_readlane_b32 s46, v0, 10
-; GFX900-NEXT:    v_readlane_b32 s45, v0, 9
-; GFX900-NEXT:    v_readlane_b32 s44, v0, 8
-; GFX900-NEXT:    v_readlane_b32 s43, v0, 7
-; GFX900-NEXT:    v_readlane_b32 s42, v0, 6
-; GFX900-NEXT:    v_readlane_b32 s41, v0, 5
-; GFX900-NEXT:    v_readlane_b32 s40, v0, 4
+; GFX900-NEXT:    v_readlane_b32 s51, v0, 7
+; GFX900-NEXT:    v_readlane_b32 s50, v0, 6
+; GFX900-NEXT:    v_readlane_b32 s49, v0, 5
+; GFX900-NEXT:    v_readlane_b32 s48, v0, 4
 ; GFX900-NEXT:    v_readlane_b32 s39, v0, 3
 ; GFX900-NEXT:    v_readlane_b32 s38, v0, 2
 ; GFX900-NEXT:    v_readlane_b32 s37, v0, 1
@@ -23916,18 +23452,10 @@ define void @s_shuffle_v2i64_v8i64__3_9() {
 ; GFX90A-NEXT:    v_writelane_b32 v0, s37, 1
 ; GFX90A-NEXT:    v_writelane_b32 v0, s38, 2
 ; GFX90A-NEXT:    v_writelane_b32 v0, s39, 3
-; GFX90A-NEXT:    v_writelane_b32 v0, s40, 4
-; GFX90A-NEXT:    v_writelane_b32 v0, s41, 5
-; GFX90A-NEXT:    v_writelane_b32 v0, s42, 6
-; GFX90A-NEXT:    v_writelane_b32 v0, s43, 7
-; GFX90A-NEXT:    v_writelane_b32 v0, s44, 8
-; GFX90A-NEXT:    v_writelane_b32 v0, s45, 9
-; GFX90A-NEXT:    v_writelane_b32 v0, s46, 10
-; GFX90A-NEXT:    v_writelane_b32 v0, s47, 11
-; GFX90A-NEXT:    v_writelane_b32 v0, s48, 12
-; GFX90A-NEXT:    v_writelane_b32 v0, s49, 13
-; GFX90A-NEXT:    v_writelane_b32 v0, s50, 14
-; GFX90A-NEXT:    v_writelane_b32 v0, s51, 15
+; GFX90A-NEXT:    v_writelane_b32 v0, s48, 4
+; GFX90A-NEXT:    v_writelane_b32 v0, s49, 5
+; GFX90A-NEXT:    v_writelane_b32 v0, s50, 6
+; GFX90A-NEXT:    v_writelane_b32 v0, s51, 7
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[36:51]
 ; GFX90A-NEXT:    ;;#ASMEND
@@ -23939,18 +23467,10 @@ define void @s_shuffle_v2i64_v8i64__3_9() {
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:11]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    v_readlane_b32 s51, v0, 15
-; GFX90A-NEXT:    v_readlane_b32 s50, v0, 14
-; GFX90A-NEXT:    v_readlane_b32 s49, v0, 13
-; GFX90A-NEXT:    v_readlane_b32 s48, v0, 12
-; GFX90A-NEXT:    v_readlane_b32 s47, v0, 11
-; GFX90A-NEXT:    v_readlane_b32 s46, v0, 10
-; GFX90A-NEXT:    v_readlane_b32 s45, v0, 9
-; GFX90A-NEXT:    v_readlane_b32 s44, v0, 8
-; GFX90A-NEXT:    v_readlane_b32 s43, v0, 7
-; GFX90A-NEXT:    v_readlane_b32 s42, v0, 6
-; GFX90A-NEXT:    v_readlane_b32 s41, v0, 5
-; GFX90A-NEXT:    v_readlane_b32 s40, v0, 4
+; GFX90A-NEXT:    v_readlane_b32 s51, v0, 7
+; GFX90A-NEXT:    v_readlane_b32 s50, v0, 6
+; GFX90A-NEXT:    v_readlane_b32 s49, v0, 5
+; GFX90A-NEXT:    v_readlane_b32 s48, v0, 4
 ; GFX90A-NEXT:    v_readlane_b32 s39, v0, 3
 ; GFX90A-NEXT:    v_readlane_b32 s38, v0, 2
 ; GFX90A-NEXT:    v_readlane_b32 s37, v0, 1
@@ -24166,21 +23686,13 @@ define void @s_shuffle_v2i64_v8i64__6_9() {
 ; GFX900-NEXT:    v_writelane_b32 v0, s37, 1
 ; GFX900-NEXT:    v_writelane_b32 v0, s38, 2
 ; GFX900-NEXT:    v_writelane_b32 v0, s39, 3
-; GFX900-NEXT:    v_writelane_b32 v0, s40, 4
-; GFX900-NEXT:    v_writelane_b32 v0, s41, 5
-; GFX900-NEXT:    v_writelane_b32 v0, s42, 6
-; GFX900-NEXT:    v_writelane_b32 v0, s43, 7
-; GFX900-NEXT:    v_writelane_b32 v0, s44, 8
-; GFX900-NEXT:    v_writelane_b32 v0, s45, 9
-; GFX900-NEXT:    v_writelane_b32 v0, s46, 10
-; GFX900-NEXT:    v_writelane_b32 v0, s47, 11
-; GFX900-NEXT:    v_writelane_b32 v0, s48, 12
-; GFX900-NEXT:    v_writelane_b32 v0, s49, 13
-; GFX900-NEXT:    v_writelane_b32 v0, s50, 14
+; GFX900-NEXT:    v_writelane_b32 v0, s48, 4
+; GFX900-NEXT:    v_writelane_b32 v0, s49, 5
+; GFX900-NEXT:    v_writelane_b32 v0, s50, 6
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[4:19]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    v_writelane_b32 v0, s51, 15
+; GFX900-NEXT:    v_writelane_b32 v0, s51, 7
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[36:51]
 ; GFX900-NEXT:    ;;#ASMEND
@@ -24191,18 +23703,10 @@ define void @s_shuffle_v2i64_v8i64__6_9() {
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:11]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    v_readlane_b32 s51, v0, 15
-; GFX900-NEXT:    v_readlane_b32 s50, v0, 14
-; GFX900-NEXT:    v_readlane_b32 s49, v0, 13
-; GFX900-NEXT:    v_readlane_b32 s48, v0, 12
-; GFX900-NEXT:    v_readlane_b32 s47, v0, 11
-; GFX900-NEXT:    v_readlane_b32 s46, v0, 10
-; GFX900-NEXT:    v_readlane_b32 s45, v0, 9
-; GFX900-NEXT:    v_readlane_b32 s44, v0, 8
-; GFX900-NEXT:    v_readlane_b32 s43, v0, 7
-; GFX900-NEXT:    v_readlane_b32 s42, v0, 6
-; GFX900-NEXT:    v_readlane_b32 s41, v0, 5
-; GFX900-NEXT:    v_readlane_b32 s40, v0, 4
+; GFX900-NEXT:    v_readlane_b32 s51, v0, 7
+; GFX900-NEXT:    v_readlane_b32 s50, v0, 6
+; GFX900-NEXT:    v_readlane_b32 s49, v0, 5
+; GFX900-NEXT:    v_readlane_b32 s48, v0, 4
 ; GFX900-NEXT:    v_readlane_b32 s39, v0, 3
 ; GFX900-NEXT:    v_readlane_b32 s38, v0, 2
 ; GFX900-NEXT:    v_readlane_b32 s37, v0, 1
@@ -24223,21 +23727,13 @@ define void @s_shuffle_v2i64_v8i64__6_9() {
 ; GFX90A-NEXT:    v_writelane_b32 v0, s37, 1
 ; GFX90A-NEXT:    v_writelane_b32 v0, s38, 2
 ; GFX90A-NEXT:    v_writelane_b32 v0, s39, 3
-; GFX90A-NEXT:    v_writelane_b32 v0, s40, 4
-; GFX90A-NEXT:    v_writelane_b32 v0, s41, 5
-; GFX90A-NEXT:    v_writelane_b32 v0, s42, 6
-; GFX90A-NEXT:    v_writelane_b32 v0, s43, 7
-; GFX90A-NEXT:    v_writelane_b32 v0, s44, 8
-; GFX90A-NEXT:    v_writelane_b32 v0, s45, 9
-; GFX90A-NEXT:    v_writelane_b32 v0, s46, 10
-; GFX90A-NEXT:    v_writelane_b32 v0, s47, 11
-; GFX90A-NEXT:    v_writelane_b32 v0, s48, 12
-; GFX90A-NEXT:    v_writelane_b32 v0, s49, 13
-; GFX90A-NEXT:    v_writelane_b32 v0, s50, 14
+; GFX90A-NEXT:    v_writelane_b32 v0, s48, 4
+; GFX90A-NEXT:    v_writelane_b32 v0, s49, 5
+; GFX90A-NEXT:    v_writelane_b32 v0, s50, 6
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[4:19]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    v_writelane_b32 v0, s51, 15
+; GFX90A-NEXT:    v_writelane_b32 v0, s51, 7
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[36:51]
 ; GFX90A-NEXT:    ;;#ASMEND
@@ -24248,18 +23744,10 @@ define void @s_shuffle_v2i64_v8i64__6_9() {
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:11]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    v_readlane_b32 s51, v0, 15
-; GFX90A-NEXT:    v_readlane_b32 s50, v0, 14
-; GFX90A-NEXT:    v_readlane_b32 s49, v0, 13
-; GFX90A-NEXT:    v_readlane_b32 s48, v0, 12
-; GFX90A-NEXT:    v_readlane_b32 s47, v0, 11
-; GFX90A-NEXT:    v_readlane_b32 s46, v0, 10
-; GFX90A-NEXT:    v_readlane_b32 s45, v0, 9
-; GFX90A-NEXT:    v_readlane_b32 s44, v0, 8
-; GFX90A-NEXT:    v_readlane_b32 s43, v0, 7
-; GFX90A-NEXT:    v_readlane_b32 s42, v0, 6
-; GFX90A-NEXT:    v_readlane_b32 s41, v0, 5
-; GFX90A-NEXT:    v_readlane_b32 s40, v0, 4
+; GFX90A-NEXT:    v_readlane_b32 s51, v0, 7
+; GFX90A-NEXT:    v_readlane_b32 s50, v0, 6
+; GFX90A-NEXT:    v_readlane_b32 s49, v0, 5
+; GFX90A-NEXT:    v_readlane_b32 s48, v0, 4
 ; GFX90A-NEXT:    v_readlane_b32 s39, v0, 3
 ; GFX90A-NEXT:    v_readlane_b32 s38, v0, 2
 ; GFX90A-NEXT:    v_readlane_b32 s37, v0, 1
@@ -25042,21 +24530,13 @@ define void @s_shuffle_v2i64_v8i64__6_10() {
 ; GFX900-NEXT:    v_writelane_b32 v0, s37, 1
 ; GFX900-NEXT:    v_writelane_b32 v0, s38, 2
 ; GFX900-NEXT:    v_writelane_b32 v0, s39, 3
-; GFX900-NEXT:    v_writelane_b32 v0, s40, 4
-; GFX900-NEXT:    v_writelane_b32 v0, s41, 5
-; GFX900-NEXT:    v_writelane_b32 v0, s42, 6
-; GFX900-NEXT:    v_writelane_b32 v0, s43, 7
-; GFX900-NEXT:    v_writelane_b32 v0, s44, 8
-; GFX900-NEXT:    v_writelane_b32 v0, s45, 9
-; GFX900-NEXT:    v_writelane_b32 v0, s46, 10
-; GFX900-NEXT:    v_writelane_b32 v0, s47, 11
-; GFX900-NEXT:    v_writelane_b32 v0, s48, 12
-; GFX900-NEXT:    v_writelane_b32 v0, s49, 13
-; GFX900-NEXT:    v_writelane_b32 v0, s50, 14
+; GFX900-NEXT:    v_writelane_b32 v0, s48, 4
+; GFX900-NEXT:    v_writelane_b32 v0, s49, 5
+; GFX900-NEXT:    v_writelane_b32 v0, s50, 6
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[4:19]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    v_writelane_b32 v0, s51, 15
+; GFX900-NEXT:    v_writelane_b32 v0, s51, 7
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[36:51]
 ; GFX900-NEXT:    ;;#ASMEND
@@ -25067,18 +24547,10 @@ define void @s_shuffle_v2i64_v8i64__6_10() {
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:11]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    v_readlane_b32 s51, v0, 15
-; GFX900-NEXT:    v_readlane_b32 s50, v0, 14
-; GFX900-NEXT:    v_readlane_b32 s49, v0, 13
-; GFX900-NEXT:    v_readlane_b32 s48, v0, 12
-; GFX900-NEXT:    v_readlane_b32 s47, v0, 11
-; GFX900-NEXT:    v_readlane_b32 s46, v0, 10
-; GFX900-NEXT:    v_readlane_b32 s45, v0, 9
-; GFX900-NEXT:    v_readlane_b32 s44, v0, 8
-; GFX900-NEXT:    v_readlane_b32 s43, v0, 7
-; GFX900-NEXT:    v_readlane_b32 s42, v0, 6
-; GFX900-NEXT:    v_readlane_b32 s41, v0, 5
-; GFX900-NEXT:    v_readlane_b32 s40, v0, 4
+; GFX900-NEXT:    v_readlane_b32 s51, v0, 7
+; GFX900-NEXT:    v_readlane_b32 s50, v0, 6
+; GFX900-NEXT:    v_readlane_b32 s49, v0, 5
+; GFX900-NEXT:    v_readlane_b32 s48, v0, 4
 ; GFX900-NEXT:    v_readlane_b32 s39, v0, 3
 ; GFX900-NEXT:    v_readlane_b32 s38, v0, 2
 ; GFX900-NEXT:    v_readlane_b32 s37, v0, 1
@@ -25099,21 +24571,13 @@ define void @s_shuffle_v2i64_v8i64__6_10() {
 ; GFX90A-NEXT:    v_writelane_b32 v0, s37, 1
 ; GFX90A-NEXT:    v_writelane_b32 v0, s38, 2
 ; GFX90A-NEXT:    v_writelane_b32 v0, s39, 3
-; GFX90A-NEXT:    v_writelane_b32 v0, s40, 4
-; GFX90A-NEXT:    v_writelane_b32 v0, s41, 5
-; GFX90A-NEXT:    v_writelane_b32 v0, s42, 6
-; GFX90A-NEXT:    v_writelane_b32 v0, s43, 7
-; GFX90A-NEXT:    v_writelane_b32 v0, s44, 8
-; GFX90A-NEXT:    v_writelane_b32 v0, s45, 9
-; GFX90A-NEXT:    v_writelane_b32 v0, s46, 10
-; GFX90A-NEXT:    v_writelane_b32 v0, s47, 11
-; GFX90A-NEXT:    v_writelane_b32 v0, s48, 12
-; GFX90A-NEXT:    v_writelane_b32 v0, s49, 13
-; GFX90A-NEXT:    v_writelane_b32 v0, s50, 14
+; GFX90A-NEXT:    v_writelane_b32 v0, s48, 4
+; GFX90A-NEXT:    v_writelane_b32 v0, s49, 5
+; GFX90A-NEXT:    v_writelane_b32 v0, s50, 6
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[4:19]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    v_writelane_b32 v0, s51, 15
+; GFX90A-NEXT:    v_writelane_b32 v0, s51, 7
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[36:51]
 ; GFX90A-NEXT:    ;;#ASMEND
@@ -25124,18 +24588,10 @@ define void @s_shuffle_v2i64_v8i64__6_10() {
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:11]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    v_readlane_b32 s51, v0, 15
-; GFX90A-NEXT:    v_readlane_b32 s50, v0, 14
-; GFX90A-NEXT:    v_readlane_b32 s49, v0, 13
-; GFX90A-NEXT:    v_readlane_b32 s48, v0, 12
-; GFX90A-NEXT:    v_readlane_b32 s47, v0, 11
-; GFX90A-NEXT:    v_readlane_b32 s46, v0, 10
-; GFX90A-NEXT:    v_readlane_b32 s45, v0, 9
-; GFX90A-NEXT:    v_readlane_b32 s44, v0, 8
-; GFX90A-NEXT:    v_readlane_b32 s43, v0, 7
-; GFX90A-NEXT:    v_readlane_b32 s42, v0, 6
-; GFX90A-NEXT:    v_readlane_b32 s41, v0, 5
-; GFX90A-NEXT:    v_readlane_b32 s40, v0, 4
+; GFX90A-NEXT:    v_readlane_b32 s51, v0, 7
+; GFX90A-NEXT:    v_readlane_b32 s50, v0, 6
+; GFX90A-NEXT:    v_readlane_b32 s49, v0, 5
+; GFX90A-NEXT:    v_readlane_b32 s48, v0, 4
 ; GFX90A-NEXT:    v_readlane_b32 s39, v0, 3
 ; GFX90A-NEXT:    v_readlane_b32 s38, v0, 2
 ; GFX90A-NEXT:    v_readlane_b32 s37, v0, 1
@@ -25192,18 +24648,10 @@ define void @s_shuffle_v2i64_v8i64__7_10() {
 ; GFX900-NEXT:    v_writelane_b32 v0, s37, 1
 ; GFX900-NEXT:    v_writelane_b32 v0, s38, 2
 ; GFX900-NEXT:    v_writelane_b32 v0, s39, 3
-; GFX900-NEXT:    v_writelane_b32 v0, s40, 4
-; GFX900-NEXT:    v_writelane_b32 v0, s41, 5
-; GFX900-NEXT:    v_writelane_b32 v0, s42, 6
-; GFX900-NEXT:    v_writelane_b32 v0, s43, 7
-; GFX900-NEXT:    v_writelane_b32 v0, s44, 8
-; GFX900-NEXT:    v_writelane_b32 v0, s45, 9
-; GFX900-NEXT:    v_writelane_b32 v0, s46, 10
-; GFX900-NEXT:    v_writelane_b32 v0, s47, 11
-; GFX900-NEXT:    v_writelane_b32 v0, s48, 12
-; GFX900-NEXT:    v_writelane_b32 v0, s49, 13
-; GFX900-NEXT:    v_writelane_b32 v0, s50, 14
-; GFX900-NEXT:    v_writelane_b32 v0, s51, 15
+; GFX900-NEXT:    v_writelane_b32 v0, s48, 4
+; GFX900-NEXT:    v_writelane_b32 v0, s49, 5
+; GFX900-NEXT:    v_writelane_b32 v0, s50, 6
+; GFX900-NEXT:    v_writelane_b32 v0, s51, 7
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[4:19]
 ; GFX900-NEXT:    ;;#ASMEND
@@ -25217,18 +24665,10 @@ define void @s_shuffle_v2i64_v8i64__7_10() {
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:11]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    v_readlane_b32 s51, v0, 15
-; GFX900-NEXT:    v_readlane_b32 s50, v0, 14
-; GFX900-NEXT:    v_readlane_b32 s49, v0, 13
-; GFX900-NEXT:    v_readlane_b32 s48, v0, 12
-; GFX900-NEXT:    v_readlane_b32 s47, v0, 11
-; GFX900-NEXT:    v_readlane_b32 s46, v0, 10
-; GFX900-NEXT:    v_readlane_b32 s45, v0, 9
-; GFX900-NEXT:    v_readlane_b32 s44, v0, 8
-; GFX900-NEXT:    v_readlane_b32 s43, v0, 7
-; GFX900-NEXT:    v_readlane_b32 s42, v0, 6
-; GFX900-NEXT:    v_readlane_b32 s41, v0, 5
-; GFX900-NEXT:    v_readlane_b32 s40, v0, 4
+; GFX900-NEXT:    v_readlane_b32 s51, v0, 7
+; GFX900-NEXT:    v_readlane_b32 s50, v0, 6
+; GFX900-NEXT:    v_readlane_b32 s49, v0, 5
+; GFX900-NEXT:    v_readlane_b32 s48, v0, 4
 ; GFX900-NEXT:    v_readlane_b32 s39, v0, 3
 ; GFX900-NEXT:    v_readlane_b32 s38, v0, 2
 ; GFX900-NEXT:    v_readlane_b32 s37, v0, 1
@@ -25249,18 +24689,10 @@ define void @s_shuffle_v2i64_v8i64__7_10() {
 ; GFX90A-NEXT:    v_writelane_b32 v0, s37, 1
 ; GFX90A-NEXT:    v_writelane_b32 v0, s38, 2
 ; GFX90A-NEXT:    v_writelane_b32 v0, s39, 3
-; GFX90A-NEXT:    v_writelane_b32 v0, s40, 4
-; GFX90A-NEXT:    v_writelane_b32 v0, s41, 5
-; GFX90A-NEXT:    v_writelane_b32 v0, s42, 6
-; GFX90A-NEXT:    v_writelane_b32 v0, s43, 7
-; GFX90A-NEXT:    v_writelane_b32 v0, s44, 8
-; GFX90A-NEXT:    v_writelane_b32 v0, s45, 9
-; GFX90A-NEXT:    v_writelane_b32 v0, s46, 10
-; GFX90A-NEXT:    v_writelane_b32 v0, s47, 11
-; GFX90A-NEXT:    v_writelane_b32 v0, s48, 12
-; GFX90A-NEXT:    v_writelane_b32 v0, s49, 13
-; GFX90A-NEXT:    v_writelane_b32 v0, s50, 14
-; GFX90A-NEXT:    v_writelane_b32 v0, s51, 15
+; GFX90A-NEXT:    v_writelane_b32 v0, s48, 4
+; GFX90A-NEXT:    v_writelane_b32 v0, s49, 5
+; GFX90A-NEXT:    v_writelane_b32 v0, s50, 6
+; GFX90A-NEXT:    v_writelane_b32 v0, s51, 7
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[4:19]
 ; GFX90A-NEXT:    ;;#ASMEND
@@ -25274,18 +24706,10 @@ define void @s_shuffle_v2i64_v8i64__7_10() {
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:11]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    v_readlane_b32 s51, v0, 15
-; GFX90A-NEXT:    v_readlane_b32 s50, v0, 14
-; GFX90A-NEXT:    v_readlane_b32 s49, v0, 13
-; GFX90A-NEXT:    v_readlane_b32 s48, v0, 12
-; GFX90A-NEXT:    v_readlane_b32 s47, v0, 11
-; GFX90A-NEXT:    v_readlane_b32 s46, v0, 10
-; GFX90A-NEXT:    v_readlane_b32 s45, v0, 9
-; GFX90A-NEXT:    v_readlane_b32 s44, v0, 8
-; GFX90A-NEXT:    v_readlane_b32 s43, v0, 7
-; GFX90A-NEXT:    v_readlane_b32 s42, v0, 6
-; GFX90A-NEXT:    v_readlane_b32 s41, v0, 5
-; GFX90A-NEXT:    v_readlane_b32 s40, v0, 4
+; GFX90A-NEXT:    v_readlane_b32 s51, v0, 7
+; GFX90A-NEXT:    v_readlane_b32 s50, v0, 6
+; GFX90A-NEXT:    v_readlane_b32 s49, v0, 5
+; GFX90A-NEXT:    v_readlane_b32 s48, v0, 4
 ; GFX90A-NEXT:    v_readlane_b32 s39, v0, 3
 ; GFX90A-NEXT:    v_readlane_b32 s38, v0, 2
 ; GFX90A-NEXT:    v_readlane_b32 s37, v0, 1
@@ -25737,18 +25161,10 @@ define void @s_shuffle_v2i64_v8i64__1_11() {
 ; GFX900-NEXT:    v_writelane_b32 v0, s37, 1
 ; GFX900-NEXT:    v_writelane_b32 v0, s38, 2
 ; GFX900-NEXT:    v_writelane_b32 v0, s39, 3
-; GFX900-NEXT:    v_writelane_b32 v0, s40, 4
-; GFX900-NEXT:    v_writelane_b32 v0, s41, 5
-; GFX900-NEXT:    v_writelane_b32 v0, s42, 6
-; GFX900-NEXT:    v_writelane_b32 v0, s43, 7
-; GFX900-NEXT:    v_writelane_b32 v0, s44, 8
-; GFX900-NEXT:    v_writelane_b32 v0, s45, 9
-; GFX900-NEXT:    v_writelane_b32 v0, s46, 10
-; GFX900-NEXT:    v_writelane_b32 v0, s47, 11
-; GFX900-NEXT:    v_writelane_b32 v0, s48, 12
-; GFX900-NEXT:    v_writelane_b32 v0, s49, 13
-; GFX900-NEXT:    v_writelane_b32 v0, s50, 14
-; GFX900-NEXT:    v_writelane_b32 v0, s51, 15
+; GFX900-NEXT:    v_writelane_b32 v0, s48, 4
+; GFX900-NEXT:    v_writelane_b32 v0, s49, 5
+; GFX900-NEXT:    v_writelane_b32 v0, s50, 6
+; GFX900-NEXT:    v_writelane_b32 v0, s51, 7
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[36:51]
 ; GFX900-NEXT:    ;;#ASMEND
@@ -25760,18 +25176,10 @@ define void @s_shuffle_v2i64_v8i64__1_11() {
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:11]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    v_readlane_b32 s51, v0, 15
-; GFX900-NEXT:    v_readlane_b32 s50, v0, 14
-; GFX900-NEXT:    v_readlane_b32 s49, v0, 13
-; GFX900-NEXT:    v_readlane_b32 s48, v0, 12
-; GFX900-NEXT:    v_readlane_b32 s47, v0, 11
-; GFX900-NEXT:    v_readlane_b32 s46, v0, 10
-; GFX900-NEXT:    v_readlane_b32 s45, v0, 9
-; GFX900-NEXT:    v_readlane_b32 s44, v0, 8
-; GFX900-NEXT:    v_readlane_b32 s43, v0, 7
-; GFX900-NEXT:    v_readlane_b32 s42, v0, 6
-; GFX900-NEXT:    v_readlane_b32 s41, v0, 5
-; GFX900-NEXT:    v_readlane_b32 s40, v0, 4
+; GFX900-NEXT:    v_readlane_b32 s51, v0, 7
+; GFX900-NEXT:    v_readlane_b32 s50, v0, 6
+; GFX900-NEXT:    v_readlane_b32 s49, v0, 5
+; GFX900-NEXT:    v_readlane_b32 s48, v0, 4
 ; GFX900-NEXT:    v_readlane_b32 s39, v0, 3
 ; GFX900-NEXT:    v_readlane_b32 s38, v0, 2
 ; GFX900-NEXT:    v_readlane_b32 s37, v0, 1
@@ -25792,18 +25200,10 @@ define void @s_shuffle_v2i64_v8i64__1_11() {
 ; GFX90A-NEXT:    v_writelane_b32 v0, s37, 1
 ; GFX90A-NEXT:    v_writelane_b32 v0, s38, 2
 ; GFX90A-NEXT:    v_writelane_b32 v0, s39, 3
-; GFX90A-NEXT:    v_writelane_b32 v0, s40, 4
-; GFX90A-NEXT:    v_writelane_b32 v0, s41, 5
-; GFX90A-NEXT:    v_writelane_b32 v0, s42, 6
-; GFX90A-NEXT:    v_writelane_b32 v0, s43, 7
-; GFX90A-NEXT:    v_writelane_b32 v0, s44, 8
-; GFX90A-NEXT:    v_writelane_b32 v0, s45, 9
-; GFX90A-NEXT:    v_writelane_b32 v0, s46, 10
-; GFX90A-NEXT:    v_writelane_b32 v0, s47, 11
-; GFX90A-NEXT:    v_writelane_b32 v0, s48, 12
-; GFX90A-NEXT:    v_writelane_b32 v0, s49, 13
-; GFX90A-NEXT:    v_writelane_b32 v0, s50, 14
-; GFX90A-NEXT:    v_writelane_b32 v0, s51, 15
+; GFX90A-NEXT:    v_writelane_b32 v0, s48, 4
+; GFX90A-NEXT:    v_writelane_b32 v0, s49, 5
+; GFX90A-NEXT:    v_writelane_b32 v0, s50, 6
+; GFX90A-NEXT:    v_writelane_b32 v0, s51, 7
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[36:51]
 ; GFX90A-NEXT:    ;;#ASMEND
@@ -25815,18 +25215,10 @@ define void @s_shuffle_v2i64_v8i64__1_11() {
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:11]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    v_readlane_b32 s51, v0, 15
-; GFX90A-NEXT:    v_readlane_b32 s50, v0, 14
-; GFX90A-NEXT:    v_readlane_b32 s49, v0, 13
-; GFX90A-NEXT:    v_readlane_b32 s48, v0, 12
-; GFX90A-NEXT:    v_readlane_b32 s47, v0, 11
-; GFX90A-NEXT:    v_readlane_b32 s46, v0, 10
-; GFX90A-NEXT:    v_readlane_b32 s45, v0, 9
-; GFX90A-NEXT:    v_readlane_b32 s44, v0, 8
-; GFX90A-NEXT:    v_readlane_b32 s43, v0, 7
-; GFX90A-NEXT:    v_readlane_b32 s42, v0, 6
-; GFX90A-NEXT:    v_readlane_b32 s41, v0, 5
-; GFX90A-NEXT:    v_readlane_b32 s40, v0, 4
+; GFX90A-NEXT:    v_readlane_b32 s51, v0, 7
+; GFX90A-NEXT:    v_readlane_b32 s50, v0, 6
+; GFX90A-NEXT:    v_readlane_b32 s49, v0, 5
+; GFX90A-NEXT:    v_readlane_b32 s48, v0, 4
 ; GFX90A-NEXT:    v_readlane_b32 s39, v0, 3
 ; GFX90A-NEXT:    v_readlane_b32 s38, v0, 2
 ; GFX90A-NEXT:    v_readlane_b32 s37, v0, 1
@@ -26154,21 +25546,13 @@ define void @s_shuffle_v2i64_v8i64__6_11() {
 ; GFX900-NEXT:    v_writelane_b32 v0, s37, 1
 ; GFX900-NEXT:    v_writelane_b32 v0, s38, 2
 ; GFX900-NEXT:    v_writelane_b32 v0, s39, 3
-; GFX900-NEXT:    v_writelane_b32 v0, s40, 4
-; GFX900-NEXT:    v_writelane_b32 v0, s41, 5
-; GFX900-NEXT:    v_writelane_b32 v0, s42, 6
-; GFX900-NEXT:    v_writelane_b32 v0, s43, 7
-; GFX900-NEXT:    v_writelane_b32 v0, s44, 8
-; GFX900-NEXT:    v_writelane_b32 v0, s45, 9
-; GFX900-NEXT:    v_writelane_b32 v0, s46, 10
-; GFX900-NEXT:    v_writelane_b32 v0, s47, 11
-; GFX900-NEXT:    v_writelane_b32 v0, s48, 12
-; GFX900-NEXT:    v_writelane_b32 v0, s49, 13
-; GFX900-NEXT:    v_writelane_b32 v0, s50, 14
+; GFX900-NEXT:    v_writelane_b32 v0, s48, 4
+; GFX900-NEXT:    v_writelane_b32 v0, s49, 5
+; GFX900-NEXT:    v_writelane_b32 v0, s50, 6
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[4:19]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    v_writelane_b32 v0, s51, 15
+; GFX900-NEXT:    v_writelane_b32 v0, s51, 7
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[36:51]
 ; GFX900-NEXT:    ;;#ASMEND
@@ -26179,18 +25563,10 @@ define void @s_shuffle_v2i64_v8i64__6_11() {
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:11]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    v_readlane_b32 s51, v0, 15
-; GFX900-NEXT:    v_readlane_b32 s50, v0, 14
-; GFX900-NEXT:    v_readlane_b32 s49, v0, 13
-; GFX900-NEXT:    v_readlane_b32 s48, v0, 12
-; GFX900-NEXT:    v_readlane_b32 s47, v0, 11
-; GFX900-NEXT:    v_readlane_b32 s46, v0, 10
-; GFX900-NEXT:    v_readlane_b32 s45, v0, 9
-; GFX900-NEXT:    v_readlane_b32 s44, v0, 8
-; GFX900-NEXT:    v_readlane_b32 s43, v0, 7
-; GFX900-NEXT:    v_readlane_b32 s42, v0, 6
-; GFX900-NEXT:    v_readlane_b32 s41, v0, 5
-; GFX900-NEXT:    v_readlane_b32 s40, v0, 4
+; GFX900-NEXT:    v_readlane_b32 s51, v0, 7
+; GFX900-NEXT:    v_readlane_b32 s50, v0, 6
+; GFX900-NEXT:    v_readlane_b32 s49, v0, 5
+; GFX900-NEXT:    v_readlane_b32 s48, v0, 4
 ; GFX900-NEXT:    v_readlane_b32 s39, v0, 3
 ; GFX900-NEXT:    v_readlane_b32 s38, v0, 2
 ; GFX900-NEXT:    v_readlane_b32 s37, v0, 1
@@ -26211,21 +25587,13 @@ define void @s_shuffle_v2i64_v8i64__6_11() {
 ; GFX90A-NEXT:    v_writelane_b32 v0, s37, 1
 ; GFX90A-NEXT:    v_writelane_b32 v0, s38, 2
 ; GFX90A-NEXT:    v_writelane_b32 v0, s39, 3
-; GFX90A-NEXT:    v_writelane_b32 v0, s40, 4
-; GFX90A-NEXT:    v_writelane_b32 v0, s41, 5
-; GFX90A-NEXT:    v_writelane_b32 v0, s42, 6
-; GFX90A-NEXT:    v_writelane_b32 v0, s43, 7
-; GFX90A-NEXT:    v_writelane_b32 v0, s44, 8
-; GFX90A-NEXT:    v_writelane_b32 v0, s45, 9
-; GFX90A-NEXT:    v_writelane_b32 v0, s46, 10
-; GFX90A-NEXT:    v_writelane_b32 v0, s47, 11
-; GFX90A-NEXT:    v_writelane_b32 v0, s48, 12
-; GFX90A-NEXT:    v_writelane_b32 v0, s49, 13
-; GFX90A-NEXT:    v_writelane_b32 v0, s50, 14
+; GFX90A-NEXT:    v_writelane_b32 v0, s48, 4
+; GFX90A-NEXT:    v_writelane_b32 v0, s49, 5
+; GFX90A-NEXT:    v_writelane_b32 v0, s50, 6
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[4:19]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    v_writelane_b32 v0, s51, 15
+; GFX90A-NEXT:    v_writelane_b32 v0, s51, 7
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[36:51]
 ; GFX90A-NEXT:    ;;#ASMEND
@@ -26236,18 +25604,10 @@ define void @s_shuffle_v2i64_v8i64__6_11() {
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:11]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    v_readlane_b32 s51, v0, 15
-; GFX90A-NEXT:    v_readlane_b32 s50, v0, 14
-; GFX90A-NEXT:    v_readlane_b32 s49, v0, 13
-; GFX90A-NEXT:    v_readlane_b32 s48, v0, 12
-; GFX90A-NEXT:    v_readlane_b32 s47, v0, 11
-; GFX90A-NEXT:    v_readlane_b32 s46, v0, 10
-; GFX90A-NEXT:    v_readlane_b32 s45, v0, 9
-; GFX90A-NEXT:    v_readlane_b32 s44, v0, 8
-; GFX90A-NEXT:    v_readlane_b32 s43, v0, 7
-; GFX90A-NEXT:    v_readlane_b32 s42, v0, 6
-; GFX90A-NEXT:    v_readlane_b32 s41, v0, 5
-; GFX90A-NEXT:    v_readlane_b32 s40, v0, 4
+; GFX90A-NEXT:    v_readlane_b32 s51, v0, 7
+; GFX90A-NEXT:    v_readlane_b32 s50, v0, 6
+; GFX90A-NEXT:    v_readlane_b32 s49, v0, 5
+; GFX90A-NEXT:    v_readlane_b32 s48, v0, 4
 ; GFX90A-NEXT:    v_readlane_b32 s39, v0, 3
 ; GFX90A-NEXT:    v_readlane_b32 s38, v0, 2
 ; GFX90A-NEXT:    v_readlane_b32 s37, v0, 1
@@ -27030,21 +26390,13 @@ define void @s_shuffle_v2i64_v8i64__6_12() {
 ; GFX900-NEXT:    v_writelane_b32 v0, s37, 1
 ; GFX900-NEXT:    v_writelane_b32 v0, s38, 2
 ; GFX900-NEXT:    v_writelane_b32 v0, s39, 3
-; GFX900-NEXT:    v_writelane_b32 v0, s40, 4
-; GFX900-NEXT:    v_writelane_b32 v0, s41, 5
-; GFX900-NEXT:    v_writelane_b32 v0, s42, 6
-; GFX900-NEXT:    v_writelane_b32 v0, s43, 7
-; GFX900-NEXT:    v_writelane_b32 v0, s44, 8
-; GFX900-NEXT:    v_writelane_b32 v0, s45, 9
-; GFX900-NEXT:    v_writelane_b32 v0, s46, 10
-; GFX900-NEXT:    v_writelane_b32 v0, s47, 11
-; GFX900-NEXT:    v_writelane_b32 v0, s48, 12
-; GFX900-NEXT:    v_writelane_b32 v0, s49, 13
-; GFX900-NEXT:    v_writelane_b32 v0, s50, 14
+; GFX900-NEXT:    v_writelane_b32 v0, s48, 4
+; GFX900-NEXT:    v_writelane_b32 v0, s49, 5
+; GFX900-NEXT:    v_writelane_b32 v0, s50, 6
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[4:19]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    v_writelane_b32 v0, s51, 15
+; GFX900-NEXT:    v_writelane_b32 v0, s51, 7
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[36:51]
 ; GFX900-NEXT:    ;;#ASMEND
@@ -27055,18 +26407,10 @@ define void @s_shuffle_v2i64_v8i64__6_12() {
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:11]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    v_readlane_b32 s51, v0, 15
-; GFX900-NEXT:    v_readlane_b32 s50, v0, 14
-; GFX900-NEXT:    v_readlane_b32 s49, v0, 13
-; GFX900-NEXT:    v_readlane_b32 s48, v0, 12
-; GFX900-NEXT:    v_readlane_b32 s47, v0, 11
-; GFX900-NEXT:    v_readlane_b32 s46, v0, 10
-; GFX900-NEXT:    v_readlane_b32 s45, v0, 9
-; GFX900-NEXT:    v_readlane_b32 s44, v0, 8
-; GFX900-NEXT:    v_readlane_b32 s43, v0, 7
-; GFX900-NEXT:    v_readlane_b32 s42, v0, 6
-; GFX900-NEXT:    v_readlane_b32 s41, v0, 5
-; GFX900-NEXT:    v_readlane_b32 s40, v0, 4
+; GFX900-NEXT:    v_readlane_b32 s51, v0, 7
+; GFX900-NEXT:    v_readlane_b32 s50, v0, 6
+; GFX900-NEXT:    v_readlane_b32 s49, v0, 5
+; GFX900-NEXT:    v_readlane_b32 s48, v0, 4
 ; GFX900-NEXT:    v_readlane_b32 s39, v0, 3
 ; GFX900-NEXT:    v_readlane_b32 s38, v0, 2
 ; GFX900-NEXT:    v_readlane_b32 s37, v0, 1
@@ -27087,21 +26431,13 @@ define void @s_shuffle_v2i64_v8i64__6_12() {
 ; GFX90A-NEXT:    v_writelane_b32 v0, s37, 1
 ; GFX90A-NEXT:    v_writelane_b32 v0, s38, 2
 ; GFX90A-NEXT:    v_writelane_b32 v0, s39, 3
-; GFX90A-NEXT:    v_writelane_b32 v0, s40, 4
-; GFX90A-NEXT:    v_writelane_b32 v0, s41, 5
-; GFX90A-NEXT:    v_writelane_b32 v0, s42, 6
-; GFX90A-NEXT:    v_writelane_b32 v0, s43, 7
-; GFX90A-NEXT:    v_writelane_b32 v0, s44, 8
-; GFX90A-NEXT:    v_writelane_b32 v0, s45, 9
-; GFX90A-NEXT:    v_writelane_b32 v0, s46, 10
-; GFX90A-NEXT:    v_writelane_b32 v0, s47, 11
-; GFX90A-NEXT:    v_writelane_b32 v0, s48, 12
-; GFX90A-NEXT:    v_writelane_b32 v0, s49, 13
-; GFX90A-NEXT:    v_writelane_b32 v0, s50, 14
+; GFX90A-NEXT:    v_writelane_b32 v0, s48, 4
+; GFX90A-NEXT:    v_writelane_b32 v0, s49, 5
+; GFX90A-NEXT:    v_writelane_b32 v0, s50, 6
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[4:19]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    v_writelane_b32 v0, s51, 15
+; GFX90A-NEXT:    v_writelane_b32 v0, s51, 7
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[36:51]
 ; GFX90A-NEXT:    ;;#ASMEND
@@ -27112,18 +26448,10 @@ define void @s_shuffle_v2i64_v8i64__6_12() {
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:11]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    v_readlane_b32 s51, v0, 15
-; GFX90A-NEXT:    v_readlane_b32 s50, v0, 14
-; GFX90A-NEXT:    v_readlane_b32 s49, v0, 13
-; GFX90A-NEXT:    v_readlane_b32 s48, v0, 12
-; GFX90A-NEXT:    v_readlane_b32 s47, v0, 11
-; GFX90A-NEXT:    v_readlane_b32 s46, v0, 10
-; GFX90A-NEXT:    v_readlane_b32 s45, v0, 9
-; GFX90A-NEXT:    v_readlane_b32 s44, v0, 8
-; GFX90A-NEXT:    v_readlane_b32 s43, v0, 7
-; GFX90A-NEXT:    v_readlane_b32 s42, v0, 6
-; GFX90A-NEXT:    v_readlane_b32 s41, v0, 5
-; GFX90A-NEXT:    v_readlane_b32 s40, v0, 4
+; GFX90A-NEXT:    v_readlane_b32 s51, v0, 7
+; GFX90A-NEXT:    v_readlane_b32 s50, v0, 6
+; GFX90A-NEXT:    v_readlane_b32 s49, v0, 5
+; GFX90A-NEXT:    v_readlane_b32 s48, v0, 4
 ; GFX90A-NEXT:    v_readlane_b32 s39, v0, 3
 ; GFX90A-NEXT:    v_readlane_b32 s38, v0, 2
 ; GFX90A-NEXT:    v_readlane_b32 s37, v0, 1
@@ -27180,18 +26508,10 @@ define void @s_shuffle_v2i64_v8i64__7_12() {
 ; GFX900-NEXT:    v_writelane_b32 v0, s37, 1
 ; GFX900-NEXT:    v_writelane_b32 v0, s38, 2
 ; GFX900-NEXT:    v_writelane_b32 v0, s39, 3
-; GFX900-NEXT:    v_writelane_b32 v0, s40, 4
-; GFX900-NEXT:    v_writelane_b32 v0, s41, 5
-; GFX900-NEXT:    v_writelane_b32 v0, s42, 6
-; GFX900-NEXT:    v_writelane_b32 v0, s43, 7
-; GFX900-NEXT:    v_writelane_b32 v0, s44, 8
-; GFX900-NEXT:    v_writelane_b32 v0, s45, 9
-; GFX900-NEXT:    v_writelane_b32 v0, s46, 10
-; GFX900-NEXT:    v_writelane_b32 v0, s47, 11
-; GFX900-NEXT:    v_writelane_b32 v0, s48, 12
-; GFX900-NEXT:    v_writelane_b32 v0, s49, 13
-; GFX900-NEXT:    v_writelane_b32 v0, s50, 14
-; GFX900-NEXT:    v_writelane_b32 v0, s51, 15
+; GFX900-NEXT:    v_writelane_b32 v0, s48, 4
+; GFX900-NEXT:    v_writelane_b32 v0, s49, 5
+; GFX900-NEXT:    v_writelane_b32 v0, s50, 6
+; GFX900-NEXT:    v_writelane_b32 v0, s51, 7
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[4:19]
 ; GFX900-NEXT:    ;;#ASMEND
@@ -27205,18 +26525,10 @@ define void @s_shuffle_v2i64_v8i64__7_12() {
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:11]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    v_readlane_b32 s51, v0, 15
-; GFX900-NEXT:    v_readlane_b32 s50, v0, 14
-; GFX900-NEXT:    v_readlane_b32 s49, v0, 13
-; GFX900-NEXT:    v_readlane_b32 s48, v0, 12
-; GFX900-NEXT:    v_readlane_b32 s47, v0, 11
-; GFX900-NEXT:    v_readlane_b32 s46, v0, 10
-; GFX900-NEXT:    v_readlane_b32 s45, v0, 9
-; GFX900-NEXT:    v_readlane_b32 s44, v0, 8
-; GFX900-NEXT:    v_readlane_b32 s43, v0, 7
-; GFX900-NEXT:    v_readlane_b32 s42, v0, 6
-; GFX900-NEXT:    v_readlane_b32 s41, v0, 5
-; GFX900-NEXT:    v_readlane_b32 s40, v0, 4
+; GFX900-NEXT:    v_readlane_b32 s51, v0, 7
+; GFX900-NEXT:    v_readlane_b32 s50, v0, 6
+; GFX900-NEXT:    v_readlane_b32 s49, v0, 5
+; GFX900-NEXT:    v_readlane_b32 s48, v0, 4
 ; GFX900-NEXT:    v_readlane_b32 s39, v0, 3
 ; GFX900-NEXT:    v_readlane_b32 s38, v0, 2
 ; GFX900-NEXT:    v_readlane_b32 s37, v0, 1
@@ -27237,18 +26549,10 @@ define void @s_shuffle_v2i64_v8i64__7_12() {
 ; GFX90A-NEXT:    v_writelane_b32 v0, s37, 1
 ; GFX90A-NEXT:    v_writelane_b32 v0, s38, 2
 ; GFX90A-NEXT:    v_writelane_b32 v0, s39, 3
-; GFX90A-NEXT:    v_writelane_b32 v0, s40, 4
-; GFX90A-NEXT:    v_writelane_b32 v0, s41, 5
-; GFX90A-NEXT:    v_writelane_b32 v0, s42, 6
-; GFX90A-NEXT:    v_writelane_b32 v0, s43, 7
-; GFX90A-NEXT:    v_writelane_b32 v0, s44, 8
-; GFX90A-NEXT:    v_writelane_b32 v0, s45, 9
-; GFX90A-NEXT:    v_writelane_b32 v0, s46, 10
-; GFX90A-NEXT:    v_writelane_b32 v0, s47, 11
-; GFX90A-NEXT:    v_writelane_b32 v0, s48, 12
-; GFX90A-NEXT:    v_writelane_b32 v0, s49, 13
-; GFX90A-NEXT:    v_writelane_b32 v0, s50, 14
-; GFX90A-NEXT:    v_writelane_b32 v0, s51, 15
+; GFX90A-NEXT:    v_writelane_b32 v0, s48, 4
+; GFX90A-NEXT:    v_writelane_b32 v0, s49, 5
+; GFX90A-NEXT:    v_writelane_b32 v0, s50, 6
+; GFX90A-NEXT:    v_writelane_b32 v0, s51, 7
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[4:19]
 ; GFX90A-NEXT:    ;;#ASMEND
@@ -27262,18 +26566,10 @@ define void @s_shuffle_v2i64_v8i64__7_12() {
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:11]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    v_readlane_b32 s51, v0, 15
-; GFX90A-NEXT:    v_readlane_b32 s50, v0, 14
-; GFX90A-NEXT:    v_readlane_b32 s49, v0, 13
-; GFX90A-NEXT:    v_readlane_b32 s48, v0, 12
-; GFX90A-NEXT:    v_readlane_b32 s47, v0, 11
-; GFX90A-NEXT:    v_readlane_b32 s46, v0, 10
-; GFX90A-NEXT:    v_readlane_b32 s45, v0, 9
-; GFX90A-NEXT:    v_readlane_b32 s44, v0, 8
-; GFX90A-NEXT:    v_readlane_b32 s43, v0, 7
-; GFX90A-NEXT:    v_readlane_b32 s42, v0, 6
-; GFX90A-NEXT:    v_readlane_b32 s41, v0, 5
-; GFX90A-NEXT:    v_readlane_b32 s40, v0, 4
+; GFX90A-NEXT:    v_readlane_b32 s51, v0, 7
+; GFX90A-NEXT:    v_readlane_b32 s50, v0, 6
+; GFX90A-NEXT:    v_readlane_b32 s49, v0, 5
+; GFX90A-NEXT:    v_readlane_b32 s48, v0, 4
 ; GFX90A-NEXT:    v_readlane_b32 s39, v0, 3
 ; GFX90A-NEXT:    v_readlane_b32 s38, v0, 2
 ; GFX90A-NEXT:    v_readlane_b32 s37, v0, 1
@@ -27989,21 +27285,13 @@ define void @s_shuffle_v2i64_v8i64__6_13() {
 ; GFX900-NEXT:    v_writelane_b32 v0, s37, 1
 ; GFX900-NEXT:    v_writelane_b32 v0, s38, 2
 ; GFX900-NEXT:    v_writelane_b32 v0, s39, 3
-; GFX900-NEXT:    v_writelane_b32 v0, s40, 4
-; GFX900-NEXT:    v_writelane_b32 v0, s41, 5
-; GFX900-NEXT:    v_writelane_b32 v0, s42, 6
-; GFX900-NEXT:    v_writelane_b32 v0, s43, 7
-; GFX900-NEXT:    v_writelane_b32 v0, s44, 8
-; GFX900-NEXT:    v_writelane_b32 v0, s45, 9
-; GFX900-NEXT:    v_writelane_b32 v0, s46, 10
-; GFX900-NEXT:    v_writelane_b32 v0, s47, 11
-; GFX900-NEXT:    v_writelane_b32 v0, s48, 12
-; GFX900-NEXT:    v_writelane_b32 v0, s49, 13
-; GFX900-NEXT:    v_writelane_b32 v0, s50, 14
+; GFX900-NEXT:    v_writelane_b32 v0, s48, 4
+; GFX900-NEXT:    v_writelane_b32 v0, s49, 5
+; GFX900-NEXT:    v_writelane_b32 v0, s50, 6
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[4:19]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    v_writelane_b32 v0, s51, 15
+; GFX900-NEXT:    v_writelane_b32 v0, s51, 7
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[36:51]
 ; GFX900-NEXT:    ;;#ASMEND
@@ -28014,18 +27302,10 @@ define void @s_shuffle_v2i64_v8i64__6_13() {
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:11]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    v_readlane_b32 s51, v0, 15
-; GFX900-NEXT:    v_readlane_b32 s50, v0, 14
-; GFX900-NEXT:    v_readlane_b32 s49, v0, 13
-; GFX900-NEXT:    v_readlane_b32 s48, v0, 12
-; GFX900-NEXT:    v_readlane_b32 s47, v0, 11
-; GFX900-NEXT:    v_readlane_b32 s46, v0, 10
-; GFX900-NEXT:    v_readlane_b32 s45, v0, 9
-; GFX900-NEXT:    v_readlane_b32 s44, v0, 8
-; GFX900-NEXT:    v_readlane_b32 s43, v0, 7
-; GFX900-NEXT:    v_readlane_b32 s42, v0, 6
-; GFX900-NEXT:    v_readlane_b32 s41, v0, 5
-; GFX900-NEXT:    v_readlane_b32 s40, v0, 4
+; GFX900-NEXT:    v_readlane_b32 s51, v0, 7
+; GFX900-NEXT:    v_readlane_b32 s50, v0, 6
+; GFX900-NEXT:    v_readlane_b32 s49, v0, 5
+; GFX900-NEXT:    v_readlane_b32 s48, v0, 4
 ; GFX900-NEXT:    v_readlane_b32 s39, v0, 3
 ; GFX900-NEXT:    v_readlane_b32 s38, v0, 2
 ; GFX900-NEXT:    v_readlane_b32 s37, v0, 1
@@ -28046,21 +27326,13 @@ define void @s_shuffle_v2i64_v8i64__6_13() {
 ; GFX90A-NEXT:    v_writelane_b32 v0, s37, 1
 ; GFX90A-NEXT:    v_writelane_b32 v0, s38, 2
 ; GFX90A-NEXT:    v_writelane_b32 v0, s39, 3
-; GFX90A-NEXT:    v_writelane_b32 v0, s40, 4
-; GFX90A-NEXT:    v_writelane_b32 v0, s41, 5
-; GFX90A-NEXT:    v_writelane_b32 v0, s42, 6
-; GFX90A-NEXT:    v_writelane_b32 v0, s43, 7
-; GFX90A-NEXT:    v_writelane_b32 v0, s44, 8
-; GFX90A-NEXT:    v_writelane_b32 v0, s45, 9
-; GFX90A-NEXT:    v_writelane_b32 v0, s46, 10
-; GFX90A-NEXT:    v_writelane_b32 v0, s47, 11
-; GFX90A-NEXT:    v_writelane_b32 v0, s48, 12
-; GFX90A-NEXT:    v_writelane_b32 v0, s49, 13
-; GFX90A-NEXT:    v_writelane_b32 v0, s50, 14
+; GFX90A-NEXT:    v_writelane_b32 v0, s48, 4
+; GFX90A-NEXT:    v_writelane_b32 v0, s49, 5
+; GFX90A-NEXT:    v_writelane_b32 v0, s50, 6
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[4:19]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    v_writelane_b32 v0, s51, 15
+; GFX90A-NEXT:    v_writelane_b32 v0, s51, 7
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[36:51]
 ; GFX90A-NEXT:    ;;#ASMEND
@@ -28071,18 +27343,10 @@ define void @s_shuffle_v2i64_v8i64__6_13() {
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:11]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    v_readlane_b32 s51, v0, 15
-; GFX90A-NEXT:    v_readlane_b32 s50, v0, 14
-; GFX90A-NEXT:    v_readlane_b32 s49, v0, 13
-; GFX90A-NEXT:    v_readlane_b32 s48, v0, 12
-; GFX90A-NEXT:    v_readlane_b32 s47, v0, 11
-; GFX90A-NEXT:    v_readlane_b32 s46, v0, 10
-; GFX90A-NEXT:    v_readlane_b32 s45, v0, 9
-; GFX90A-NEXT:    v_readlane_b32 s44, v0, 8
-; GFX90A-NEXT:    v_readlane_b32 s43, v0, 7
-; GFX90A-NEXT:    v_readlane_b32 s42, v0, 6
-; GFX90A-NEXT:    v_readlane_b32 s41, v0, 5
-; GFX90A-NEXT:    v_readlane_b32 s40, v0, 4
+; GFX90A-NEXT:    v_readlane_b32 s51, v0, 7
+; GFX90A-NEXT:    v_readlane_b32 s50, v0, 6
+; GFX90A-NEXT:    v_readlane_b32 s49, v0, 5
+; GFX90A-NEXT:    v_readlane_b32 s48, v0, 4
 ; GFX90A-NEXT:    v_readlane_b32 s39, v0, 3
 ; GFX90A-NEXT:    v_readlane_b32 s38, v0, 2
 ; GFX90A-NEXT:    v_readlane_b32 s37, v0, 1
@@ -28139,18 +27403,10 @@ define void @s_shuffle_v2i64_v8i64__7_13() {
 ; GFX900-NEXT:    v_writelane_b32 v0, s37, 1
 ; GFX900-NEXT:    v_writelane_b32 v0, s38, 2
 ; GFX900-NEXT:    v_writelane_b32 v0, s39, 3
-; GFX900-NEXT:    v_writelane_b32 v0, s40, 4
-; GFX900-NEXT:    v_writelane_b32 v0, s41, 5
-; GFX900-NEXT:    v_writelane_b32 v0, s42, 6
-; GFX900-NEXT:    v_writelane_b32 v0, s43, 7
-; GFX900-NEXT:    v_writelane_b32 v0, s44, 8
-; GFX900-NEXT:    v_writelane_b32 v0, s45, 9
-; GFX900-NEXT:    v_writelane_b32 v0, s46, 10
-; GFX900-NEXT:    v_writelane_b32 v0, s47, 11
-; GFX900-NEXT:    v_writelane_b32 v0, s48, 12
-; GFX900-NEXT:    v_writelane_b32 v0, s49, 13
-; GFX900-NEXT:    v_writelane_b32 v0, s50, 14
-; GFX900-NEXT:    v_writelane_b32 v0, s51, 15
+; GFX900-NEXT:    v_writelane_b32 v0, s48, 4
+; GFX900-NEXT:    v_writelane_b32 v0, s49, 5
+; GFX900-NEXT:    v_writelane_b32 v0, s50, 6
+; GFX900-NEXT:    v_writelane_b32 v0, s51, 7
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[36:51]
 ; GFX900-NEXT:    ;;#ASMEND
@@ -28164,18 +27420,10 @@ define void @s_shuffle_v2i64_v8i64__7_13() {
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:11]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    v_readlane_b32 s51, v0, 15
-; GFX900-NEXT:    v_readlane_b32 s50, v0, 14
-; GFX900-NEXT:    v_readlane_b32 s49, v0, 13
-; GFX900-NEXT:    v_readlane_b32 s48, v0, 12
-; GFX900-NEXT:    v_readlane_b32 s47, v0, 11
-; GFX900-NEXT:    v_readlane_b32 s46, v0, 10
-; GFX900-NEXT:    v_readlane_b32 s45, v0, 9
-; GFX900-NEXT:    v_readlane_b32 s44, v0, 8
-; GFX900-NEXT:    v_readlane_b32 s43, v0, 7
-; GFX900-NEXT:    v_readlane_b32 s42, v0, 6
-; GFX900-NEXT:    v_readlane_b32 s41, v0, 5
-; GFX900-NEXT:    v_readlane_b32 s40, v0, 4
+; GFX900-NEXT:    v_readlane_b32 s51, v0, 7
+; GFX900-NEXT:    v_readlane_b32 s50, v0, 6
+; GFX900-NEXT:    v_readlane_b32 s49, v0, 5
+; GFX900-NEXT:    v_readlane_b32 s48, v0, 4
 ; GFX900-NEXT:    v_readlane_b32 s39, v0, 3
 ; GFX900-NEXT:    v_readlane_b32 s38, v0, 2
 ; GFX900-NEXT:    v_readlane_b32 s37, v0, 1
@@ -28196,18 +27444,10 @@ define void @s_shuffle_v2i64_v8i64__7_13() {
 ; GFX90A-NEXT:    v_writelane_b32 v0, s37, 1
 ; GFX90A-NEXT:    v_writelane_b32 v0, s38, 2
 ; GFX90A-NEXT:    v_writelane_b32 v0, s39, 3
-; GFX90A-NEXT:    v_writelane_b32 v0, s40, 4
-; GFX90A-NEXT:    v_writelane_b32 v0, s41, 5
-; GFX90A-NEXT:    v_writelane_b32 v0, s42, 6
-; GFX90A-NEXT:    v_writelane_b32 v0, s43, 7
-; GFX90A-NEXT:    v_writelane_b32 v0, s44, 8
-; GFX90A-NEXT:    v_writelane_b32 v0, s45, 9
-; GFX90A-NEXT:    v_writelane_b32 v0, s46, 10
-; GFX90A-NEXT:    v_writelane_b32 v0, s47, 11
-; GFX90A-NEXT:    v_writelane_b32 v0, s48, 12
-; GFX90A-NEXT:    v_writelane_b32 v0, s49, 13
-; GFX90A-NEXT:    v_writelane_b32 v0, s50, 14
-; GFX90A-NEXT:    v_writelane_b32 v0, s51, 15
+; GFX90A-NEXT:    v_writelane_b32 v0, s48, 4
+; GFX90A-NEXT:    v_writelane_b32 v0, s49, 5
+; GFX90A-NEXT:    v_writelane_b32 v0, s50, 6
+; GFX90A-NEXT:    v_writelane_b32 v0, s51, 7
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[36:51]
 ; GFX90A-NEXT:    ;;#ASMEND
@@ -28221,18 +27461,10 @@ define void @s_shuffle_v2i64_v8i64__7_13() {
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:11]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    v_readlane_b32 s51, v0, 15
-; GFX90A-NEXT:    v_readlane_b32 s50, v0, 14
-; GFX90A-NEXT:    v_readlane_b32 s49, v0, 13
-; GFX90A-NEXT:    v_readlane_b32 s48, v0, 12
-; GFX90A-NEXT:    v_readlane_b32 s47, v0, 11
-; GFX90A-NEXT:    v_readlane_b32 s46, v0, 10
-; GFX90A-NEXT:    v_readlane_b32 s45, v0, 9
-; GFX90A-NEXT:    v_readlane_b32 s44, v0, 8
-; GFX90A-NEXT:    v_readlane_b32 s43, v0, 7
-; GFX90A-NEXT:    v_readlane_b32 s42, v0, 6
-; GFX90A-NEXT:    v_readlane_b32 s41, v0, 5
-; GFX90A-NEXT:    v_readlane_b32 s40, v0, 4
+; GFX90A-NEXT:    v_readlane_b32 s51, v0, 7
+; GFX90A-NEXT:    v_readlane_b32 s50, v0, 6
+; GFX90A-NEXT:    v_readlane_b32 s49, v0, 5
+; GFX90A-NEXT:    v_readlane_b32 s48, v0, 4
 ; GFX90A-NEXT:    v_readlane_b32 s39, v0, 3
 ; GFX90A-NEXT:    v_readlane_b32 s38, v0, 2
 ; GFX90A-NEXT:    v_readlane_b32 s37, v0, 1
@@ -29011,21 +28243,13 @@ define void @s_shuffle_v2i64_v8i64__6_14() {
 ; GFX900-NEXT:    v_writelane_b32 v0, s37, 1
 ; GFX900-NEXT:    v_writelane_b32 v0, s38, 2
 ; GFX900-NEXT:    v_writelane_b32 v0, s39, 3
-; GFX900-NEXT:    v_writelane_b32 v0, s40, 4
-; GFX900-NEXT:    v_writelane_b32 v0, s41, 5
-; GFX900-NEXT:    v_writelane_b32 v0, s42, 6
-; GFX900-NEXT:    v_writelane_b32 v0, s43, 7
-; GFX900-NEXT:    v_writelane_b32 v0, s44, 8
-; GFX900-NEXT:    v_writelane_b32 v0, s45, 9
-; GFX900-NEXT:    v_writelane_b32 v0, s46, 10
-; GFX900-NEXT:    v_writelane_b32 v0, s47, 11
-; GFX900-NEXT:    v_writelane_b32 v0, s48, 12
-; GFX900-NEXT:    v_writelane_b32 v0, s49, 13
-; GFX900-NEXT:    v_writelane_b32 v0, s50, 14
+; GFX900-NEXT:    v_writelane_b32 v0, s48, 4
+; GFX900-NEXT:    v_writelane_b32 v0, s49, 5
+; GFX900-NEXT:    v_writelane_b32 v0, s50, 6
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[4:19]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    v_writelane_b32 v0, s51, 15
+; GFX900-NEXT:    v_writelane_b32 v0, s51, 7
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[36:51]
 ; GFX900-NEXT:    ;;#ASMEND
@@ -29036,18 +28260,10 @@ define void @s_shuffle_v2i64_v8i64__6_14() {
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:11]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    v_readlane_b32 s51, v0, 15
-; GFX900-NEXT:    v_readlane_b32 s50, v0, 14
-; GFX900-NEXT:    v_readlane_b32 s49, v0, 13
-; GFX900-NEXT:    v_readlane_b32 s48, v0, 12
-; GFX900-NEXT:    v_readlane_b32 s47, v0, 11
-; GFX900-NEXT:    v_readlane_b32 s46, v0, 10
-; GFX900-NEXT:    v_readlane_b32 s45, v0, 9
-; GFX900-NEXT:    v_readlane_b32 s44, v0, 8
-; GFX900-NEXT:    v_readlane_b32 s43, v0, 7
-; GFX900-NEXT:    v_readlane_b32 s42, v0, 6
-; GFX900-NEXT:    v_readlane_b32 s41, v0, 5
-; GFX900-NEXT:    v_readlane_b32 s40, v0, 4
+; GFX900-NEXT:    v_readlane_b32 s51, v0, 7
+; GFX900-NEXT:    v_readlane_b32 s50, v0, 6
+; GFX900-NEXT:    v_readlane_b32 s49, v0, 5
+; GFX900-NEXT:    v_readlane_b32 s48, v0, 4
 ; GFX900-NEXT:    v_readlane_b32 s39, v0, 3
 ; GFX900-NEXT:    v_readlane_b32 s38, v0, 2
 ; GFX900-NEXT:    v_readlane_b32 s37, v0, 1
@@ -29068,21 +28284,13 @@ define void @s_shuffle_v2i64_v8i64__6_14() {
 ; GFX90A-NEXT:    v_writelane_b32 v0, s37, 1
 ; GFX90A-NEXT:    v_writelane_b32 v0, s38, 2
 ; GFX90A-NEXT:    v_writelane_b32 v0, s39, 3
-; GFX90A-NEXT:    v_writelane_b32 v0, s40, 4
-; GFX90A-NEXT:    v_writelane_b32 v0, s41, 5
-; GFX90A-NEXT:    v_writelane_b32 v0, s42, 6
-; GFX90A-NEXT:    v_writelane_b32 v0, s43, 7
-; GFX90A-NEXT:    v_writelane_b32 v0, s44, 8
-; GFX90A-NEXT:    v_writelane_b32 v0, s45, 9
-; GFX90A-NEXT:    v_writelane_b32 v0, s46, 10
-; GFX90A-NEXT:    v_writelane_b32 v0, s47, 11
-; GFX90A-NEXT:    v_writelane_b32 v0, s48, 12
-; GFX90A-NEXT:    v_writelane_b32 v0, s49, 13
-; GFX90A-NEXT:    v_writelane_b32 v0, s50, 14
+; GFX90A-NEXT:    v_writelane_b32 v0, s48, 4
+; GFX90A-NEXT:    v_writelane_b32 v0, s49, 5
+; GFX90A-NEXT:    v_writelane_b32 v0, s50, 6
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[4:19]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    v_writelane_b32 v0, s51, 15
+; GFX90A-NEXT:    v_writelane_b32 v0, s51, 7
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[36:51]
 ; GFX90A-NEXT:    ;;#ASMEND
@@ -29093,18 +28301,10 @@ define void @s_shuffle_v2i64_v8i64__6_14() {
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:11]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    v_readlane_b32 s51, v0, 15
-; GFX90A-NEXT:    v_readlane_b32 s50, v0, 14
-; GFX90A-NEXT:    v_readlane_b32 s49, v0, 13
-; GFX90A-NEXT:    v_readlane_b32 s48, v0, 12
-; GFX90A-NEXT:    v_readlane_b32 s47, v0, 11
-; GFX90A-NEXT:    v_readlane_b32 s46, v0, 10
-; GFX90A-NEXT:    v_readlane_b32 s45, v0, 9
-; GFX90A-NEXT:    v_readlane_b32 s44, v0, 8
-; GFX90A-NEXT:    v_readlane_b32 s43, v0, 7
-; GFX90A-NEXT:    v_readlane_b32 s42, v0, 6
-; GFX90A-NEXT:    v_readlane_b32 s41, v0, 5
-; GFX90A-NEXT:    v_readlane_b32 s40, v0, 4
+; GFX90A-NEXT:    v_readlane_b32 s51, v0, 7
+; GFX90A-NEXT:    v_readlane_b32 s50, v0, 6
+; GFX90A-NEXT:    v_readlane_b32 s49, v0, 5
+; GFX90A-NEXT:    v_readlane_b32 s48, v0, 4
 ; GFX90A-NEXT:    v_readlane_b32 s39, v0, 3
 ; GFX90A-NEXT:    v_readlane_b32 s38, v0, 2
 ; GFX90A-NEXT:    v_readlane_b32 s37, v0, 1
@@ -29161,18 +28361,10 @@ define void @s_shuffle_v2i64_v8i64__7_14() {
 ; GFX900-NEXT:    v_writelane_b32 v0, s37, 1
 ; GFX900-NEXT:    v_writelane_b32 v0, s38, 2
 ; GFX900-NEXT:    v_writelane_b32 v0, s39, 3
-; GFX900-NEXT:    v_writelane_b32 v0, s40, 4
-; GFX900-NEXT:    v_writelane_b32 v0, s41, 5
-; GFX900-NEXT:    v_writelane_b32 v0, s42, 6
-; GFX900-NEXT:    v_writelane_b32 v0, s43, 7
-; GFX900-NEXT:    v_writelane_b32 v0, s44, 8
-; GFX900-NEXT:    v_writelane_b32 v0, s45, 9
-; GFX900-NEXT:    v_writelane_b32 v0, s46, 10
-; GFX900-NEXT:    v_writelane_b32 v0, s47, 11
-; GFX900-NEXT:    v_writelane_b32 v0, s48, 12
-; GFX900-NEXT:    v_writelane_b32 v0, s49, 13
-; GFX900-NEXT:    v_writelane_b32 v0, s50, 14
-; GFX900-NEXT:    v_writelane_b32 v0, s51, 15
+; GFX900-NEXT:    v_writelane_b32 v0, s48, 4
+; GFX900-NEXT:    v_writelane_b32 v0, s49, 5
+; GFX900-NEXT:    v_writelane_b32 v0, s50, 6
+; GFX900-NEXT:    v_writelane_b32 v0, s51, 7
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[4:19]
 ; GFX900-NEXT:    ;;#ASMEND
@@ -29186,18 +28378,10 @@ define void @s_shuffle_v2i64_v8i64__7_14() {
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:11]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    v_readlane_b32 s51, v0, 15
-; GFX900-NEXT:    v_readlane_b32 s50, v0, 14
-; GFX900-NEXT:    v_readlane_b32 s49, v0, 13
-; GFX900-NEXT:    v_readlane_b32 s48, v0, 12
-; GFX900-NEXT:    v_readlane_b32 s47, v0, 11
-; GFX900-NEXT:    v_readlane_b32 s46, v0, 10
-; GFX900-NEXT:    v_readlane_b32 s45, v0, 9
-; GFX900-NEXT:    v_readlane_b32 s44, v0, 8
-; GFX900-NEXT:    v_readlane_b32 s43, v0, 7
-; GFX900-NEXT:    v_readlane_b32 s42, v0, 6
-; GFX900-NEXT:    v_readlane_b32 s41, v0, 5
-; GFX900-NEXT:    v_readlane_b32 s40, v0, 4
+; GFX900-NEXT:    v_readlane_b32 s51, v0, 7
+; GFX900-NEXT:    v_readlane_b32 s50, v0, 6
+; GFX900-NEXT:    v_readlane_b32 s49, v0, 5
+; GFX900-NEXT:    v_readlane_b32 s48, v0, 4
 ; GFX900-NEXT:    v_readlane_b32 s39, v0, 3
 ; GFX900-NEXT:    v_readlane_b32 s38, v0, 2
 ; GFX900-NEXT:    v_readlane_b32 s37, v0, 1
@@ -29218,18 +28402,10 @@ define void @s_shuffle_v2i64_v8i64__7_14() {
 ; GFX90A-NEXT:    v_writelane_b32 v0, s37, 1
 ; GFX90A-NEXT:    v_writelane_b32 v0, s38, 2
 ; GFX90A-NEXT:    v_writelane_b32 v0, s39, 3
-; GFX90A-NEXT:    v_writelane_b32 v0, s40, 4
-; GFX90A-NEXT:    v_writelane_b32 v0, s41, 5
-; GFX90A-NEXT:    v_writelane_b32 v0, s42, 6
-; GFX90A-NEXT:    v_writelane_b32 v0, s43, 7
-; GFX90A-NEXT:    v_writelane_b32 v0, s44, 8
-; GFX90A-NEXT:    v_writelane_b32 v0, s45, 9
-; GFX90A-NEXT:    v_writelane_b32 v0, s46, 10
-; GFX90A-NEXT:    v_writelane_b32 v0, s47, 11
-; GFX90A-NEXT:    v_writelane_b32 v0, s48, 12
-; GFX90A-NEXT:    v_writelane_b32 v0, s49, 13
-; GFX90A-NEXT:    v_writelane_b32 v0, s50, 14
-; GFX90A-NEXT:    v_writelane_b32 v0, s51, 15
+; GFX90A-NEXT:    v_writelane_b32 v0, s48, 4
+; GFX90A-NEXT:    v_writelane_b32 v0, s49, 5
+; GFX90A-NEXT:    v_writelane_b32 v0, s50, 6
+; GFX90A-NEXT:    v_writelane_b32 v0, s51, 7
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[4:19]
 ; GFX90A-NEXT:    ;;#ASMEND
@@ -29243,18 +28419,10 @@ define void @s_shuffle_v2i64_v8i64__7_14() {
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:11]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    v_readlane_b32 s51, v0, 15
-; GFX90A-NEXT:    v_readlane_b32 s50, v0, 14
-; GFX90A-NEXT:    v_readlane_b32 s49, v0, 13
-; GFX90A-NEXT:    v_readlane_b32 s48, v0, 12
-; GFX90A-NEXT:    v_readlane_b32 s47, v0, 11
-; GFX90A-NEXT:    v_readlane_b32 s46, v0, 10
-; GFX90A-NEXT:    v_readlane_b32 s45, v0, 9
-; GFX90A-NEXT:    v_readlane_b32 s44, v0, 8
-; GFX90A-NEXT:    v_readlane_b32 s43, v0, 7
-; GFX90A-NEXT:    v_readlane_b32 s42, v0, 6
-; GFX90A-NEXT:    v_readlane_b32 s41, v0, 5
-; GFX90A-NEXT:    v_readlane_b32 s40, v0, 4
+; GFX90A-NEXT:    v_readlane_b32 s51, v0, 7
+; GFX90A-NEXT:    v_readlane_b32 s50, v0, 6
+; GFX90A-NEXT:    v_readlane_b32 s49, v0, 5
+; GFX90A-NEXT:    v_readlane_b32 s48, v0, 4
 ; GFX90A-NEXT:    v_readlane_b32 s39, v0, 3
 ; GFX90A-NEXT:    v_readlane_b32 s38, v0, 2
 ; GFX90A-NEXT:    v_readlane_b32 s37, v0, 1
@@ -30057,21 +29225,13 @@ define void @s_shuffle_v2i64_v8i64__6_15() {
 ; GFX900-NEXT:    v_writelane_b32 v0, s37, 1
 ; GFX900-NEXT:    v_writelane_b32 v0, s38, 2
 ; GFX900-NEXT:    v_writelane_b32 v0, s39, 3
-; GFX900-NEXT:    v_writelane_b32 v0, s40, 4
-; GFX900-NEXT:    v_writelane_b32 v0, s41, 5
-; GFX900-NEXT:    v_writelane_b32 v0, s42, 6
-; GFX900-NEXT:    v_writelane_b32 v0, s43, 7
-; GFX900-NEXT:    v_writelane_b32 v0, s44, 8
-; GFX900-NEXT:    v_writelane_b32 v0, s45, 9
-; GFX900-NEXT:    v_writelane_b32 v0, s46, 10
-; GFX900-NEXT:    v_writelane_b32 v0, s47, 11
-; GFX900-NEXT:    v_writelane_b32 v0, s48, 12
-; GFX900-NEXT:    v_writelane_b32 v0, s49, 13
-; GFX900-NEXT:    v_writelane_b32 v0, s50, 14
+; GFX900-NEXT:    v_writelane_b32 v0, s48, 4
+; GFX900-NEXT:    v_writelane_b32 v0, s49, 5
+; GFX900-NEXT:    v_writelane_b32 v0, s50, 6
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[4:19]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    v_writelane_b32 v0, s51, 15
+; GFX900-NEXT:    v_writelane_b32 v0, s51, 7
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[36:51]
 ; GFX900-NEXT:    ;;#ASMEND
@@ -30082,18 +29242,10 @@ define void @s_shuffle_v2i64_v8i64__6_15() {
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:11]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    v_readlane_b32 s51, v0, 15
-; GFX900-NEXT:    v_readlane_b32 s50, v0, 14
-; GFX900-NEXT:    v_readlane_b32 s49, v0, 13
-; GFX900-NEXT:    v_readlane_b32 s48, v0, 12
-; GFX900-NEXT:    v_readlane_b32 s47, v0, 11
-; GFX900-NEXT:    v_readlane_b32 s46, v0, 10
-; GFX900-NEXT:    v_readlane_b32 s45, v0, 9
-; GFX900-NEXT:    v_readlane_b32 s44, v0, 8
-; GFX900-NEXT:    v_readlane_b32 s43, v0, 7
-; GFX900-NEXT:    v_readlane_b32 s42, v0, 6
-; GFX900-NEXT:    v_readlane_b32 s41, v0, 5
-; GFX900-NEXT:    v_readlane_b32 s40, v0, 4
+; GFX900-NEXT:    v_readlane_b32 s51, v0, 7
+; GFX900-NEXT:    v_readlane_b32 s50, v0, 6
+; GFX900-NEXT:    v_readlane_b32 s49, v0, 5
+; GFX900-NEXT:    v_readlane_b32 s48, v0, 4
 ; GFX900-NEXT:    v_readlane_b32 s39, v0, 3
 ; GFX900-NEXT:    v_readlane_b32 s38, v0, 2
 ; GFX900-NEXT:    v_readlane_b32 s37, v0, 1
@@ -30114,21 +29266,13 @@ define void @s_shuffle_v2i64_v8i64__6_15() {
 ; GFX90A-NEXT:    v_writelane_b32 v0, s37, 1
 ; GFX90A-NEXT:    v_writelane_b32 v0, s38, 2
 ; GFX90A-NEXT:    v_writelane_b32 v0, s39, 3
-; GFX90A-NEXT:    v_writelane_b32 v0, s40, 4
-; GFX90A-NEXT:    v_writelane_b32 v0, s41, 5
-; GFX90A-NEXT:    v_writelane_b32 v0, s42, 6
-; GFX90A-NEXT:    v_writelane_b32 v0, s43, 7
-; GFX90A-NEXT:    v_writelane_b32 v0, s44, 8
-; GFX90A-NEXT:    v_writelane_b32 v0, s45, 9
-; GFX90A-NEXT:    v_writelane_b32 v0, s46, 10
-; GFX90A-NEXT:    v_writelane_b32 v0, s47, 11
-; GFX90A-NEXT:    v_writelane_b32 v0, s48, 12
-; GFX90A-NEXT:    v_writelane_b32 v0, s49, 13
-; GFX90A-NEXT:    v_writelane_b32 v0, s50, 14
+; GFX90A-NEXT:    v_writelane_b32 v0, s48, 4
+; GFX90A-NEXT:    v_writelane_b32 v0, s49, 5
+; GFX90A-NEXT:    v_writelane_b32 v0, s50, 6
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[4:19]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    v_writelane_b32 v0, s51, 15
+; GFX90A-NEXT:    v_writelane_b32 v0, s51, 7
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[36:51]
 ; GFX90A-NEXT:    ;;#ASMEND
@@ -30139,18 +29283,10 @@ define void @s_shuffle_v2i64_v8i64__6_15() {
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:11]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    v_readlane_b32 s51, v0, 15
-; GFX90A-NEXT:    v_readlane_b32 s50, v0, 14
-; GFX90A-NEXT:    v_readlane_b32 s49, v0, 13
-; GFX90A-NEXT:    v_readlane_b32 s48, v0, 12
-; GFX90A-NEXT:    v_readlane_b32 s47, v0, 11
-; GFX90A-NEXT:    v_readlane_b32 s46, v0, 10
-; GFX90A-NEXT:    v_readlane_b32 s45, v0, 9
-; GFX90A-NEXT:    v_readlane_b32 s44, v0, 8
-; GFX90A-NEXT:    v_readlane_b32 s43, v0, 7
-; GFX90A-NEXT:    v_readlane_b32 s42, v0, 6
-; GFX90A-NEXT:    v_readlane_b32 s41, v0, 5
-; GFX90A-NEXT:    v_readlane_b32 s40, v0, 4
+; GFX90A-NEXT:    v_readlane_b32 s51, v0, 7
+; GFX90A-NEXT:    v_readlane_b32 s50, v0, 6
+; GFX90A-NEXT:    v_readlane_b32 s49, v0, 5
+; GFX90A-NEXT:    v_readlane_b32 s48, v0, 4
 ; GFX90A-NEXT:    v_readlane_b32 s39, v0, 3
 ; GFX90A-NEXT:    v_readlane_b32 s38, v0, 2
 ; GFX90A-NEXT:    v_readlane_b32 s37, v0, 1
@@ -30207,18 +29343,10 @@ define void @s_shuffle_v2i64_v8i64__7_15() {
 ; GFX900-NEXT:    v_writelane_b32 v0, s37, 1
 ; GFX900-NEXT:    v_writelane_b32 v0, s38, 2
 ; GFX900-NEXT:    v_writelane_b32 v0, s39, 3
-; GFX900-NEXT:    v_writelane_b32 v0, s40, 4
-; GFX900-NEXT:    v_writelane_b32 v0, s41, 5
-; GFX900-NEXT:    v_writelane_b32 v0, s42, 6
-; GFX900-NEXT:    v_writelane_b32 v0, s43, 7
-; GFX900-NEXT:    v_writelane_b32 v0, s44, 8
-; GFX900-NEXT:    v_writelane_b32 v0, s45, 9
-; GFX900-NEXT:    v_writelane_b32 v0, s46, 10
-; GFX900-NEXT:    v_writelane_b32 v0, s47, 11
-; GFX900-NEXT:    v_writelane_b32 v0, s48, 12
-; GFX900-NEXT:    v_writelane_b32 v0, s49, 13
-; GFX900-NEXT:    v_writelane_b32 v0, s50, 14
-; GFX900-NEXT:    v_writelane_b32 v0, s51, 15
+; GFX900-NEXT:    v_writelane_b32 v0, s48, 4
+; GFX900-NEXT:    v_writelane_b32 v0, s49, 5
+; GFX900-NEXT:    v_writelane_b32 v0, s50, 6
+; GFX900-NEXT:    v_writelane_b32 v0, s51, 7
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[36:51]
 ; GFX900-NEXT:    ;;#ASMEND
@@ -30232,18 +29360,10 @@ define void @s_shuffle_v2i64_v8i64__7_15() {
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:11]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    v_readlane_b32 s51, v0, 15
-; GFX900-NEXT:    v_readlane_b32 s50, v0, 14
-; GFX900-NEXT:    v_readlane_b32 s49, v0, 13
-; GFX900-NEXT:    v_readlane_b32 s48, v0, 12
-; GFX900-NEXT:    v_readlane_b32 s47, v0, 11
-; GFX900-NEXT:    v_readlane_b32 s46, v0, 10
-; GFX900-NEXT:    v_readlane_b32 s45, v0, 9
-; GFX900-NEXT:    v_readlane_b32 s44, v0, 8
-; GFX900-NEXT:    v_readlane_b32 s43, v0, 7
-; GFX900-NEXT:    v_readlane_b32 s42, v0, 6
-; GFX900-NEXT:    v_readlane_b32 s41, v0, 5
-; GFX900-NEXT:    v_readlane_b32 s40, v0, 4
+; GFX900-NEXT:    v_readlane_b32 s51, v0, 7
+; GFX900-NEXT:    v_readlane_b32 s50, v0, 6
+; GFX900-NEXT:    v_readlane_b32 s49, v0, 5
+; GFX900-NEXT:    v_readlane_b32 s48, v0, 4
 ; GFX900-NEXT:    v_readlane_b32 s39, v0, 3
 ; GFX900-NEXT:    v_readlane_b32 s38, v0, 2
 ; GFX900-NEXT:    v_readlane_b32 s37, v0, 1
@@ -30264,18 +29384,10 @@ define void @s_shuffle_v2i64_v8i64__7_15() {
 ; GFX90A-NEXT:    v_writelane_b32 v0, s37, 1
 ; GFX90A-NEXT:    v_writelane_b32 v0, s38, 2
 ; GFX90A-NEXT:    v_writelane_b32 v0, s39, 3
-; GFX90A-NEXT:    v_writelane_b32 v0, s40, 4
-; GFX90A-NEXT:    v_writelane_b32 v0, s41, 5
-; GFX90A-NEXT:    v_writelane_b32 v0, s42, 6
-; GFX90A-NEXT:    v_writelane_b32 v0, s43, 7
-; GFX90A-NEXT:    v_writelane_b32 v0, s44, 8
-; GFX90A-NEXT:    v_writelane_b32 v0, s45, 9
-; GFX90A-NEXT:    v_writelane_b32 v0, s46, 10
-; GFX90A-NEXT:    v_writelane_b32 v0, s47, 11
-; GFX90A-NEXT:    v_writelane_b32 v0, s48, 12
-; GFX90A-NEXT:    v_writelane_b32 v0, s49, 13
-; GFX90A-NEXT:    v_writelane_b32 v0, s50, 14
-; GFX90A-NEXT:    v_writelane_b32 v0, s51, 15
+; GFX90A-NEXT:    v_writelane_b32 v0, s48, 4
+; GFX90A-NEXT:    v_writelane_b32 v0, s49, 5
+; GFX90A-NEXT:    v_writelane_b32 v0, s50, 6
+; GFX90A-NEXT:    v_writelane_b32 v0, s51, 7
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[36:51]
 ; GFX90A-NEXT:    ;;#ASMEND
@@ -30289,18 +29401,10 @@ define void @s_shuffle_v2i64_v8i64__7_15() {
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:11]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    v_readlane_b32 s51, v0, 15
-; GFX90A-NEXT:    v_readlane_b32 s50, v0, 14
-; GFX90A-NEXT:    v_readlane_b32 s49, v0, 13
-; GFX90A-NEXT:    v_readlane_b32 s48, v0, 12
-; GFX90A-NEXT:    v_readlane_b32 s47, v0, 11
-; GFX90A-NEXT:    v_readlane_b32 s46, v0, 10
-; GFX90A-NEXT:    v_readlane_b32 s45, v0, 9
-; GFX90A-NEXT:    v_readlane_b32 s44, v0, 8
-; GFX90A-NEXT:    v_readlane_b32 s43, v0, 7
-; GFX90A-NEXT:    v_readlane_b32 s42, v0, 6
-; GFX90A-NEXT:    v_readlane_b32 s41, v0, 5
-; GFX90A-NEXT:    v_readlane_b32 s40, v0, 4
+; GFX90A-NEXT:    v_readlane_b32 s51, v0, 7
+; GFX90A-NEXT:    v_readlane_b32 s50, v0, 6
+; GFX90A-NEXT:    v_readlane_b32 s49, v0, 5
+; GFX90A-NEXT:    v_readlane_b32 s48, v0, 4
 ; GFX90A-NEXT:    v_readlane_b32 s39, v0, 3
 ; GFX90A-NEXT:    v_readlane_b32 s38, v0, 2
 ; GFX90A-NEXT:    v_readlane_b32 s37, v0, 1

diff  --git a/llvm/test/CodeGen/AMDGPU/sibling-call.ll b/llvm/test/CodeGen/AMDGPU/sibling-call.ll
index b57adfe7d9306..0221bb0cf4f35 100644
--- a/llvm/test/CodeGen/AMDGPU/sibling-call.ll
+++ b/llvm/test/CodeGen/AMDGPU/sibling-call.ll
@@ -612,40 +612,40 @@ define hidden fastcc i32 @indirect_divergent_sibling_call_i32_fastcc_i32_i32(ptr
 ; FIJI-NEXT:    v_writelane_b32 v40, s37, 5
 ; FIJI-NEXT:    v_writelane_b32 v40, s38, 6
 ; FIJI-NEXT:    v_writelane_b32 v40, s39, 7
-; FIJI-NEXT:    v_writelane_b32 v40, s40, 8
-; FIJI-NEXT:    v_writelane_b32 v40, s41, 9
-; FIJI-NEXT:    v_writelane_b32 v40, s42, 10
-; FIJI-NEXT:    v_writelane_b32 v40, s43, 11
-; FIJI-NEXT:    v_writelane_b32 v40, s44, 12
-; FIJI-NEXT:    v_writelane_b32 v40, s45, 13
-; FIJI-NEXT:    v_writelane_b32 v40, s46, 14
-; FIJI-NEXT:    v_writelane_b32 v40, s47, 15
-; FIJI-NEXT:    v_writelane_b32 v40, s48, 16
-; FIJI-NEXT:    s_mov_b32 s42, s15
-; FIJI-NEXT:    s_mov_b32 s43, s14
-; FIJI-NEXT:    s_mov_b32 s44, s13
-; FIJI-NEXT:    s_mov_b32 s45, s12
+; FIJI-NEXT:    v_writelane_b32 v40, s48, 8
+; FIJI-NEXT:    v_writelane_b32 v40, s49, 9
+; FIJI-NEXT:    v_writelane_b32 v40, s50, 10
+; FIJI-NEXT:    v_writelane_b32 v40, s51, 11
+; FIJI-NEXT:    v_writelane_b32 v40, s52, 12
+; FIJI-NEXT:    v_writelane_b32 v40, s53, 13
+; FIJI-NEXT:    v_writelane_b32 v40, s54, 14
+; FIJI-NEXT:    v_writelane_b32 v40, s55, 15
+; FIJI-NEXT:    v_writelane_b32 v40, s64, 16
+; FIJI-NEXT:    s_mov_b32 s50, s15
+; FIJI-NEXT:    s_mov_b32 s51, s14
+; FIJI-NEXT:    s_mov_b32 s52, s13
+; FIJI-NEXT:    s_mov_b32 s53, s12
 ; FIJI-NEXT:    s_mov_b64 s[34:35], s[10:11]
 ; FIJI-NEXT:    s_mov_b64 s[36:37], s[8:9]
 ; FIJI-NEXT:    s_mov_b64 s[38:39], s[6:7]
-; FIJI-NEXT:    s_mov_b64 s[40:41], s[4:5]
+; FIJI-NEXT:    s_mov_b64 s[48:49], s[4:5]
 ; FIJI-NEXT:    v_add_u32_e32 v3, vcc, v3, v4
-; FIJI-NEXT:    s_mov_b64 s[46:47], exec
+; FIJI-NEXT:    s_mov_b64 s[54:55], exec
 ; FIJI-NEXT:    s_addk_i32 s32, 0x400
-; FIJI-NEXT:    v_writelane_b32 v40, s49, 17
+; FIJI-NEXT:    v_writelane_b32 v40, s65, 17
 ; FIJI-NEXT:  .LBB18_1: ; =>This Inner Loop Header: Depth=1
 ; FIJI-NEXT:    v_readfirstlane_b32 s16, v0
 ; FIJI-NEXT:    v_readfirstlane_b32 s17, v1
 ; FIJI-NEXT:    v_cmp_eq_u64_e32 vcc, s[16:17], v[0:1]
-; FIJI-NEXT:    s_and_saveexec_b64 s[48:49], vcc
-; FIJI-NEXT:    s_mov_b64 s[4:5], s[40:41]
+; FIJI-NEXT:    s_and_saveexec_b64 s[64:65], vcc
+; FIJI-NEXT:    s_mov_b64 s[4:5], s[48:49]
 ; FIJI-NEXT:    s_mov_b64 s[6:7], s[38:39]
 ; FIJI-NEXT:    s_mov_b64 s[8:9], s[36:37]
 ; FIJI-NEXT:    s_mov_b64 s[10:11], s[34:35]
-; FIJI-NEXT:    s_mov_b32 s12, s45
-; FIJI-NEXT:    s_mov_b32 s13, s44
-; FIJI-NEXT:    s_mov_b32 s14, s43
-; FIJI-NEXT:    s_mov_b32 s15, s42
+; FIJI-NEXT:    s_mov_b32 s12, s53
+; FIJI-NEXT:    s_mov_b32 s13, s52
+; FIJI-NEXT:    s_mov_b32 s14, s51
+; FIJI-NEXT:    s_mov_b32 s15, s50
 ; FIJI-NEXT:    v_mov_b32_e32 v0, v2
 ; FIJI-NEXT:    v_mov_b32_e32 v1, v3
 ; FIJI-NEXT:    s_swappc_b64 s[30:31], s[16:17]
@@ -654,21 +654,21 @@ define hidden fastcc i32 @indirect_divergent_sibling_call_i32_fastcc_i32_i32(ptr
 ; FIJI-NEXT:    ; implicit-def: $vgpr31
 ; FIJI-NEXT:    ; implicit-def: $vgpr2
 ; FIJI-NEXT:    ; implicit-def: $vgpr3
-; FIJI-NEXT:    s_xor_b64 exec, exec, s[48:49]
+; FIJI-NEXT:    s_xor_b64 exec, exec, s[64:65]
 ; FIJI-NEXT:    s_cbranch_execnz .LBB18_1
 ; FIJI-NEXT:  ; %bb.2:
-; FIJI-NEXT:    s_mov_b64 exec, s[46:47]
+; FIJI-NEXT:    s_mov_b64 exec, s[54:55]
 ; FIJI-NEXT:    v_mov_b32_e32 v0, v4
-; FIJI-NEXT:    v_readlane_b32 s49, v40, 17
-; FIJI-NEXT:    v_readlane_b32 s48, v40, 16
-; FIJI-NEXT:    v_readlane_b32 s47, v40, 15
-; FIJI-NEXT:    v_readlane_b32 s46, v40, 14
-; FIJI-NEXT:    v_readlane_b32 s45, v40, 13
-; FIJI-NEXT:    v_readlane_b32 s44, v40, 12
-; FIJI-NEXT:    v_readlane_b32 s43, v40, 11
-; FIJI-NEXT:    v_readlane_b32 s42, v40, 10
-; FIJI-NEXT:    v_readlane_b32 s41, v40, 9
-; FIJI-NEXT:    v_readlane_b32 s40, v40, 8
+; FIJI-NEXT:    v_readlane_b32 s65, v40, 17
+; FIJI-NEXT:    v_readlane_b32 s64, v40, 16
+; FIJI-NEXT:    v_readlane_b32 s55, v40, 15
+; FIJI-NEXT:    v_readlane_b32 s54, v40, 14
+; FIJI-NEXT:    v_readlane_b32 s53, v40, 13
+; FIJI-NEXT:    v_readlane_b32 s52, v40, 12
+; FIJI-NEXT:    v_readlane_b32 s51, v40, 11
+; FIJI-NEXT:    v_readlane_b32 s50, v40, 10
+; FIJI-NEXT:    v_readlane_b32 s49, v40, 9
+; FIJI-NEXT:    v_readlane_b32 s48, v40, 8
 ; FIJI-NEXT:    v_readlane_b32 s39, v40, 7
 ; FIJI-NEXT:    v_readlane_b32 s38, v40, 6
 ; FIJI-NEXT:    v_readlane_b32 s37, v40, 5
@@ -703,40 +703,40 @@ define hidden fastcc i32 @indirect_divergent_sibling_call_i32_fastcc_i32_i32(ptr
 ; HAWAII-NEXT:    v_writelane_b32 v40, s37, 5
 ; HAWAII-NEXT:    v_writelane_b32 v40, s38, 6
 ; HAWAII-NEXT:    v_writelane_b32 v40, s39, 7
-; HAWAII-NEXT:    v_writelane_b32 v40, s40, 8
-; HAWAII-NEXT:    v_writelane_b32 v40, s41, 9
-; HAWAII-NEXT:    v_writelane_b32 v40, s42, 10
-; HAWAII-NEXT:    v_writelane_b32 v40, s43, 11
-; HAWAII-NEXT:    v_writelane_b32 v40, s44, 12
-; HAWAII-NEXT:    v_writelane_b32 v40, s45, 13
-; HAWAII-NEXT:    v_writelane_b32 v40, s46, 14
-; HAWAII-NEXT:    v_writelane_b32 v40, s47, 15
-; HAWAII-NEXT:    v_writelane_b32 v40, s48, 16
-; HAWAII-NEXT:    s_mov_b32 s42, s15
-; HAWAII-NEXT:    s_mov_b32 s43, s14
-; HAWAII-NEXT:    s_mov_b32 s44, s13
-; HAWAII-NEXT:    s_mov_b32 s45, s12
+; HAWAII-NEXT:    v_writelane_b32 v40, s48, 8
+; HAWAII-NEXT:    v_writelane_b32 v40, s49, 9
+; HAWAII-NEXT:    v_writelane_b32 v40, s50, 10
+; HAWAII-NEXT:    v_writelane_b32 v40, s51, 11
+; HAWAII-NEXT:    v_writelane_b32 v40, s52, 12
+; HAWAII-NEXT:    v_writelane_b32 v40, s53, 13
+; HAWAII-NEXT:    v_writelane_b32 v40, s54, 14
+; HAWAII-NEXT:    v_writelane_b32 v40, s55, 15
+; HAWAII-NEXT:    v_writelane_b32 v40, s64, 16
+; HAWAII-NEXT:    s_mov_b32 s50, s15
+; HAWAII-NEXT:    s_mov_b32 s51, s14
+; HAWAII-NEXT:    s_mov_b32 s52, s13
+; HAWAII-NEXT:    s_mov_b32 s53, s12
 ; HAWAII-NEXT:    s_mov_b64 s[34:35], s[10:11]
 ; HAWAII-NEXT:    s_mov_b64 s[36:37], s[8:9]
 ; HAWAII-NEXT:    s_mov_b64 s[38:39], s[6:7]
-; HAWAII-NEXT:    s_mov_b64 s[40:41], s[4:5]
+; HAWAII-NEXT:    s_mov_b64 s[48:49], s[4:5]
 ; HAWAII-NEXT:    v_add_i32_e32 v3, vcc, v3, v4
-; HAWAII-NEXT:    s_mov_b64 s[46:47], exec
+; HAWAII-NEXT:    s_mov_b64 s[54:55], exec
 ; HAWAII-NEXT:    s_addk_i32 s32, 0x400
-; HAWAII-NEXT:    v_writelane_b32 v40, s49, 17
+; HAWAII-NEXT:    v_writelane_b32 v40, s65, 17
 ; HAWAII-NEXT:  .LBB18_1: ; =>This Inner Loop Header: Depth=1
 ; HAWAII-NEXT:    v_readfirstlane_b32 s16, v0
 ; HAWAII-NEXT:    v_readfirstlane_b32 s17, v1
 ; HAWAII-NEXT:    v_cmp_eq_u64_e32 vcc, s[16:17], v[0:1]
-; HAWAII-NEXT:    s_and_saveexec_b64 s[48:49], vcc
-; HAWAII-NEXT:    s_mov_b64 s[4:5], s[40:41]
+; HAWAII-NEXT:    s_and_saveexec_b64 s[64:65], vcc
+; HAWAII-NEXT:    s_mov_b64 s[4:5], s[48:49]
 ; HAWAII-NEXT:    s_mov_b64 s[6:7], s[38:39]
 ; HAWAII-NEXT:    s_mov_b64 s[8:9], s[36:37]
 ; HAWAII-NEXT:    s_mov_b64 s[10:11], s[34:35]
-; HAWAII-NEXT:    s_mov_b32 s12, s45
-; HAWAII-NEXT:    s_mov_b32 s13, s44
-; HAWAII-NEXT:    s_mov_b32 s14, s43
-; HAWAII-NEXT:    s_mov_b32 s15, s42
+; HAWAII-NEXT:    s_mov_b32 s12, s53
+; HAWAII-NEXT:    s_mov_b32 s13, s52
+; HAWAII-NEXT:    s_mov_b32 s14, s51
+; HAWAII-NEXT:    s_mov_b32 s15, s50
 ; HAWAII-NEXT:    v_mov_b32_e32 v0, v2
 ; HAWAII-NEXT:    v_mov_b32_e32 v1, v3
 ; HAWAII-NEXT:    s_swappc_b64 s[30:31], s[16:17]
@@ -745,21 +745,21 @@ define hidden fastcc i32 @indirect_divergent_sibling_call_i32_fastcc_i32_i32(ptr
 ; HAWAII-NEXT:    ; implicit-def: $vgpr31
 ; HAWAII-NEXT:    ; implicit-def: $vgpr2
 ; HAWAII-NEXT:    ; implicit-def: $vgpr3
-; HAWAII-NEXT:    s_xor_b64 exec, exec, s[48:49]
+; HAWAII-NEXT:    s_xor_b64 exec, exec, s[64:65]
 ; HAWAII-NEXT:    s_cbranch_execnz .LBB18_1
 ; HAWAII-NEXT:  ; %bb.2:
-; HAWAII-NEXT:    s_mov_b64 exec, s[46:47]
+; HAWAII-NEXT:    s_mov_b64 exec, s[54:55]
 ; HAWAII-NEXT:    v_mov_b32_e32 v0, v4
-; HAWAII-NEXT:    v_readlane_b32 s49, v40, 17
-; HAWAII-NEXT:    v_readlane_b32 s48, v40, 16
-; HAWAII-NEXT:    v_readlane_b32 s47, v40, 15
-; HAWAII-NEXT:    v_readlane_b32 s46, v40, 14
-; HAWAII-NEXT:    v_readlane_b32 s45, v40, 13
-; HAWAII-NEXT:    v_readlane_b32 s44, v40, 12
-; HAWAII-NEXT:    v_readlane_b32 s43, v40, 11
-; HAWAII-NEXT:    v_readlane_b32 s42, v40, 10
-; HAWAII-NEXT:    v_readlane_b32 s41, v40, 9
-; HAWAII-NEXT:    v_readlane_b32 s40, v40, 8
+; HAWAII-NEXT:    v_readlane_b32 s65, v40, 17
+; HAWAII-NEXT:    v_readlane_b32 s64, v40, 16
+; HAWAII-NEXT:    v_readlane_b32 s55, v40, 15
+; HAWAII-NEXT:    v_readlane_b32 s54, v40, 14
+; HAWAII-NEXT:    v_readlane_b32 s53, v40, 13
+; HAWAII-NEXT:    v_readlane_b32 s52, v40, 12
+; HAWAII-NEXT:    v_readlane_b32 s51, v40, 11
+; HAWAII-NEXT:    v_readlane_b32 s50, v40, 10
+; HAWAII-NEXT:    v_readlane_b32 s49, v40, 9
+; HAWAII-NEXT:    v_readlane_b32 s48, v40, 8
 ; HAWAII-NEXT:    v_readlane_b32 s39, v40, 7
 ; HAWAII-NEXT:    v_readlane_b32 s38, v40, 6
 ; HAWAII-NEXT:    v_readlane_b32 s37, v40, 5
@@ -794,40 +794,40 @@ define hidden fastcc i32 @indirect_divergent_sibling_call_i32_fastcc_i32_i32(ptr
 ; GFX9-NEXT:    v_writelane_b32 v40, s37, 5
 ; GFX9-NEXT:    v_writelane_b32 v40, s38, 6
 ; GFX9-NEXT:    v_writelane_b32 v40, s39, 7
-; GFX9-NEXT:    v_writelane_b32 v40, s40, 8
-; GFX9-NEXT:    v_writelane_b32 v40, s41, 9
-; GFX9-NEXT:    v_writelane_b32 v40, s42, 10
-; GFX9-NEXT:    v_writelane_b32 v40, s43, 11
-; GFX9-NEXT:    v_writelane_b32 v40, s44, 12
-; GFX9-NEXT:    v_writelane_b32 v40, s45, 13
-; GFX9-NEXT:    v_writelane_b32 v40, s46, 14
-; GFX9-NEXT:    v_writelane_b32 v40, s47, 15
-; GFX9-NEXT:    v_writelane_b32 v40, s48, 16
-; GFX9-NEXT:    s_mov_b32 s42, s15
-; GFX9-NEXT:    s_mov_b32 s43, s14
-; GFX9-NEXT:    s_mov_b32 s44, s13
-; GFX9-NEXT:    s_mov_b32 s45, s12
+; GFX9-NEXT:    v_writelane_b32 v40, s48, 8
+; GFX9-NEXT:    v_writelane_b32 v40, s49, 9
+; GFX9-NEXT:    v_writelane_b32 v40, s50, 10
+; GFX9-NEXT:    v_writelane_b32 v40, s51, 11
+; GFX9-NEXT:    v_writelane_b32 v40, s52, 12
+; GFX9-NEXT:    v_writelane_b32 v40, s53, 13
+; GFX9-NEXT:    v_writelane_b32 v40, s54, 14
+; GFX9-NEXT:    v_writelane_b32 v40, s55, 15
+; GFX9-NEXT:    v_writelane_b32 v40, s64, 16
+; GFX9-NEXT:    s_mov_b32 s50, s15
+; GFX9-NEXT:    s_mov_b32 s51, s14
+; GFX9-NEXT:    s_mov_b32 s52, s13
+; GFX9-NEXT:    s_mov_b32 s53, s12
 ; GFX9-NEXT:    s_mov_b64 s[34:35], s[10:11]
 ; GFX9-NEXT:    s_mov_b64 s[36:37], s[8:9]
 ; GFX9-NEXT:    s_mov_b64 s[38:39], s[6:7]
-; GFX9-NEXT:    s_mov_b64 s[40:41], s[4:5]
+; GFX9-NEXT:    s_mov_b64 s[48:49], s[4:5]
 ; GFX9-NEXT:    v_add_u32_e32 v3, v3, v4
-; GFX9-NEXT:    s_mov_b64 s[46:47], exec
+; GFX9-NEXT:    s_mov_b64 s[54:55], exec
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
-; GFX9-NEXT:    v_writelane_b32 v40, s49, 17
+; GFX9-NEXT:    v_writelane_b32 v40, s65, 17
 ; GFX9-NEXT:  .LBB18_1: ; =>This Inner Loop Header: Depth=1
 ; GFX9-NEXT:    v_readfirstlane_b32 s16, v0
 ; GFX9-NEXT:    v_readfirstlane_b32 s17, v1
 ; GFX9-NEXT:    v_cmp_eq_u64_e32 vcc, s[16:17], v[0:1]
-; GFX9-NEXT:    s_and_saveexec_b64 s[48:49], vcc
-; GFX9-NEXT:    s_mov_b64 s[4:5], s[40:41]
+; GFX9-NEXT:    s_and_saveexec_b64 s[64:65], vcc
+; GFX9-NEXT:    s_mov_b64 s[4:5], s[48:49]
 ; GFX9-NEXT:    s_mov_b64 s[6:7], s[38:39]
 ; GFX9-NEXT:    s_mov_b64 s[8:9], s[36:37]
 ; GFX9-NEXT:    s_mov_b64 s[10:11], s[34:35]
-; GFX9-NEXT:    s_mov_b32 s12, s45
-; GFX9-NEXT:    s_mov_b32 s13, s44
-; GFX9-NEXT:    s_mov_b32 s14, s43
-; GFX9-NEXT:    s_mov_b32 s15, s42
+; GFX9-NEXT:    s_mov_b32 s12, s53
+; GFX9-NEXT:    s_mov_b32 s13, s52
+; GFX9-NEXT:    s_mov_b32 s14, s51
+; GFX9-NEXT:    s_mov_b32 s15, s50
 ; GFX9-NEXT:    v_mov_b32_e32 v0, v2
 ; GFX9-NEXT:    v_mov_b32_e32 v1, v3
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[16:17]
@@ -836,21 +836,21 @@ define hidden fastcc i32 @indirect_divergent_sibling_call_i32_fastcc_i32_i32(ptr
 ; GFX9-NEXT:    ; implicit-def: $vgpr31
 ; GFX9-NEXT:    ; implicit-def: $vgpr2
 ; GFX9-NEXT:    ; implicit-def: $vgpr3
-; GFX9-NEXT:    s_xor_b64 exec, exec, s[48:49]
+; GFX9-NEXT:    s_xor_b64 exec, exec, s[64:65]
 ; GFX9-NEXT:    s_cbranch_execnz .LBB18_1
 ; GFX9-NEXT:  ; %bb.2:
-; GFX9-NEXT:    s_mov_b64 exec, s[46:47]
+; GFX9-NEXT:    s_mov_b64 exec, s[54:55]
 ; GFX9-NEXT:    v_mov_b32_e32 v0, v4
-; GFX9-NEXT:    v_readlane_b32 s49, v40, 17
-; GFX9-NEXT:    v_readlane_b32 s48, v40, 16
-; GFX9-NEXT:    v_readlane_b32 s47, v40, 15
-; GFX9-NEXT:    v_readlane_b32 s46, v40, 14
-; GFX9-NEXT:    v_readlane_b32 s45, v40, 13
-; GFX9-NEXT:    v_readlane_b32 s44, v40, 12
-; GFX9-NEXT:    v_readlane_b32 s43, v40, 11
-; GFX9-NEXT:    v_readlane_b32 s42, v40, 10
-; GFX9-NEXT:    v_readlane_b32 s41, v40, 9
-; GFX9-NEXT:    v_readlane_b32 s40, v40, 8
+; GFX9-NEXT:    v_readlane_b32 s65, v40, 17
+; GFX9-NEXT:    v_readlane_b32 s64, v40, 16
+; GFX9-NEXT:    v_readlane_b32 s55, v40, 15
+; GFX9-NEXT:    v_readlane_b32 s54, v40, 14
+; GFX9-NEXT:    v_readlane_b32 s53, v40, 13
+; GFX9-NEXT:    v_readlane_b32 s52, v40, 12
+; GFX9-NEXT:    v_readlane_b32 s51, v40, 11
+; GFX9-NEXT:    v_readlane_b32 s50, v40, 10
+; GFX9-NEXT:    v_readlane_b32 s49, v40, 9
+; GFX9-NEXT:    v_readlane_b32 s48, v40, 8
 ; GFX9-NEXT:    v_readlane_b32 s39, v40, 7
 ; GFX9-NEXT:    v_readlane_b32 s38, v40, 6
 ; GFX9-NEXT:    v_readlane_b32 s37, v40, 5

diff  --git a/llvm/test/CodeGen/AMDGPU/snippet-copy-bundle-regression.mir b/llvm/test/CodeGen/AMDGPU/snippet-copy-bundle-regression.mir
index 080bd052a7391..cf23a9d1e8a57 100644
--- a/llvm/test/CodeGen/AMDGPU/snippet-copy-bundle-regression.mir
+++ b/llvm/test/CodeGen/AMDGPU/snippet-copy-bundle-regression.mir
@@ -34,57 +34,56 @@ body:             |
   ; CHECK-NEXT:   successors: %bb.2(0x40000000), %bb.1(0x40000000)
   ; CHECK-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   renamable $sgpr34_sgpr35 = IMPLICIT_DEF
   ; CHECK-NEXT:   dead [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-  ; CHECK-NEXT:   renamable $sgpr41 = IMPLICIT_DEF
+  ; CHECK-NEXT:   renamable $sgpr35 = IMPLICIT_DEF
   ; CHECK-NEXT:   renamable $sgpr38_sgpr39 = COPY undef $sgpr8_sgpr9
   ; CHECK-NEXT:   renamable $sgpr36_sgpr37 = IMPLICIT_DEF
-  ; CHECK-NEXT:   renamable $sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51 = S_LOAD_DWORDX8_IMM renamable $sgpr38_sgpr39, 0, 0 :: (dereferenceable invariant load (s256), align 16, addrspace 4)
+  ; CHECK-NEXT:   renamable $sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55 = S_LOAD_DWORDX8_IMM renamable $sgpr38_sgpr39, 0, 0 :: (dereferenceable invariant load (s256), align 16, addrspace 4)
   ; CHECK-NEXT:   dead renamable $sgpr4 = S_LOAD_DWORD_IMM renamable $sgpr38_sgpr39, 48, 0 :: (dereferenceable invariant load (s32), align 16, addrspace 4)
-  ; CHECK-NEXT:   dead renamable $sgpr4_sgpr5 = S_LOAD_DWORDX2_IMM renamable $sgpr44_sgpr45, 0, 0 :: (invariant load (s64), align 16, addrspace 4)
+  ; CHECK-NEXT:   dead renamable $sgpr4_sgpr5 = S_LOAD_DWORDX2_IMM renamable $sgpr48_sgpr49, 0, 0 :: (invariant load (s64), align 16, addrspace 4)
   ; CHECK-NEXT:   ADJCALLSTACKUP 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32
-  ; CHECK-NEXT:   $vgpr1 = COPY renamable $sgpr51
+  ; CHECK-NEXT:   $vgpr1 = COPY renamable $sgpr55
   ; CHECK-NEXT:   dead $sgpr30_sgpr31 = SI_CALL undef renamable $sgpr4_sgpr5, 0, csr_amdgpu, implicit undef $sgpr15, implicit $vgpr31, implicit $sgpr0_sgpr1_sgpr2_sgpr3
   ; CHECK-NEXT:   ADJCALLSTACKDOWN 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32
-  ; CHECK-NEXT:   $vcc = COPY renamable $sgpr40_sgpr41
+  ; CHECK-NEXT:   $vcc = COPY renamable $sgpr34_sgpr35
   ; CHECK-NEXT:   S_CBRANCH_VCCZ %bb.2, implicit undef $vcc
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.1:
   ; CHECK-NEXT:   successors: %bb.3(0x80000000)
-  ; CHECK-NEXT:   liveins: $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51:0x000000000000FC00
+  ; CHECK-NEXT:   liveins: $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55:0x000000000000FC00
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11 = S_LOAD_DWORDX8_IMM renamable $sgpr38_sgpr39, 56, 0 :: (dereferenceable invariant load (s256), align 8, addrspace 4)
   ; CHECK-NEXT:   S_BRANCH %bb.3
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.2:
   ; CHECK-NEXT:   successors: %bb.3(0x80000000)
-  ; CHECK-NEXT:   liveins: $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51:0x000000000000FC00
+  ; CHECK-NEXT:   liveins: $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55:0x000000000000FC00
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11 = S_LOAD_DWORDX8_IMM renamable $sgpr38_sgpr39, 56, 0 :: (dereferenceable invariant load (s256), align 8, addrspace 4)
   ; CHECK-NEXT:   S_CMP_LG_U64 renamable $sgpr4_sgpr5, 0, implicit-def $scc
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.3:
   ; CHECK-NEXT:   successors: %bb.5(0x40000000), %bb.4(0x40000000)
-  ; CHECK-NEXT:   liveins: $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11:0x00000000000003F0, $sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51:0x000000000000FC00
+  ; CHECK-NEXT:   liveins: $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11:0x00000000000003F0, $sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55:0x000000000000FC00
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   S_CBRANCH_VCCZ %bb.5, implicit undef $vcc
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.4:
   ; CHECK-NEXT:   successors: %bb.5(0x80000000)
-  ; CHECK-NEXT:   liveins: $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11:0x00000000000003F0, $sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51:0x000000000000FC00
+  ; CHECK-NEXT:   liveins: $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11:0x00000000000003F0, $sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55:0x000000000000FC00
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   S_CMP_EQ_U32 renamable $sgpr8, 0, implicit-def $scc
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.5:
-  ; CHECK-NEXT:   liveins: $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11:0x00000000000000F0, $sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51:0x000000000000FC00
+  ; CHECK-NEXT:   liveins: $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11:0x00000000000000F0, $sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55:0x000000000000FC00
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   dead renamable $sgpr4_sgpr5 = S_LOAD_DWORDX2_IMM killed renamable $sgpr38_sgpr39, 40, 0 :: (dereferenceable invariant load (s64), addrspace 4)
   ; CHECK-NEXT:   GLOBAL_STORE_DWORD_SADDR undef [[DEF]], undef [[DEF]], killed renamable $sgpr6_sgpr7, 0, 0, implicit $exec :: (store (s32), addrspace 1)
-  ; CHECK-NEXT:   GLOBAL_STORE_DWORD_SADDR undef [[DEF]], undef [[DEF]], renamable $sgpr50_sgpr51, 0, 0, implicit $exec :: (store (s32), addrspace 1)
-  ; CHECK-NEXT:   dead [[COPY:%[0-9]+]]:vgpr_32 = COPY killed renamable $sgpr49
+  ; CHECK-NEXT:   GLOBAL_STORE_DWORD_SADDR undef [[DEF]], undef [[DEF]], renamable $sgpr54_sgpr55, 0, 0, implicit $exec :: (store (s32), addrspace 1)
+  ; CHECK-NEXT:   dead [[COPY:%[0-9]+]]:vgpr_32 = COPY killed renamable $sgpr53
   ; CHECK-NEXT:   ADJCALLSTACKUP 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32
   ; CHECK-NEXT:   $sgpr6_sgpr7 = COPY killed renamable $sgpr36_sgpr37
-  ; CHECK-NEXT:   $sgpr10_sgpr11 = COPY killed renamable $sgpr34_sgpr35
+  ; CHECK-NEXT:   renamable $sgpr10_sgpr11 = IMPLICIT_DEF
   ; CHECK-NEXT:   ADJCALLSTACKDOWN 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32
   ; CHECK-NEXT:   S_ENDPGM 0
   bb.0:

diff  --git a/llvm/test/CodeGen/AMDGPU/spill-sgpr-to-virtual-vgpr.mir b/llvm/test/CodeGen/AMDGPU/spill-sgpr-to-virtual-vgpr.mir
index dff2bd7f7aef9..7f4f9489ea4b7 100644
--- a/llvm/test/CodeGen/AMDGPU/spill-sgpr-to-virtual-vgpr.mir
+++ b/llvm/test/CodeGen/AMDGPU/spill-sgpr-to-virtual-vgpr.mir
@@ -53,7 +53,7 @@ body:             |
   bb.0:
     liveins: $sgpr30_sgpr31, $sgpr10, $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71, $sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79, $sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87, $sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95
     ; GCN-LABEL: name: sgpr_spill_lane_crossover
-    ; GCN: liveins: $sgpr10, $sgpr64, $sgpr65, $sgpr66, $sgpr67, $sgpr68, $sgpr69, $sgpr70, $sgpr71, $sgpr72, $sgpr73, $sgpr74, $sgpr75, $sgpr76, $sgpr77, $sgpr78, $sgpr79, $sgpr80, $sgpr81, $sgpr82, $sgpr83, $sgpr84, $sgpr85, $sgpr86, $sgpr87, $sgpr88, $sgpr89, $sgpr90, $sgpr91, $sgpr92, $sgpr93, $sgpr94, $sgpr95, $vgpr63, $sgpr30_sgpr31, $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71, $sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79, $sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87, $sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95
+    ; GCN: liveins: $sgpr10, $sgpr64, $sgpr65, $sgpr66, $sgpr67, $sgpr68, $sgpr69, $sgpr70, $sgpr71, $sgpr80, $sgpr81, $sgpr82, $sgpr83, $sgpr84, $sgpr85, $sgpr86, $sgpr87, $vgpr63, $sgpr30_sgpr31, $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71, $sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79, $sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87, $sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95
     ; GCN-NEXT: {{  $}}
     ; GCN-NEXT: $vgpr63 = SI_SPILL_S32_TO_VGPR killed $sgpr64, 0, $vgpr63
     ; GCN-NEXT: $vgpr63 = SI_SPILL_S32_TO_VGPR killed $sgpr65, 1, $vgpr63
@@ -63,30 +63,14 @@ body:             |
     ; GCN-NEXT: $vgpr63 = SI_SPILL_S32_TO_VGPR killed $sgpr69, 5, $vgpr63
     ; GCN-NEXT: $vgpr63 = SI_SPILL_S32_TO_VGPR killed $sgpr70, 6, $vgpr63
     ; GCN-NEXT: $vgpr63 = SI_SPILL_S32_TO_VGPR killed $sgpr71, 7, $vgpr63
-    ; GCN-NEXT: $vgpr63 = SI_SPILL_S32_TO_VGPR killed $sgpr72, 8, $vgpr63
-    ; GCN-NEXT: $vgpr63 = SI_SPILL_S32_TO_VGPR killed $sgpr73, 9, $vgpr63
-    ; GCN-NEXT: $vgpr63 = SI_SPILL_S32_TO_VGPR killed $sgpr74, 10, $vgpr63
-    ; GCN-NEXT: $vgpr63 = SI_SPILL_S32_TO_VGPR killed $sgpr75, 11, $vgpr63
-    ; GCN-NEXT: $vgpr63 = SI_SPILL_S32_TO_VGPR killed $sgpr76, 12, $vgpr63
-    ; GCN-NEXT: $vgpr63 = SI_SPILL_S32_TO_VGPR killed $sgpr77, 13, $vgpr63
-    ; GCN-NEXT: $vgpr63 = SI_SPILL_S32_TO_VGPR killed $sgpr78, 14, $vgpr63
-    ; GCN-NEXT: $vgpr63 = SI_SPILL_S32_TO_VGPR killed $sgpr79, 15, $vgpr63
-    ; GCN-NEXT: $vgpr63 = SI_SPILL_S32_TO_VGPR killed $sgpr80, 16, $vgpr63
-    ; GCN-NEXT: $vgpr63 = SI_SPILL_S32_TO_VGPR killed $sgpr81, 17, $vgpr63
-    ; GCN-NEXT: $vgpr63 = SI_SPILL_S32_TO_VGPR killed $sgpr82, 18, $vgpr63
-    ; GCN-NEXT: $vgpr63 = SI_SPILL_S32_TO_VGPR killed $sgpr83, 19, $vgpr63
-    ; GCN-NEXT: $vgpr63 = SI_SPILL_S32_TO_VGPR killed $sgpr84, 20, $vgpr63
-    ; GCN-NEXT: $vgpr63 = SI_SPILL_S32_TO_VGPR killed $sgpr85, 21, $vgpr63
-    ; GCN-NEXT: $vgpr63 = SI_SPILL_S32_TO_VGPR killed $sgpr86, 22, $vgpr63
-    ; GCN-NEXT: $vgpr63 = SI_SPILL_S32_TO_VGPR killed $sgpr87, 23, $vgpr63
-    ; GCN-NEXT: $vgpr63 = SI_SPILL_S32_TO_VGPR killed $sgpr88, 24, $vgpr63
-    ; GCN-NEXT: $vgpr63 = SI_SPILL_S32_TO_VGPR killed $sgpr89, 25, $vgpr63
-    ; GCN-NEXT: $vgpr63 = SI_SPILL_S32_TO_VGPR killed $sgpr90, 26, $vgpr63
-    ; GCN-NEXT: $vgpr63 = SI_SPILL_S32_TO_VGPR killed $sgpr91, 27, $vgpr63
-    ; GCN-NEXT: $vgpr63 = SI_SPILL_S32_TO_VGPR killed $sgpr92, 28, $vgpr63
-    ; GCN-NEXT: $vgpr63 = SI_SPILL_S32_TO_VGPR killed $sgpr93, 29, $vgpr63
-    ; GCN-NEXT: $vgpr63 = SI_SPILL_S32_TO_VGPR killed $sgpr94, 30, $vgpr63
-    ; GCN-NEXT: $vgpr63 = SI_SPILL_S32_TO_VGPR killed $sgpr95, 31, $vgpr63
+    ; GCN-NEXT: $vgpr63 = SI_SPILL_S32_TO_VGPR killed $sgpr80, 8, $vgpr63
+    ; GCN-NEXT: $vgpr63 = SI_SPILL_S32_TO_VGPR killed $sgpr81, 9, $vgpr63
+    ; GCN-NEXT: $vgpr63 = SI_SPILL_S32_TO_VGPR killed $sgpr82, 10, $vgpr63
+    ; GCN-NEXT: $vgpr63 = SI_SPILL_S32_TO_VGPR killed $sgpr83, 11, $vgpr63
+    ; GCN-NEXT: $vgpr63 = SI_SPILL_S32_TO_VGPR killed $sgpr84, 12, $vgpr63
+    ; GCN-NEXT: $vgpr63 = SI_SPILL_S32_TO_VGPR killed $sgpr85, 13, $vgpr63
+    ; GCN-NEXT: $vgpr63 = SI_SPILL_S32_TO_VGPR killed $sgpr86, 14, $vgpr63
+    ; GCN-NEXT: $vgpr63 = SI_SPILL_S32_TO_VGPR killed $sgpr87, 15, $vgpr63
     ; GCN-NEXT: S_NOP 0
     ; GCN-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
     ; GCN-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = SI_SPILL_S32_TO_VGPR killed $sgpr10, 0, [[DEF]]

diff  --git a/llvm/test/CodeGen/AMDGPU/spill-sgpr-used-for-exec-copy.mir b/llvm/test/CodeGen/AMDGPU/spill-sgpr-used-for-exec-copy.mir
index 9b0f52cb39b01..6e8a5126ca823 100644
--- a/llvm/test/CodeGen/AMDGPU/spill-sgpr-used-for-exec-copy.mir
+++ b/llvm/test/CodeGen/AMDGPU/spill-sgpr-used-for-exec-copy.mir
@@ -56,20 +56,15 @@ body:             |
   bb.0:
     liveins: $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $sgpr14, $sgpr15, $sgpr16, $sgpr17, $sgpr18, $sgpr19, $sgpr20, $sgpr21, $sgpr22, $sgpr23, $sgpr24, $sgpr25, $sgpr26, $sgpr27, $sgpr30_sgpr31, $vcc, $vgpr0
     ; GCN-LABEL: name: spill_exec_copy_reserved_reg
-    ; GCN: liveins: $vcc, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $sgpr14, $sgpr15, $sgpr16, $sgpr17, $sgpr18, $sgpr19, $sgpr20, $sgpr21, $sgpr22, $sgpr23, $sgpr24, $sgpr25, $sgpr26, $sgpr27, $vgpr0, $vgpr2, $sgpr30_sgpr31
+    ; GCN: liveins: $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $sgpr14, $sgpr15, $sgpr16, $sgpr17, $sgpr18, $sgpr19, $sgpr20, $sgpr21, $sgpr22, $sgpr23, $sgpr24, $sgpr25, $sgpr26, $sgpr27, $sgpr30_sgpr31, $vcc, $vgpr0
     ; GCN-NEXT: {{  $}}
-    ; GCN-NEXT: $sgpr28_sgpr29 = S_XOR_SAVEEXEC_B64 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec
-    ; GCN-NEXT: BUFFER_STORE_DWORD_OFFSET $vgpr2, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, implicit $exec :: (store (s32) into %stack.1, addrspace 5)
-    ; GCN-NEXT: $exec = S_MOV_B64 killed $sgpr28_sgpr29
-    ; GCN-NEXT: $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr34, 0, undef $vgpr2
-    ; GCN-NEXT: $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr35, 1, undef $vgpr2
     ; GCN-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR $sgpr30, 0, killed $vgpr0
     ; GCN-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR $sgpr31, 1, killed $vgpr0
-    ; GCN-NEXT: $sgpr34_sgpr35 = S_OR_SAVEEXEC_B64 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec
+    ; GCN-NEXT: $sgpr40_sgpr41 = S_OR_SAVEEXEC_B64 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec
     ; GCN-NEXT: $sgpr28_sgpr29 = IMPLICIT_DEF
     ; GCN-NEXT: $vgpr1 = COPY $vgpr0
     ; GCN-NEXT: S_NOP 0, implicit $sgpr28_sgpr29
-    ; GCN-NEXT: $exec = S_MOV_B64 killed $sgpr34_sgpr35
+    ; GCN-NEXT: $exec = S_MOV_B64 killed $sgpr40_sgpr41
     ; GCN-NEXT: $sgpr30 = SI_RESTORE_S32_FROM_VGPR $vgpr1, 0
     ; GCN-NEXT: $sgpr31 = SI_RESTORE_S32_FROM_VGPR $vgpr1, 1
     ; GCN-NEXT: S_SETPC_B64 $sgpr30_sgpr31, implicit $sgpr4, implicit $sgpr5, implicit $sgpr6, implicit $sgpr7, implicit $sgpr8_sgpr9_sgpr10, implicit $sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $sgpr15, implicit $sgpr17, implicit $sgpr18, implicit $sgpr19, implicit $sgpr20, implicit $sgpr21, implicit $sgpr22, implicit $sgpr23, implicit $sgpr24, implicit $sgpr25, implicit $sgpr26, implicit $sgpr27, implicit $vcc

diff  --git a/llvm/test/CodeGen/AMDGPU/spill_more_than_wavesize_csr_sgprs.ll b/llvm/test/CodeGen/AMDGPU/spill_more_than_wavesize_csr_sgprs.ll
index 0d6bccad89d82..fba85455ef693 100644
--- a/llvm/test/CodeGen/AMDGPU/spill_more_than_wavesize_csr_sgprs.ll
+++ b/llvm/test/CodeGen/AMDGPU/spill_more_than_wavesize_csr_sgprs.ll
@@ -7,141 +7,76 @@ define void @spill_more_than_wavesize_csr_sgprs() {
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; CHECK-NEXT:    buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
-; CHECK-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
 ; CHECK-NEXT:    s_mov_b64 exec, s[4:5]
 ; CHECK-NEXT:    v_writelane_b32 v0, s35, 0
 ; CHECK-NEXT:    v_writelane_b32 v0, s36, 1
 ; CHECK-NEXT:    v_writelane_b32 v0, s37, 2
 ; CHECK-NEXT:    v_writelane_b32 v0, s38, 3
 ; CHECK-NEXT:    v_writelane_b32 v0, s39, 4
-; CHECK-NEXT:    v_writelane_b32 v0, s40, 5
-; CHECK-NEXT:    v_writelane_b32 v0, s41, 6
-; CHECK-NEXT:    v_writelane_b32 v0, s42, 7
-; CHECK-NEXT:    v_writelane_b32 v0, s43, 8
-; CHECK-NEXT:    v_writelane_b32 v0, s44, 9
-; CHECK-NEXT:    v_writelane_b32 v0, s45, 10
-; CHECK-NEXT:    v_writelane_b32 v0, s46, 11
-; CHECK-NEXT:    v_writelane_b32 v0, s47, 12
-; CHECK-NEXT:    v_writelane_b32 v0, s48, 13
-; CHECK-NEXT:    v_writelane_b32 v0, s49, 14
-; CHECK-NEXT:    v_writelane_b32 v0, s50, 15
-; CHECK-NEXT:    v_writelane_b32 v0, s51, 16
-; CHECK-NEXT:    v_writelane_b32 v0, s52, 17
-; CHECK-NEXT:    v_writelane_b32 v0, s53, 18
-; CHECK-NEXT:    v_writelane_b32 v0, s54, 19
-; CHECK-NEXT:    v_writelane_b32 v0, s55, 20
-; CHECK-NEXT:    v_writelane_b32 v0, s56, 21
-; CHECK-NEXT:    v_writelane_b32 v0, s57, 22
-; CHECK-NEXT:    v_writelane_b32 v0, s58, 23
-; CHECK-NEXT:    v_writelane_b32 v0, s59, 24
-; CHECK-NEXT:    v_writelane_b32 v0, s60, 25
-; CHECK-NEXT:    v_writelane_b32 v0, s61, 26
-; CHECK-NEXT:    v_writelane_b32 v0, s62, 27
-; CHECK-NEXT:    v_writelane_b32 v0, s63, 28
-; CHECK-NEXT:    v_writelane_b32 v0, s64, 29
-; CHECK-NEXT:    v_writelane_b32 v0, s65, 30
-; CHECK-NEXT:    v_writelane_b32 v0, s66, 31
-; CHECK-NEXT:    v_writelane_b32 v0, s67, 32
-; CHECK-NEXT:    v_writelane_b32 v0, s68, 33
-; CHECK-NEXT:    v_writelane_b32 v0, s69, 34
-; CHECK-NEXT:    v_writelane_b32 v0, s70, 35
-; CHECK-NEXT:    v_writelane_b32 v0, s71, 36
-; CHECK-NEXT:    v_writelane_b32 v0, s72, 37
-; CHECK-NEXT:    v_writelane_b32 v0, s73, 38
-; CHECK-NEXT:    v_writelane_b32 v0, s74, 39
-; CHECK-NEXT:    v_writelane_b32 v0, s75, 40
-; CHECK-NEXT:    v_writelane_b32 v0, s76, 41
-; CHECK-NEXT:    v_writelane_b32 v0, s77, 42
-; CHECK-NEXT:    v_writelane_b32 v0, s78, 43
-; CHECK-NEXT:    v_writelane_b32 v0, s79, 44
-; CHECK-NEXT:    v_writelane_b32 v0, s80, 45
-; CHECK-NEXT:    v_writelane_b32 v0, s81, 46
-; CHECK-NEXT:    v_writelane_b32 v0, s82, 47
-; CHECK-NEXT:    v_writelane_b32 v0, s83, 48
-; CHECK-NEXT:    v_writelane_b32 v0, s84, 49
-; CHECK-NEXT:    v_writelane_b32 v0, s85, 50
-; CHECK-NEXT:    v_writelane_b32 v0, s86, 51
-; CHECK-NEXT:    v_writelane_b32 v0, s87, 52
-; CHECK-NEXT:    v_writelane_b32 v0, s88, 53
-; CHECK-NEXT:    v_writelane_b32 v0, s89, 54
-; CHECK-NEXT:    v_writelane_b32 v0, s90, 55
-; CHECK-NEXT:    v_writelane_b32 v0, s91, 56
-; CHECK-NEXT:    v_writelane_b32 v0, s92, 57
-; CHECK-NEXT:    v_writelane_b32 v0, s93, 58
-; CHECK-NEXT:    v_writelane_b32 v0, s94, 59
-; CHECK-NEXT:    v_writelane_b32 v0, s95, 60
-; CHECK-NEXT:    v_writelane_b32 v1, s99, 0
-; CHECK-NEXT:    v_writelane_b32 v0, s96, 61
-; CHECK-NEXT:    v_writelane_b32 v1, s100, 1
-; CHECK-NEXT:    v_writelane_b32 v0, s97, 62
-; CHECK-NEXT:    v_writelane_b32 v1, s101, 2
-; CHECK-NEXT:    v_writelane_b32 v0, s98, 63
-; CHECK-NEXT:    v_writelane_b32 v1, s102, 3
+; CHECK-NEXT:    v_writelane_b32 v0, s48, 5
+; CHECK-NEXT:    v_writelane_b32 v0, s49, 6
+; CHECK-NEXT:    v_writelane_b32 v0, s50, 7
+; CHECK-NEXT:    v_writelane_b32 v0, s51, 8
+; CHECK-NEXT:    v_writelane_b32 v0, s52, 9
+; CHECK-NEXT:    v_writelane_b32 v0, s53, 10
+; CHECK-NEXT:    v_writelane_b32 v0, s54, 11
+; CHECK-NEXT:    v_writelane_b32 v0, s55, 12
+; CHECK-NEXT:    v_writelane_b32 v0, s64, 13
+; CHECK-NEXT:    v_writelane_b32 v0, s65, 14
+; CHECK-NEXT:    v_writelane_b32 v0, s66, 15
+; CHECK-NEXT:    v_writelane_b32 v0, s67, 16
+; CHECK-NEXT:    v_writelane_b32 v0, s68, 17
+; CHECK-NEXT:    v_writelane_b32 v0, s69, 18
+; CHECK-NEXT:    v_writelane_b32 v0, s70, 19
+; CHECK-NEXT:    v_writelane_b32 v0, s71, 20
+; CHECK-NEXT:    v_writelane_b32 v0, s80, 21
+; CHECK-NEXT:    v_writelane_b32 v0, s81, 22
+; CHECK-NEXT:    v_writelane_b32 v0, s82, 23
+; CHECK-NEXT:    v_writelane_b32 v0, s83, 24
+; CHECK-NEXT:    v_writelane_b32 v0, s84, 25
+; CHECK-NEXT:    v_writelane_b32 v0, s85, 26
+; CHECK-NEXT:    v_writelane_b32 v0, s86, 27
+; CHECK-NEXT:    v_writelane_b32 v0, s87, 28
+; CHECK-NEXT:    v_writelane_b32 v0, s96, 29
+; CHECK-NEXT:    v_writelane_b32 v0, s97, 30
+; CHECK-NEXT:    v_writelane_b32 v0, s98, 31
+; CHECK-NEXT:    v_writelane_b32 v0, s99, 32
+; CHECK-NEXT:    v_writelane_b32 v0, s100, 33
+; CHECK-NEXT:    v_writelane_b32 v0, s101, 34
+; CHECK-NEXT:    v_writelane_b32 v0, s102, 35
 ; CHECK-NEXT:    ;;#ASMSTART
 ; CHECK-NEXT:    ;;#ASMEND
-; CHECK-NEXT:    v_readlane_b32 s102, v1, 3
-; CHECK-NEXT:    v_readlane_b32 s101, v1, 2
-; CHECK-NEXT:    v_readlane_b32 s100, v1, 1
-; CHECK-NEXT:    v_readlane_b32 s99, v1, 0
-; CHECK-NEXT:    v_readlane_b32 s98, v0, 63
-; CHECK-NEXT:    v_readlane_b32 s97, v0, 62
-; CHECK-NEXT:    v_readlane_b32 s96, v0, 61
-; CHECK-NEXT:    v_readlane_b32 s95, v0, 60
-; CHECK-NEXT:    v_readlane_b32 s94, v0, 59
-; CHECK-NEXT:    v_readlane_b32 s93, v0, 58
-; CHECK-NEXT:    v_readlane_b32 s92, v0, 57
-; CHECK-NEXT:    v_readlane_b32 s91, v0, 56
-; CHECK-NEXT:    v_readlane_b32 s90, v0, 55
-; CHECK-NEXT:    v_readlane_b32 s89, v0, 54
-; CHECK-NEXT:    v_readlane_b32 s88, v0, 53
-; CHECK-NEXT:    v_readlane_b32 s87, v0, 52
-; CHECK-NEXT:    v_readlane_b32 s86, v0, 51
-; CHECK-NEXT:    v_readlane_b32 s85, v0, 50
-; CHECK-NEXT:    v_readlane_b32 s84, v0, 49
-; CHECK-NEXT:    v_readlane_b32 s83, v0, 48
-; CHECK-NEXT:    v_readlane_b32 s82, v0, 47
-; CHECK-NEXT:    v_readlane_b32 s81, v0, 46
-; CHECK-NEXT:    v_readlane_b32 s80, v0, 45
-; CHECK-NEXT:    v_readlane_b32 s79, v0, 44
-; CHECK-NEXT:    v_readlane_b32 s78, v0, 43
-; CHECK-NEXT:    v_readlane_b32 s77, v0, 42
-; CHECK-NEXT:    v_readlane_b32 s76, v0, 41
-; CHECK-NEXT:    v_readlane_b32 s75, v0, 40
-; CHECK-NEXT:    v_readlane_b32 s74, v0, 39
-; CHECK-NEXT:    v_readlane_b32 s73, v0, 38
-; CHECK-NEXT:    v_readlane_b32 s72, v0, 37
-; CHECK-NEXT:    v_readlane_b32 s71, v0, 36
-; CHECK-NEXT:    v_readlane_b32 s70, v0, 35
-; CHECK-NEXT:    v_readlane_b32 s69, v0, 34
-; CHECK-NEXT:    v_readlane_b32 s68, v0, 33
-; CHECK-NEXT:    v_readlane_b32 s67, v0, 32
-; CHECK-NEXT:    v_readlane_b32 s66, v0, 31
-; CHECK-NEXT:    v_readlane_b32 s65, v0, 30
-; CHECK-NEXT:    v_readlane_b32 s64, v0, 29
-; CHECK-NEXT:    v_readlane_b32 s63, v0, 28
-; CHECK-NEXT:    v_readlane_b32 s62, v0, 27
-; CHECK-NEXT:    v_readlane_b32 s61, v0, 26
-; CHECK-NEXT:    v_readlane_b32 s60, v0, 25
-; CHECK-NEXT:    v_readlane_b32 s59, v0, 24
-; CHECK-NEXT:    v_readlane_b32 s58, v0, 23
-; CHECK-NEXT:    v_readlane_b32 s57, v0, 22
-; CHECK-NEXT:    v_readlane_b32 s56, v0, 21
-; CHECK-NEXT:    v_readlane_b32 s55, v0, 20
-; CHECK-NEXT:    v_readlane_b32 s54, v0, 19
-; CHECK-NEXT:    v_readlane_b32 s53, v0, 18
-; CHECK-NEXT:    v_readlane_b32 s52, v0, 17
-; CHECK-NEXT:    v_readlane_b32 s51, v0, 16
-; CHECK-NEXT:    v_readlane_b32 s50, v0, 15
-; CHECK-NEXT:    v_readlane_b32 s49, v0, 14
-; CHECK-NEXT:    v_readlane_b32 s48, v0, 13
-; CHECK-NEXT:    v_readlane_b32 s47, v0, 12
-; CHECK-NEXT:    v_readlane_b32 s46, v0, 11
-; CHECK-NEXT:    v_readlane_b32 s45, v0, 10
-; CHECK-NEXT:    v_readlane_b32 s44, v0, 9
-; CHECK-NEXT:    v_readlane_b32 s43, v0, 8
-; CHECK-NEXT:    v_readlane_b32 s42, v0, 7
-; CHECK-NEXT:    v_readlane_b32 s41, v0, 6
-; CHECK-NEXT:    v_readlane_b32 s40, v0, 5
+; CHECK-NEXT:    v_readlane_b32 s102, v0, 35
+; CHECK-NEXT:    v_readlane_b32 s101, v0, 34
+; CHECK-NEXT:    v_readlane_b32 s100, v0, 33
+; CHECK-NEXT:    v_readlane_b32 s99, v0, 32
+; CHECK-NEXT:    v_readlane_b32 s98, v0, 31
+; CHECK-NEXT:    v_readlane_b32 s97, v0, 30
+; CHECK-NEXT:    v_readlane_b32 s96, v0, 29
+; CHECK-NEXT:    v_readlane_b32 s87, v0, 28
+; CHECK-NEXT:    v_readlane_b32 s86, v0, 27
+; CHECK-NEXT:    v_readlane_b32 s85, v0, 26
+; CHECK-NEXT:    v_readlane_b32 s84, v0, 25
+; CHECK-NEXT:    v_readlane_b32 s83, v0, 24
+; CHECK-NEXT:    v_readlane_b32 s82, v0, 23
+; CHECK-NEXT:    v_readlane_b32 s81, v0, 22
+; CHECK-NEXT:    v_readlane_b32 s80, v0, 21
+; CHECK-NEXT:    v_readlane_b32 s71, v0, 20
+; CHECK-NEXT:    v_readlane_b32 s70, v0, 19
+; CHECK-NEXT:    v_readlane_b32 s69, v0, 18
+; CHECK-NEXT:    v_readlane_b32 s68, v0, 17
+; CHECK-NEXT:    v_readlane_b32 s67, v0, 16
+; CHECK-NEXT:    v_readlane_b32 s66, v0, 15
+; CHECK-NEXT:    v_readlane_b32 s65, v0, 14
+; CHECK-NEXT:    v_readlane_b32 s64, v0, 13
+; CHECK-NEXT:    v_readlane_b32 s55, v0, 12
+; CHECK-NEXT:    v_readlane_b32 s54, v0, 11
+; CHECK-NEXT:    v_readlane_b32 s53, v0, 10
+; CHECK-NEXT:    v_readlane_b32 s52, v0, 9
+; CHECK-NEXT:    v_readlane_b32 s51, v0, 8
+; CHECK-NEXT:    v_readlane_b32 s50, v0, 7
+; CHECK-NEXT:    v_readlane_b32 s49, v0, 6
+; CHECK-NEXT:    v_readlane_b32 s48, v0, 5
 ; CHECK-NEXT:    v_readlane_b32 s39, v0, 4
 ; CHECK-NEXT:    v_readlane_b32 s38, v0, 3
 ; CHECK-NEXT:    v_readlane_b32 s37, v0, 2
@@ -149,7 +84,6 @@ define void @spill_more_than_wavesize_csr_sgprs() {
 ; CHECK-NEXT:    v_readlane_b32 s35, v0, 0
 ; CHECK-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; CHECK-NEXT:    buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
-; CHECK-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
 ; CHECK-NEXT:    s_mov_b64 exec, s[4:5]
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
@@ -172,144 +106,79 @@ define void @spill_more_than_wavesize_csr_sgprs_with_stack_object() {
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; CHECK-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
-; CHECK-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
 ; CHECK-NEXT:    s_mov_b64 exec, s[4:5]
 ; CHECK-NEXT:    v_writelane_b32 v1, s35, 0
 ; CHECK-NEXT:    v_writelane_b32 v1, s36, 1
 ; CHECK-NEXT:    v_writelane_b32 v1, s37, 2
 ; CHECK-NEXT:    v_writelane_b32 v1, s38, 3
 ; CHECK-NEXT:    v_writelane_b32 v1, s39, 4
-; CHECK-NEXT:    v_writelane_b32 v1, s40, 5
-; CHECK-NEXT:    v_writelane_b32 v1, s41, 6
-; CHECK-NEXT:    v_writelane_b32 v1, s42, 7
-; CHECK-NEXT:    v_writelane_b32 v1, s43, 8
-; CHECK-NEXT:    v_writelane_b32 v1, s44, 9
-; CHECK-NEXT:    v_writelane_b32 v1, s45, 10
-; CHECK-NEXT:    v_writelane_b32 v1, s46, 11
-; CHECK-NEXT:    v_writelane_b32 v1, s47, 12
-; CHECK-NEXT:    v_writelane_b32 v1, s48, 13
-; CHECK-NEXT:    v_writelane_b32 v1, s49, 14
-; CHECK-NEXT:    v_writelane_b32 v1, s50, 15
-; CHECK-NEXT:    v_writelane_b32 v1, s51, 16
-; CHECK-NEXT:    v_writelane_b32 v1, s52, 17
-; CHECK-NEXT:    v_writelane_b32 v1, s53, 18
-; CHECK-NEXT:    v_writelane_b32 v1, s54, 19
-; CHECK-NEXT:    v_writelane_b32 v1, s55, 20
-; CHECK-NEXT:    v_writelane_b32 v1, s56, 21
-; CHECK-NEXT:    v_writelane_b32 v1, s57, 22
-; CHECK-NEXT:    v_writelane_b32 v1, s58, 23
-; CHECK-NEXT:    v_writelane_b32 v1, s59, 24
-; CHECK-NEXT:    v_writelane_b32 v1, s60, 25
-; CHECK-NEXT:    v_writelane_b32 v1, s61, 26
-; CHECK-NEXT:    v_writelane_b32 v1, s62, 27
-; CHECK-NEXT:    v_writelane_b32 v1, s63, 28
-; CHECK-NEXT:    v_writelane_b32 v1, s64, 29
-; CHECK-NEXT:    v_writelane_b32 v1, s65, 30
-; CHECK-NEXT:    v_writelane_b32 v1, s66, 31
-; CHECK-NEXT:    v_writelane_b32 v1, s67, 32
-; CHECK-NEXT:    v_writelane_b32 v1, s68, 33
-; CHECK-NEXT:    v_writelane_b32 v1, s69, 34
-; CHECK-NEXT:    v_writelane_b32 v1, s70, 35
-; CHECK-NEXT:    v_writelane_b32 v1, s71, 36
-; CHECK-NEXT:    v_writelane_b32 v1, s72, 37
-; CHECK-NEXT:    v_writelane_b32 v1, s73, 38
-; CHECK-NEXT:    v_writelane_b32 v1, s74, 39
-; CHECK-NEXT:    v_writelane_b32 v1, s75, 40
-; CHECK-NEXT:    v_writelane_b32 v1, s76, 41
-; CHECK-NEXT:    v_writelane_b32 v1, s77, 42
-; CHECK-NEXT:    v_writelane_b32 v1, s78, 43
-; CHECK-NEXT:    v_writelane_b32 v1, s79, 44
-; CHECK-NEXT:    v_writelane_b32 v1, s80, 45
-; CHECK-NEXT:    v_writelane_b32 v1, s81, 46
-; CHECK-NEXT:    v_writelane_b32 v1, s82, 47
-; CHECK-NEXT:    v_writelane_b32 v1, s83, 48
-; CHECK-NEXT:    v_writelane_b32 v1, s84, 49
-; CHECK-NEXT:    v_writelane_b32 v1, s85, 50
-; CHECK-NEXT:    v_writelane_b32 v1, s86, 51
-; CHECK-NEXT:    v_writelane_b32 v1, s87, 52
-; CHECK-NEXT:    v_writelane_b32 v1, s88, 53
-; CHECK-NEXT:    v_writelane_b32 v1, s89, 54
-; CHECK-NEXT:    v_writelane_b32 v1, s90, 55
-; CHECK-NEXT:    v_writelane_b32 v1, s91, 56
-; CHECK-NEXT:    v_writelane_b32 v1, s92, 57
-; CHECK-NEXT:    v_writelane_b32 v1, s93, 58
-; CHECK-NEXT:    v_writelane_b32 v1, s94, 59
-; CHECK-NEXT:    v_writelane_b32 v1, s95, 60
-; CHECK-NEXT:    v_writelane_b32 v2, s99, 0
-; CHECK-NEXT:    v_writelane_b32 v1, s96, 61
-; CHECK-NEXT:    v_writelane_b32 v2, s100, 1
-; CHECK-NEXT:    v_writelane_b32 v1, s97, 62
-; CHECK-NEXT:    v_writelane_b32 v2, s101, 2
+; CHECK-NEXT:    v_writelane_b32 v1, s48, 5
+; CHECK-NEXT:    v_writelane_b32 v1, s49, 6
+; CHECK-NEXT:    v_writelane_b32 v1, s50, 7
+; CHECK-NEXT:    v_writelane_b32 v1, s51, 8
+; CHECK-NEXT:    v_writelane_b32 v1, s52, 9
+; CHECK-NEXT:    v_writelane_b32 v1, s53, 10
+; CHECK-NEXT:    v_writelane_b32 v1, s54, 11
+; CHECK-NEXT:    v_writelane_b32 v1, s55, 12
+; CHECK-NEXT:    v_writelane_b32 v1, s64, 13
+; CHECK-NEXT:    v_writelane_b32 v1, s65, 14
+; CHECK-NEXT:    v_writelane_b32 v1, s66, 15
+; CHECK-NEXT:    v_writelane_b32 v1, s67, 16
+; CHECK-NEXT:    v_writelane_b32 v1, s68, 17
+; CHECK-NEXT:    v_writelane_b32 v1, s69, 18
+; CHECK-NEXT:    v_writelane_b32 v1, s70, 19
+; CHECK-NEXT:    v_writelane_b32 v1, s71, 20
+; CHECK-NEXT:    v_writelane_b32 v1, s80, 21
+; CHECK-NEXT:    v_writelane_b32 v1, s81, 22
+; CHECK-NEXT:    v_writelane_b32 v1, s82, 23
+; CHECK-NEXT:    v_writelane_b32 v1, s83, 24
+; CHECK-NEXT:    v_writelane_b32 v1, s84, 25
+; CHECK-NEXT:    v_writelane_b32 v1, s85, 26
+; CHECK-NEXT:    v_writelane_b32 v1, s86, 27
+; CHECK-NEXT:    v_writelane_b32 v1, s87, 28
+; CHECK-NEXT:    v_writelane_b32 v1, s96, 29
+; CHECK-NEXT:    v_writelane_b32 v1, s97, 30
+; CHECK-NEXT:    v_writelane_b32 v1, s98, 31
+; CHECK-NEXT:    v_writelane_b32 v1, s99, 32
+; CHECK-NEXT:    v_writelane_b32 v1, s100, 33
+; CHECK-NEXT:    v_writelane_b32 v1, s101, 34
 ; CHECK-NEXT:    v_mov_b32_e32 v0, 0
-; CHECK-NEXT:    v_writelane_b32 v1, s98, 63
-; CHECK-NEXT:    v_writelane_b32 v2, s102, 3
+; CHECK-NEXT:    v_writelane_b32 v1, s102, 35
 ; CHECK-NEXT:    buffer_store_dword v0, off, s[0:3], s32
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
 ; CHECK-NEXT:    ;;#ASMSTART
 ; CHECK-NEXT:    ;;#ASMEND
-; CHECK-NEXT:    v_readlane_b32 s102, v2, 3
-; CHECK-NEXT:    v_readlane_b32 s101, v2, 2
-; CHECK-NEXT:    v_readlane_b32 s100, v2, 1
-; CHECK-NEXT:    v_readlane_b32 s99, v2, 0
-; CHECK-NEXT:    v_readlane_b32 s98, v1, 63
-; CHECK-NEXT:    v_readlane_b32 s97, v1, 62
-; CHECK-NEXT:    v_readlane_b32 s96, v1, 61
-; CHECK-NEXT:    v_readlane_b32 s95, v1, 60
-; CHECK-NEXT:    v_readlane_b32 s94, v1, 59
-; CHECK-NEXT:    v_readlane_b32 s93, v1, 58
-; CHECK-NEXT:    v_readlane_b32 s92, v1, 57
-; CHECK-NEXT:    v_readlane_b32 s91, v1, 56
-; CHECK-NEXT:    v_readlane_b32 s90, v1, 55
-; CHECK-NEXT:    v_readlane_b32 s89, v1, 54
-; CHECK-NEXT:    v_readlane_b32 s88, v1, 53
-; CHECK-NEXT:    v_readlane_b32 s87, v1, 52
-; CHECK-NEXT:    v_readlane_b32 s86, v1, 51
-; CHECK-NEXT:    v_readlane_b32 s85, v1, 50
-; CHECK-NEXT:    v_readlane_b32 s84, v1, 49
-; CHECK-NEXT:    v_readlane_b32 s83, v1, 48
-; CHECK-NEXT:    v_readlane_b32 s82, v1, 47
-; CHECK-NEXT:    v_readlane_b32 s81, v1, 46
-; CHECK-NEXT:    v_readlane_b32 s80, v1, 45
-; CHECK-NEXT:    v_readlane_b32 s79, v1, 44
-; CHECK-NEXT:    v_readlane_b32 s78, v1, 43
-; CHECK-NEXT:    v_readlane_b32 s77, v1, 42
-; CHECK-NEXT:    v_readlane_b32 s76, v1, 41
-; CHECK-NEXT:    v_readlane_b32 s75, v1, 40
-; CHECK-NEXT:    v_readlane_b32 s74, v1, 39
-; CHECK-NEXT:    v_readlane_b32 s73, v1, 38
-; CHECK-NEXT:    v_readlane_b32 s72, v1, 37
-; CHECK-NEXT:    v_readlane_b32 s71, v1, 36
-; CHECK-NEXT:    v_readlane_b32 s70, v1, 35
-; CHECK-NEXT:    v_readlane_b32 s69, v1, 34
-; CHECK-NEXT:    v_readlane_b32 s68, v1, 33
-; CHECK-NEXT:    v_readlane_b32 s67, v1, 32
-; CHECK-NEXT:    v_readlane_b32 s66, v1, 31
-; CHECK-NEXT:    v_readlane_b32 s65, v1, 30
-; CHECK-NEXT:    v_readlane_b32 s64, v1, 29
-; CHECK-NEXT:    v_readlane_b32 s63, v1, 28
-; CHECK-NEXT:    v_readlane_b32 s62, v1, 27
-; CHECK-NEXT:    v_readlane_b32 s61, v1, 26
-; CHECK-NEXT:    v_readlane_b32 s60, v1, 25
-; CHECK-NEXT:    v_readlane_b32 s59, v1, 24
-; CHECK-NEXT:    v_readlane_b32 s58, v1, 23
-; CHECK-NEXT:    v_readlane_b32 s57, v1, 22
-; CHECK-NEXT:    v_readlane_b32 s56, v1, 21
-; CHECK-NEXT:    v_readlane_b32 s55, v1, 20
-; CHECK-NEXT:    v_readlane_b32 s54, v1, 19
-; CHECK-NEXT:    v_readlane_b32 s53, v1, 18
-; CHECK-NEXT:    v_readlane_b32 s52, v1, 17
-; CHECK-NEXT:    v_readlane_b32 s51, v1, 16
-; CHECK-NEXT:    v_readlane_b32 s50, v1, 15
-; CHECK-NEXT:    v_readlane_b32 s49, v1, 14
-; CHECK-NEXT:    v_readlane_b32 s48, v1, 13
-; CHECK-NEXT:    v_readlane_b32 s47, v1, 12
-; CHECK-NEXT:    v_readlane_b32 s46, v1, 11
-; CHECK-NEXT:    v_readlane_b32 s45, v1, 10
-; CHECK-NEXT:    v_readlane_b32 s44, v1, 9
-; CHECK-NEXT:    v_readlane_b32 s43, v1, 8
-; CHECK-NEXT:    v_readlane_b32 s42, v1, 7
-; CHECK-NEXT:    v_readlane_b32 s41, v1, 6
-; CHECK-NEXT:    v_readlane_b32 s40, v1, 5
+; CHECK-NEXT:    v_readlane_b32 s102, v1, 35
+; CHECK-NEXT:    v_readlane_b32 s101, v1, 34
+; CHECK-NEXT:    v_readlane_b32 s100, v1, 33
+; CHECK-NEXT:    v_readlane_b32 s99, v1, 32
+; CHECK-NEXT:    v_readlane_b32 s98, v1, 31
+; CHECK-NEXT:    v_readlane_b32 s97, v1, 30
+; CHECK-NEXT:    v_readlane_b32 s96, v1, 29
+; CHECK-NEXT:    v_readlane_b32 s87, v1, 28
+; CHECK-NEXT:    v_readlane_b32 s86, v1, 27
+; CHECK-NEXT:    v_readlane_b32 s85, v1, 26
+; CHECK-NEXT:    v_readlane_b32 s84, v1, 25
+; CHECK-NEXT:    v_readlane_b32 s83, v1, 24
+; CHECK-NEXT:    v_readlane_b32 s82, v1, 23
+; CHECK-NEXT:    v_readlane_b32 s81, v1, 22
+; CHECK-NEXT:    v_readlane_b32 s80, v1, 21
+; CHECK-NEXT:    v_readlane_b32 s71, v1, 20
+; CHECK-NEXT:    v_readlane_b32 s70, v1, 19
+; CHECK-NEXT:    v_readlane_b32 s69, v1, 18
+; CHECK-NEXT:    v_readlane_b32 s68, v1, 17
+; CHECK-NEXT:    v_readlane_b32 s67, v1, 16
+; CHECK-NEXT:    v_readlane_b32 s66, v1, 15
+; CHECK-NEXT:    v_readlane_b32 s65, v1, 14
+; CHECK-NEXT:    v_readlane_b32 s64, v1, 13
+; CHECK-NEXT:    v_readlane_b32 s55, v1, 12
+; CHECK-NEXT:    v_readlane_b32 s54, v1, 11
+; CHECK-NEXT:    v_readlane_b32 s53, v1, 10
+; CHECK-NEXT:    v_readlane_b32 s52, v1, 9
+; CHECK-NEXT:    v_readlane_b32 s51, v1, 8
+; CHECK-NEXT:    v_readlane_b32 s50, v1, 7
+; CHECK-NEXT:    v_readlane_b32 s49, v1, 6
+; CHECK-NEXT:    v_readlane_b32 s48, v1, 5
 ; CHECK-NEXT:    v_readlane_b32 s39, v1, 4
 ; CHECK-NEXT:    v_readlane_b32 s38, v1, 3
 ; CHECK-NEXT:    v_readlane_b32 s37, v1, 2
@@ -317,7 +186,6 @@ define void @spill_more_than_wavesize_csr_sgprs_with_stack_object() {
 ; CHECK-NEXT:    v_readlane_b32 s35, v1, 0
 ; CHECK-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; CHECK-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
-; CHECK-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
 ; CHECK-NEXT:    s_mov_b64 exec, s[4:5]
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]

diff  --git a/llvm/test/CodeGen/AMDGPU/splitkit-copy-bundle.mir b/llvm/test/CodeGen/AMDGPU/splitkit-copy-bundle.mir
index 8f53ec2f992da..359152e9d2b45 100644
--- a/llvm/test/CodeGen/AMDGPU/splitkit-copy-bundle.mir
+++ b/llvm/test/CodeGen/AMDGPU/splitkit-copy-bundle.mir
@@ -17,70 +17,78 @@ body:             |
   ; RA-NEXT:   [[DEF1:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
   ; RA-NEXT:   undef [[S_MOV_B32_:%[0-9]+]].sub1:sgpr_1024 = S_MOV_B32 -1
   ; RA-NEXT:   [[S_MOV_B32_:%[0-9]+]].sub0:sgpr_1024 = S_MOV_B32 -1
+  ; RA-NEXT:   SI_SPILL_S1024_SAVE [[S_MOV_B32_]], %stack.0, implicit $exec, implicit $sgpr32 :: (store (s1024) into %stack.0, align 4, addrspace 5)
   ; RA-NEXT:   undef [[S_MOV_B32_1:%[0-9]+]].sub0:sgpr_1024 = S_MOV_B32 0
+  ; RA-NEXT:   SI_SPILL_S1024_SAVE [[S_MOV_B32_1]], %stack.1, implicit $exec, implicit $sgpr32 :: (store (s1024) into %stack.1, align 4, addrspace 5)
   ; RA-NEXT: {{  $}}
   ; RA-NEXT: bb.1:
   ; RA-NEXT:   successors: %bb.2(0x80000000)
   ; RA-NEXT: {{  $}}
-  ; RA-NEXT:   [[S_MOV_B32_:%[0-9]+]].sub2:sgpr_1024 = COPY [[S_MOV_B32_]].sub0
-  ; RA-NEXT:   [[S_MOV_B32_:%[0-9]+]].sub3:sgpr_1024 = COPY [[S_MOV_B32_]].sub1
-  ; RA-NEXT:   [[S_MOV_B32_:%[0-9]+]].sub4:sgpr_1024 = COPY [[S_MOV_B32_]].sub0
-  ; RA-NEXT:   [[S_MOV_B32_:%[0-9]+]].sub5:sgpr_1024 = COPY [[S_MOV_B32_]].sub1
-  ; RA-NEXT:   [[S_MOV_B32_:%[0-9]+]].sub6:sgpr_1024 = COPY [[S_MOV_B32_]].sub0
-  ; RA-NEXT:   [[S_MOV_B32_:%[0-9]+]].sub7:sgpr_1024 = COPY [[S_MOV_B32_]].sub1
-  ; RA-NEXT:   [[S_MOV_B32_:%[0-9]+]].sub8:sgpr_1024 = COPY [[S_MOV_B32_]].sub0
-  ; RA-NEXT:   [[S_MOV_B32_:%[0-9]+]].sub9:sgpr_1024 = COPY [[S_MOV_B32_]].sub1
-  ; RA-NEXT:   [[S_MOV_B32_:%[0-9]+]].sub10:sgpr_1024 = COPY [[S_MOV_B32_]].sub0
-  ; RA-NEXT:   [[S_MOV_B32_:%[0-9]+]].sub11:sgpr_1024 = COPY [[S_MOV_B32_]].sub1
-  ; RA-NEXT:   [[S_MOV_B32_:%[0-9]+]].sub12:sgpr_1024 = COPY [[S_MOV_B32_]].sub0
-  ; RA-NEXT:   [[S_MOV_B32_:%[0-9]+]].sub13:sgpr_1024 = COPY [[S_MOV_B32_]].sub1
-  ; RA-NEXT:   [[S_MOV_B32_:%[0-9]+]].sub14:sgpr_1024 = COPY [[S_MOV_B32_]].sub0
-  ; RA-NEXT:   [[S_MOV_B32_:%[0-9]+]].sub15:sgpr_1024 = COPY [[S_MOV_B32_]].sub1
-  ; RA-NEXT:   [[S_MOV_B32_:%[0-9]+]].sub16:sgpr_1024 = COPY [[S_MOV_B32_]].sub0
-  ; RA-NEXT:   [[S_MOV_B32_:%[0-9]+]].sub17:sgpr_1024 = COPY [[S_MOV_B32_]].sub1
-  ; RA-NEXT:   [[S_MOV_B32_:%[0-9]+]].sub18:sgpr_1024 = COPY [[S_MOV_B32_]].sub0
-  ; RA-NEXT:   [[S_MOV_B32_:%[0-9]+]].sub19:sgpr_1024 = COPY [[S_MOV_B32_]].sub1
-  ; RA-NEXT:   [[S_MOV_B32_:%[0-9]+]].sub20:sgpr_1024 = COPY [[S_MOV_B32_]].sub0
-  ; RA-NEXT:   [[S_MOV_B32_:%[0-9]+]].sub21:sgpr_1024 = COPY [[S_MOV_B32_]].sub1
-  ; RA-NEXT:   [[S_MOV_B32_:%[0-9]+]].sub22:sgpr_1024 = COPY [[S_MOV_B32_]].sub0
-  ; RA-NEXT:   [[S_MOV_B32_:%[0-9]+]].sub23:sgpr_1024 = COPY [[S_MOV_B32_]].sub1
-  ; RA-NEXT:   [[S_MOV_B32_:%[0-9]+]].sub24:sgpr_1024 = COPY [[S_MOV_B32_]].sub0
-  ; RA-NEXT:   [[S_MOV_B32_:%[0-9]+]].sub25:sgpr_1024 = COPY [[S_MOV_B32_]].sub1
-  ; RA-NEXT:   [[S_MOV_B32_:%[0-9]+]].sub26:sgpr_1024 = COPY [[S_MOV_B32_]].sub0
-  ; RA-NEXT:   [[S_MOV_B32_:%[0-9]+]].sub27:sgpr_1024 = COPY [[S_MOV_B32_]].sub1
-  ; RA-NEXT:   [[S_MOV_B32_:%[0-9]+]].sub28:sgpr_1024 = COPY [[S_MOV_B32_]].sub0
-  ; RA-NEXT:   [[S_MOV_B32_:%[0-9]+]].sub29:sgpr_1024 = COPY [[S_MOV_B32_]].sub1
-  ; RA-NEXT:   [[S_MOV_B32_1:%[0-9]+]].sub1:sgpr_1024 = COPY [[S_MOV_B32_1]].sub0
-  ; RA-NEXT:   [[S_MOV_B32_1:%[0-9]+]].sub2:sgpr_1024 = COPY [[S_MOV_B32_1]].sub0
-  ; RA-NEXT:   [[S_MOV_B32_1:%[0-9]+]].sub3:sgpr_1024 = COPY [[S_MOV_B32_1]].sub0
-  ; RA-NEXT:   [[S_MOV_B32_1:%[0-9]+]].sub4:sgpr_1024 = COPY [[S_MOV_B32_1]].sub0
-  ; RA-NEXT:   [[S_MOV_B32_1:%[0-9]+]].sub5:sgpr_1024 = COPY [[S_MOV_B32_1]].sub0
-  ; RA-NEXT:   [[S_MOV_B32_1:%[0-9]+]].sub6:sgpr_1024 = COPY [[S_MOV_B32_1]].sub0
-  ; RA-NEXT:   [[S_MOV_B32_1:%[0-9]+]].sub7:sgpr_1024 = COPY [[S_MOV_B32_1]].sub0
-  ; RA-NEXT:   [[S_MOV_B32_1:%[0-9]+]].sub8:sgpr_1024 = COPY [[S_MOV_B32_1]].sub0
-  ; RA-NEXT:   [[S_MOV_B32_1:%[0-9]+]].sub9:sgpr_1024 = COPY [[S_MOV_B32_1]].sub0
-  ; RA-NEXT:   [[S_MOV_B32_1:%[0-9]+]].sub10:sgpr_1024 = COPY [[S_MOV_B32_1]].sub0
-  ; RA-NEXT:   [[S_MOV_B32_1:%[0-9]+]].sub11:sgpr_1024 = COPY [[S_MOV_B32_1]].sub0
-  ; RA-NEXT:   [[S_MOV_B32_1:%[0-9]+]].sub12:sgpr_1024 = COPY [[S_MOV_B32_1]].sub0
-  ; RA-NEXT:   [[S_MOV_B32_1:%[0-9]+]].sub13:sgpr_1024 = COPY [[S_MOV_B32_1]].sub0
-  ; RA-NEXT:   [[S_MOV_B32_1:%[0-9]+]].sub14:sgpr_1024 = COPY [[S_MOV_B32_1]].sub0
-  ; RA-NEXT:   [[S_MOV_B32_1:%[0-9]+]].sub15:sgpr_1024 = COPY [[S_MOV_B32_1]].sub0
-  ; RA-NEXT:   [[S_MOV_B32_1:%[0-9]+]].sub16:sgpr_1024 = COPY [[S_MOV_B32_1]].sub0
-  ; RA-NEXT:   [[S_MOV_B32_1:%[0-9]+]].sub17:sgpr_1024 = COPY [[S_MOV_B32_1]].sub0
-  ; RA-NEXT:   [[S_MOV_B32_1:%[0-9]+]].sub18:sgpr_1024 = COPY [[S_MOV_B32_1]].sub0
-  ; RA-NEXT:   [[S_MOV_B32_1:%[0-9]+]].sub19:sgpr_1024 = COPY [[S_MOV_B32_1]].sub0
-  ; RA-NEXT:   [[S_MOV_B32_1:%[0-9]+]].sub20:sgpr_1024 = COPY [[S_MOV_B32_1]].sub0
-  ; RA-NEXT:   [[S_MOV_B32_1:%[0-9]+]].sub21:sgpr_1024 = COPY [[S_MOV_B32_1]].sub0
-  ; RA-NEXT:   [[S_MOV_B32_1:%[0-9]+]].sub22:sgpr_1024 = COPY [[S_MOV_B32_1]].sub0
-  ; RA-NEXT:   [[S_MOV_B32_1:%[0-9]+]].sub23:sgpr_1024 = COPY [[S_MOV_B32_1]].sub0
-  ; RA-NEXT:   [[S_MOV_B32_1:%[0-9]+]].sub24:sgpr_1024 = COPY [[S_MOV_B32_1]].sub0
-  ; RA-NEXT:   [[S_MOV_B32_1:%[0-9]+]].sub25:sgpr_1024 = COPY [[S_MOV_B32_1]].sub0
-  ; RA-NEXT:   [[S_MOV_B32_1:%[0-9]+]].sub26:sgpr_1024 = COPY [[S_MOV_B32_1]].sub0
-  ; RA-NEXT:   [[S_MOV_B32_1:%[0-9]+]].sub27:sgpr_1024 = COPY [[S_MOV_B32_1]].sub0
-  ; RA-NEXT:   [[S_MOV_B32_1:%[0-9]+]].sub28:sgpr_1024 = COPY [[S_MOV_B32_1]].sub0
-  ; RA-NEXT:   [[S_MOV_B32_1:%[0-9]+]].sub29:sgpr_1024 = COPY [[S_MOV_B32_1]].sub0
-  ; RA-NEXT:   [[S_MOV_B32_1:%[0-9]+]].sub30:sgpr_1024 = COPY [[S_MOV_B32_1]].sub0
-  ; RA-NEXT:   [[S_MOV_B32_1:%[0-9]+]].sub31:sgpr_1024 = COPY [[S_MOV_B32_1]].sub0
+  ; RA-NEXT:   [[SI_SPILL_S1024_RESTORE:%[0-9]+]]:sgpr_1024 = SI_SPILL_S1024_RESTORE %stack.0, implicit $exec, implicit $sgpr32 :: (load (s1024) from %stack.0, align 4, addrspace 5)
+  ; RA-NEXT:   undef [[COPY:%[0-9]+]].sub0_sub1:sgpr_1024 = COPY [[SI_SPILL_S1024_RESTORE]].sub0_sub1
+  ; RA-NEXT:   [[COPY:%[0-9]+]].sub2:sgpr_1024 = COPY [[COPY]].sub0
+  ; RA-NEXT:   [[COPY:%[0-9]+]].sub3:sgpr_1024 = COPY [[COPY]].sub1
+  ; RA-NEXT:   [[COPY:%[0-9]+]].sub4:sgpr_1024 = COPY [[COPY]].sub0
+  ; RA-NEXT:   [[COPY:%[0-9]+]].sub5:sgpr_1024 = COPY [[COPY]].sub1
+  ; RA-NEXT:   [[COPY:%[0-9]+]].sub6:sgpr_1024 = COPY [[COPY]].sub0
+  ; RA-NEXT:   [[COPY:%[0-9]+]].sub7:sgpr_1024 = COPY [[COPY]].sub1
+  ; RA-NEXT:   [[COPY:%[0-9]+]].sub8:sgpr_1024 = COPY [[COPY]].sub0
+  ; RA-NEXT:   [[COPY:%[0-9]+]].sub9:sgpr_1024 = COPY [[COPY]].sub1
+  ; RA-NEXT:   [[COPY:%[0-9]+]].sub10:sgpr_1024 = COPY [[COPY]].sub0
+  ; RA-NEXT:   [[COPY:%[0-9]+]].sub11:sgpr_1024 = COPY [[COPY]].sub1
+  ; RA-NEXT:   [[COPY:%[0-9]+]].sub12:sgpr_1024 = COPY [[COPY]].sub0
+  ; RA-NEXT:   [[COPY:%[0-9]+]].sub13:sgpr_1024 = COPY [[COPY]].sub1
+  ; RA-NEXT:   [[COPY:%[0-9]+]].sub14:sgpr_1024 = COPY [[COPY]].sub0
+  ; RA-NEXT:   [[COPY:%[0-9]+]].sub15:sgpr_1024 = COPY [[COPY]].sub1
+  ; RA-NEXT:   [[COPY:%[0-9]+]].sub16:sgpr_1024 = COPY [[COPY]].sub0
+  ; RA-NEXT:   [[COPY:%[0-9]+]].sub17:sgpr_1024 = COPY [[COPY]].sub1
+  ; RA-NEXT:   [[COPY:%[0-9]+]].sub18:sgpr_1024 = COPY [[COPY]].sub0
+  ; RA-NEXT:   [[COPY:%[0-9]+]].sub19:sgpr_1024 = COPY [[COPY]].sub1
+  ; RA-NEXT:   [[COPY:%[0-9]+]].sub20:sgpr_1024 = COPY [[COPY]].sub0
+  ; RA-NEXT:   [[COPY:%[0-9]+]].sub21:sgpr_1024 = COPY [[COPY]].sub1
+  ; RA-NEXT:   [[COPY:%[0-9]+]].sub22:sgpr_1024 = COPY [[COPY]].sub0
+  ; RA-NEXT:   [[COPY:%[0-9]+]].sub23:sgpr_1024 = COPY [[COPY]].sub1
+  ; RA-NEXT:   [[COPY:%[0-9]+]].sub24:sgpr_1024 = COPY [[COPY]].sub0
+  ; RA-NEXT:   [[COPY:%[0-9]+]].sub25:sgpr_1024 = COPY [[COPY]].sub1
+  ; RA-NEXT:   [[COPY:%[0-9]+]].sub26:sgpr_1024 = COPY [[COPY]].sub0
+  ; RA-NEXT:   [[COPY:%[0-9]+]].sub27:sgpr_1024 = COPY [[COPY]].sub1
+  ; RA-NEXT:   [[COPY:%[0-9]+]].sub28:sgpr_1024 = COPY [[COPY]].sub0
+  ; RA-NEXT:   [[COPY:%[0-9]+]].sub29:sgpr_1024 = COPY [[COPY]].sub1
+  ; RA-NEXT:   SI_SPILL_S1024_SAVE [[COPY]], %stack.0, implicit $exec, implicit $sgpr32 :: (store (s1024) into %stack.0, align 4, addrspace 5)
+  ; RA-NEXT:   [[SI_SPILL_S1024_RESTORE1:%[0-9]+]]:sgpr_1024 = SI_SPILL_S1024_RESTORE %stack.1, implicit $exec, implicit $sgpr32 :: (load (s1024) from %stack.1, align 4, addrspace 5)
+  ; RA-NEXT:   undef [[COPY1:%[0-9]+]].sub0:sgpr_1024 = COPY [[SI_SPILL_S1024_RESTORE1]].sub0
+  ; RA-NEXT:   [[COPY1:%[0-9]+]].sub1:sgpr_1024 = COPY [[COPY1]].sub0
+  ; RA-NEXT:   [[COPY1:%[0-9]+]].sub2:sgpr_1024 = COPY [[COPY1]].sub0
+  ; RA-NEXT:   [[COPY1:%[0-9]+]].sub3:sgpr_1024 = COPY [[COPY1]].sub0
+  ; RA-NEXT:   [[COPY1:%[0-9]+]].sub4:sgpr_1024 = COPY [[COPY1]].sub0
+  ; RA-NEXT:   [[COPY1:%[0-9]+]].sub5:sgpr_1024 = COPY [[COPY1]].sub0
+  ; RA-NEXT:   [[COPY1:%[0-9]+]].sub6:sgpr_1024 = COPY [[COPY1]].sub0
+  ; RA-NEXT:   [[COPY1:%[0-9]+]].sub7:sgpr_1024 = COPY [[COPY1]].sub0
+  ; RA-NEXT:   [[COPY1:%[0-9]+]].sub8:sgpr_1024 = COPY [[COPY1]].sub0
+  ; RA-NEXT:   [[COPY1:%[0-9]+]].sub9:sgpr_1024 = COPY [[COPY1]].sub0
+  ; RA-NEXT:   [[COPY1:%[0-9]+]].sub10:sgpr_1024 = COPY [[COPY1]].sub0
+  ; RA-NEXT:   [[COPY1:%[0-9]+]].sub11:sgpr_1024 = COPY [[COPY1]].sub0
+  ; RA-NEXT:   [[COPY1:%[0-9]+]].sub12:sgpr_1024 = COPY [[COPY1]].sub0
+  ; RA-NEXT:   [[COPY1:%[0-9]+]].sub13:sgpr_1024 = COPY [[COPY1]].sub0
+  ; RA-NEXT:   [[COPY1:%[0-9]+]].sub14:sgpr_1024 = COPY [[COPY1]].sub0
+  ; RA-NEXT:   [[COPY1:%[0-9]+]].sub15:sgpr_1024 = COPY [[COPY1]].sub0
+  ; RA-NEXT:   [[COPY1:%[0-9]+]].sub16:sgpr_1024 = COPY [[COPY1]].sub0
+  ; RA-NEXT:   [[COPY1:%[0-9]+]].sub17:sgpr_1024 = COPY [[COPY1]].sub0
+  ; RA-NEXT:   [[COPY1:%[0-9]+]].sub18:sgpr_1024 = COPY [[COPY1]].sub0
+  ; RA-NEXT:   [[COPY1:%[0-9]+]].sub19:sgpr_1024 = COPY [[COPY1]].sub0
+  ; RA-NEXT:   [[COPY1:%[0-9]+]].sub20:sgpr_1024 = COPY [[COPY1]].sub0
+  ; RA-NEXT:   [[COPY1:%[0-9]+]].sub21:sgpr_1024 = COPY [[COPY1]].sub0
+  ; RA-NEXT:   [[COPY1:%[0-9]+]].sub22:sgpr_1024 = COPY [[COPY1]].sub0
+  ; RA-NEXT:   [[COPY1:%[0-9]+]].sub23:sgpr_1024 = COPY [[COPY1]].sub0
+  ; RA-NEXT:   [[COPY1:%[0-9]+]].sub24:sgpr_1024 = COPY [[COPY1]].sub0
+  ; RA-NEXT:   [[COPY1:%[0-9]+]].sub25:sgpr_1024 = COPY [[COPY1]].sub0
+  ; RA-NEXT:   [[COPY1:%[0-9]+]].sub26:sgpr_1024 = COPY [[COPY1]].sub0
+  ; RA-NEXT:   [[COPY1:%[0-9]+]].sub27:sgpr_1024 = COPY [[COPY1]].sub0
+  ; RA-NEXT:   [[COPY1:%[0-9]+]].sub28:sgpr_1024 = COPY [[COPY1]].sub0
+  ; RA-NEXT:   [[COPY1:%[0-9]+]].sub29:sgpr_1024 = COPY [[COPY1]].sub0
+  ; RA-NEXT:   [[COPY1:%[0-9]+]].sub30:sgpr_1024 = COPY [[COPY1]].sub0
+  ; RA-NEXT:   [[COPY1:%[0-9]+]].sub31:sgpr_1024 = COPY [[COPY1]].sub0
+  ; RA-NEXT:   SI_SPILL_S1024_SAVE [[COPY1]], %stack.1, implicit $exec, implicit $sgpr32 :: (store (s1024) into %stack.1, align 4, addrspace 5)
   ; RA-NEXT: {{  $}}
   ; RA-NEXT: bb.2:
   ; RA-NEXT:   successors: %bb.1(0x40000000), %bb.2(0x40000000)
@@ -95,14 +103,17 @@ body:             |
   ; VR-NEXT: {{  $}}
   ; VR-NEXT:   renamable $sgpr37 = S_MOV_B32 -1
   ; VR-NEXT:   renamable $sgpr36 = S_MOV_B32 -1
-  ; VR-NEXT:   renamable $sgpr68 = S_MOV_B32 0
+  ; VR-NEXT:   SI_SPILL_S1024_SAVE renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55_sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63_sgpr64_sgpr65_sgpr66_sgpr67, %stack.0, implicit $exec, implicit $sgpr32 :: (store (s1024) into %stack.0, align 4, addrspace 5)
+  ; VR-NEXT:   renamable $sgpr36 = S_MOV_B32 0
   ; VR-NEXT:   renamable $sgpr30_sgpr31 = IMPLICIT_DEF
   ; VR-NEXT:   renamable $sgpr34_sgpr35 = IMPLICIT_DEF
+  ; VR-NEXT:   SI_SPILL_S1024_SAVE killed renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55_sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63_sgpr64_sgpr65_sgpr66_sgpr67, %stack.1, implicit $exec, implicit $sgpr32 :: (store (s1024) into %stack.1, align 4, addrspace 5)
   ; VR-NEXT: {{  $}}
   ; VR-NEXT: bb.1:
   ; VR-NEXT:   successors: %bb.2(0x80000000)
-  ; VR-NEXT:   liveins: $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55_sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63_sgpr64_sgpr65_sgpr66_sgpr67:0x000000000000000F, $sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95_sgpr96_sgpr97_sgpr98_sgpr99:0x0000000000000003
+  ; VR-NEXT:   liveins: $sgpr30_sgpr31, $sgpr34_sgpr35
   ; VR-NEXT: {{  $}}
+  ; VR-NEXT:   renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55_sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63_sgpr64_sgpr65_sgpr66_sgpr67 = SI_SPILL_S1024_RESTORE %stack.0, implicit $exec, implicit $sgpr32 :: (load (s1024) from %stack.0, align 4, addrspace 5)
   ; VR-NEXT:   renamable $sgpr38 = COPY renamable $sgpr36
   ; VR-NEXT:   renamable $sgpr39 = COPY renamable $sgpr37
   ; VR-NEXT:   renamable $sgpr40 = COPY renamable $sgpr36
@@ -131,41 +142,44 @@ body:             |
   ; VR-NEXT:   renamable $sgpr63 = COPY renamable $sgpr37
   ; VR-NEXT:   renamable $sgpr64 = COPY renamable $sgpr36
   ; VR-NEXT:   renamable $sgpr65 = COPY renamable $sgpr37
-  ; VR-NEXT:   renamable $sgpr69 = COPY renamable $sgpr68
-  ; VR-NEXT:   renamable $sgpr70 = COPY renamable $sgpr68
-  ; VR-NEXT:   renamable $sgpr71 = COPY renamable $sgpr68
-  ; VR-NEXT:   renamable $sgpr72 = COPY renamable $sgpr68
-  ; VR-NEXT:   renamable $sgpr73 = COPY renamable $sgpr68
-  ; VR-NEXT:   renamable $sgpr74 = COPY renamable $sgpr68
-  ; VR-NEXT:   renamable $sgpr75 = COPY renamable $sgpr68
-  ; VR-NEXT:   renamable $sgpr76 = COPY renamable $sgpr68
-  ; VR-NEXT:   renamable $sgpr77 = COPY renamable $sgpr68
-  ; VR-NEXT:   renamable $sgpr78 = COPY renamable $sgpr68
-  ; VR-NEXT:   renamable $sgpr79 = COPY renamable $sgpr68
-  ; VR-NEXT:   renamable $sgpr80 = COPY renamable $sgpr68
-  ; VR-NEXT:   renamable $sgpr81 = COPY renamable $sgpr68
-  ; VR-NEXT:   renamable $sgpr82 = COPY renamable $sgpr68
-  ; VR-NEXT:   renamable $sgpr83 = COPY renamable $sgpr68
-  ; VR-NEXT:   renamable $sgpr84 = COPY renamable $sgpr68
-  ; VR-NEXT:   renamable $sgpr85 = COPY renamable $sgpr68
-  ; VR-NEXT:   renamable $sgpr86 = COPY renamable $sgpr68
-  ; VR-NEXT:   renamable $sgpr87 = COPY renamable $sgpr68
-  ; VR-NEXT:   renamable $sgpr88 = COPY renamable $sgpr68
-  ; VR-NEXT:   renamable $sgpr89 = COPY renamable $sgpr68
-  ; VR-NEXT:   renamable $sgpr90 = COPY renamable $sgpr68
-  ; VR-NEXT:   renamable $sgpr91 = COPY renamable $sgpr68
-  ; VR-NEXT:   renamable $sgpr92 = COPY renamable $sgpr68
-  ; VR-NEXT:   renamable $sgpr93 = COPY renamable $sgpr68
-  ; VR-NEXT:   renamable $sgpr94 = COPY renamable $sgpr68
-  ; VR-NEXT:   renamable $sgpr95 = COPY renamable $sgpr68
-  ; VR-NEXT:   renamable $sgpr96 = COPY renamable $sgpr68
-  ; VR-NEXT:   renamable $sgpr97 = COPY renamable $sgpr68
-  ; VR-NEXT:   renamable $sgpr98 = COPY renamable $sgpr68
-  ; VR-NEXT:   renamable $sgpr99 = COPY renamable $sgpr68
+  ; VR-NEXT:   SI_SPILL_S1024_SAVE killed renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55_sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63_sgpr64_sgpr65_sgpr66_sgpr67, %stack.0, implicit $exec, implicit $sgpr32 :: (store (s1024) into %stack.0, align 4, addrspace 5)
+  ; VR-NEXT:   renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55_sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63_sgpr64_sgpr65_sgpr66_sgpr67 = SI_SPILL_S1024_RESTORE %stack.1, implicit $exec, implicit $sgpr32 :: (load (s1024) from %stack.1, align 4, addrspace 5)
+  ; VR-NEXT:   renamable $sgpr37 = COPY renamable $sgpr36
+  ; VR-NEXT:   renamable $sgpr38 = COPY renamable $sgpr36
+  ; VR-NEXT:   renamable $sgpr39 = COPY renamable $sgpr36
+  ; VR-NEXT:   renamable $sgpr40 = COPY renamable $sgpr36
+  ; VR-NEXT:   renamable $sgpr41 = COPY renamable $sgpr36
+  ; VR-NEXT:   renamable $sgpr42 = COPY renamable $sgpr36
+  ; VR-NEXT:   renamable $sgpr43 = COPY renamable $sgpr36
+  ; VR-NEXT:   renamable $sgpr44 = COPY renamable $sgpr36
+  ; VR-NEXT:   renamable $sgpr45 = COPY renamable $sgpr36
+  ; VR-NEXT:   renamable $sgpr46 = COPY renamable $sgpr36
+  ; VR-NEXT:   renamable $sgpr47 = COPY renamable $sgpr36
+  ; VR-NEXT:   renamable $sgpr48 = COPY renamable $sgpr36
+  ; VR-NEXT:   renamable $sgpr49 = COPY renamable $sgpr36
+  ; VR-NEXT:   renamable $sgpr50 = COPY renamable $sgpr36
+  ; VR-NEXT:   renamable $sgpr51 = COPY renamable $sgpr36
+  ; VR-NEXT:   renamable $sgpr52 = COPY renamable $sgpr36
+  ; VR-NEXT:   renamable $sgpr53 = COPY renamable $sgpr36
+  ; VR-NEXT:   renamable $sgpr54 = COPY renamable $sgpr36
+  ; VR-NEXT:   renamable $sgpr55 = COPY renamable $sgpr36
+  ; VR-NEXT:   renamable $sgpr56 = COPY renamable $sgpr36
+  ; VR-NEXT:   renamable $sgpr57 = COPY renamable $sgpr36
+  ; VR-NEXT:   renamable $sgpr58 = COPY renamable $sgpr36
+  ; VR-NEXT:   renamable $sgpr59 = COPY renamable $sgpr36
+  ; VR-NEXT:   renamable $sgpr60 = COPY renamable $sgpr36
+  ; VR-NEXT:   renamable $sgpr61 = COPY renamable $sgpr36
+  ; VR-NEXT:   renamable $sgpr62 = COPY renamable $sgpr36
+  ; VR-NEXT:   renamable $sgpr63 = COPY renamable $sgpr36
+  ; VR-NEXT:   renamable $sgpr64 = COPY renamable $sgpr36
+  ; VR-NEXT:   renamable $sgpr65 = COPY renamable $sgpr36
+  ; VR-NEXT:   renamable $sgpr66 = COPY renamable $sgpr36
+  ; VR-NEXT:   renamable $sgpr67 = COPY renamable $sgpr36
+  ; VR-NEXT:   SI_SPILL_S1024_SAVE killed renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55_sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63_sgpr64_sgpr65_sgpr66_sgpr67, %stack.1, implicit $exec, implicit $sgpr32 :: (store (s1024) into %stack.1, align 4, addrspace 5)
   ; VR-NEXT: {{  $}}
   ; VR-NEXT: bb.2:
   ; VR-NEXT:   successors: %bb.1(0x40000000), %bb.2(0x40000000)
-  ; VR-NEXT:   liveins: $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55_sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63_sgpr64_sgpr65_sgpr66_sgpr67:0x000000000000000F, $sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95_sgpr96_sgpr97_sgpr98_sgpr99:0x0000000000000003
+  ; VR-NEXT:   liveins: $sgpr30_sgpr31, $sgpr34_sgpr35
   ; VR-NEXT: {{  $}}
   ; VR-NEXT:   S_NOP 0, csr_amdgpu, implicit renamable $sgpr30_sgpr31, implicit renamable $sgpr34_sgpr35
   ; VR-NEXT:   S_CBRANCH_VCCNZ %bb.1, implicit undef $vcc

diff  --git a/llvm/test/CodeGen/AMDGPU/stack-realign.ll b/llvm/test/CodeGen/AMDGPU/stack-realign.ll
index 0e568e3071e99..4ddde7f297172 100644
--- a/llvm/test/CodeGen/AMDGPU/stack-realign.ll
+++ b/llvm/test/CodeGen/AMDGPU/stack-realign.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
 
 ; Check that we properly realign the stack. While 4-byte access is all
@@ -415,28 +416,21 @@ define void @no_free_scratch_sgpr_for_bp_copy(<32 x i32> %a, i32 %b) #0 {
 ; GCN-LABEL: no_free_scratch_sgpr_for_bp_copy:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    s_mov_b32 vcc_lo, s33
-; GCN-NEXT:    s_add_i32 s33, s32, 0x1fc0
-; GCN-NEXT:    s_and_b32 s33, s33, 0xffffe000
-; GCN-NEXT:    s_xor_saveexec_b64 s[4:5], -1
-; GCN-NEXT:    buffer_store_dword v1, off, s[0:3], s33 offset:132 ; 4-byte Folded Spill
-; GCN-NEXT:    s_mov_b64 exec, s[4:5]
-; GCN-NEXT:    v_writelane_b32 v1, s34, 0
+; GCN-NEXT:    s_mov_b32 s41, s34
 ; GCN-NEXT:    s_mov_b32 s34, s32
 ; GCN-NEXT:    buffer_load_dword v0, off, s[0:3], s34 offset:4
+; GCN-NEXT:    s_mov_b32 s40, s33
+; GCN-NEXT:    s_add_i32 s33, s32, 0x1fc0
+; GCN-NEXT:    s_and_b32 s33, s33, 0xffffe000
 ; GCN-NEXT:    s_addk_i32 s32, 0x6000
 ; GCN-NEXT:    s_mov_b32 s32, s34
-; GCN-NEXT:    v_readlane_b32 s34, v1, 0
+; GCN-NEXT:    s_mov_b32 s34, s41
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    buffer_store_dword v0, off, s[0:3], s33 offset:128
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    ;;#ASMSTART
 ; GCN-NEXT:    ;;#ASMEND
-; GCN-NEXT:    s_xor_saveexec_b64 s[4:5], -1
-; GCN-NEXT:    buffer_load_dword v1, off, s[0:3], s33 offset:132 ; 4-byte Folded Reload
-; GCN-NEXT:    s_mov_b64 exec, s[4:5]
-; GCN-NEXT:    s_mov_b32 s33, vcc_lo
-; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    s_mov_b32 s33, s40
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %local_val = alloca i32, align 128, addrspace(5)
   store volatile i32 %b, ptr addrspace(5) %local_val, align 128
@@ -460,78 +454,45 @@ define void @no_free_regs_spill_bp_to_memory(<32 x i32> %a, i32 %b) #5 {
 ; GCN-NEXT:    s_xor_saveexec_b64 s[6:7], -1
 ; GCN-NEXT:    buffer_store_dword v39, off, s[0:3], s33 offset:132 ; 4-byte Folded Spill
 ; GCN-NEXT:    s_mov_b64 exec, s[6:7]
-; GCN-NEXT:    v_mov_b32_e32 v0, s4
-; GCN-NEXT:    buffer_store_dword v0, off, s[0:3], s33 offset:136 ; 4-byte Folded Spill
-; GCN-NEXT:    v_mov_b32_e32 v0, s34
+; GCN-NEXT:    v_writelane_b32 v39, s4, 32
+; GCN-NEXT:    v_writelane_b32 v39, s34, 33
 ; GCN-NEXT:    s_mov_b32 s34, s32
-; GCN-NEXT:    buffer_store_dword v0, off, s[0:3], s33 offset:140 ; 4-byte Folded Spill
 ; GCN-NEXT:    buffer_load_dword v0, off, s[0:3], s34 offset:4
 ; GCN-NEXT:    v_writelane_b32 v39, s39, 0
-; GCN-NEXT:    v_writelane_b32 v39, s40, 1
-; GCN-NEXT:    v_writelane_b32 v39, s41, 2
-; GCN-NEXT:    v_writelane_b32 v39, s42, 3
-; GCN-NEXT:    v_writelane_b32 v39, s43, 4
-; GCN-NEXT:    v_writelane_b32 v39, s44, 5
-; GCN-NEXT:    v_writelane_b32 v39, s45, 6
-; GCN-NEXT:    v_writelane_b32 v39, s46, 7
-; GCN-NEXT:    v_writelane_b32 v39, s47, 8
-; GCN-NEXT:    v_writelane_b32 v39, s48, 9
-; GCN-NEXT:    v_writelane_b32 v39, s49, 10
-; GCN-NEXT:    v_writelane_b32 v39, s50, 11
-; GCN-NEXT:    v_writelane_b32 v39, s51, 12
-; GCN-NEXT:    v_writelane_b32 v39, s52, 13
-; GCN-NEXT:    v_writelane_b32 v39, s53, 14
-; GCN-NEXT:    v_writelane_b32 v39, s54, 15
-; GCN-NEXT:    v_writelane_b32 v39, s55, 16
-; GCN-NEXT:    v_writelane_b32 v39, s56, 17
-; GCN-NEXT:    v_writelane_b32 v39, s57, 18
-; GCN-NEXT:    v_writelane_b32 v39, s58, 19
-; GCN-NEXT:    v_writelane_b32 v39, s59, 20
-; GCN-NEXT:    v_writelane_b32 v39, s60, 21
-; GCN-NEXT:    v_writelane_b32 v39, s61, 22
-; GCN-NEXT:    v_writelane_b32 v39, s62, 23
-; GCN-NEXT:    v_writelane_b32 v39, s63, 24
-; GCN-NEXT:    v_writelane_b32 v39, s64, 25
-; GCN-NEXT:    v_writelane_b32 v39, s65, 26
-; GCN-NEXT:    v_writelane_b32 v39, s66, 27
-; GCN-NEXT:    v_writelane_b32 v39, s67, 28
-; GCN-NEXT:    v_writelane_b32 v39, s68, 29
-; GCN-NEXT:    v_writelane_b32 v39, s69, 30
-; GCN-NEXT:    v_writelane_b32 v39, s70, 31
-; GCN-NEXT:    v_writelane_b32 v39, s71, 32
-; GCN-NEXT:    v_writelane_b32 v39, s72, 33
-; GCN-NEXT:    v_writelane_b32 v39, s73, 34
-; GCN-NEXT:    v_writelane_b32 v39, s74, 35
-; GCN-NEXT:    v_writelane_b32 v39, s75, 36
-; GCN-NEXT:    v_writelane_b32 v39, s76, 37
-; GCN-NEXT:    v_writelane_b32 v39, s77, 38
-; GCN-NEXT:    v_writelane_b32 v39, s78, 39
-; GCN-NEXT:    v_writelane_b32 v39, s79, 40
-; GCN-NEXT:    v_writelane_b32 v39, s80, 41
-; GCN-NEXT:    v_writelane_b32 v39, s81, 42
-; GCN-NEXT:    v_writelane_b32 v39, s82, 43
-; GCN-NEXT:    v_writelane_b32 v39, s83, 44
-; GCN-NEXT:    v_writelane_b32 v39, s84, 45
-; GCN-NEXT:    v_writelane_b32 v39, s85, 46
-; GCN-NEXT:    v_writelane_b32 v39, s86, 47
-; GCN-NEXT:    v_writelane_b32 v39, s87, 48
-; GCN-NEXT:    v_writelane_b32 v39, s88, 49
-; GCN-NEXT:    v_writelane_b32 v39, s89, 50
-; GCN-NEXT:    v_writelane_b32 v39, s90, 51
-; GCN-NEXT:    v_writelane_b32 v39, s91, 52
-; GCN-NEXT:    v_writelane_b32 v39, s92, 53
-; GCN-NEXT:    v_writelane_b32 v39, s93, 54
-; GCN-NEXT:    v_writelane_b32 v39, s94, 55
-; GCN-NEXT:    v_writelane_b32 v39, s95, 56
-; GCN-NEXT:    v_writelane_b32 v39, s96, 57
-; GCN-NEXT:    v_writelane_b32 v39, s97, 58
-; GCN-NEXT:    v_writelane_b32 v39, s98, 59
-; GCN-NEXT:    v_writelane_b32 v39, s99, 60
-; GCN-NEXT:    v_writelane_b32 v39, s100, 61
-; GCN-NEXT:    v_writelane_b32 v39, s101, 62
-; GCN-NEXT:    v_writelane_b32 v39, s102, 63
+; GCN-NEXT:    v_writelane_b32 v39, s48, 1
+; GCN-NEXT:    v_writelane_b32 v39, s49, 2
+; GCN-NEXT:    v_writelane_b32 v39, s50, 3
+; GCN-NEXT:    v_writelane_b32 v39, s51, 4
+; GCN-NEXT:    v_writelane_b32 v39, s52, 5
+; GCN-NEXT:    v_writelane_b32 v39, s53, 6
+; GCN-NEXT:    v_writelane_b32 v39, s54, 7
+; GCN-NEXT:    v_writelane_b32 v39, s55, 8
+; GCN-NEXT:    v_writelane_b32 v39, s64, 9
+; GCN-NEXT:    v_writelane_b32 v39, s65, 10
+; GCN-NEXT:    v_writelane_b32 v39, s66, 11
+; GCN-NEXT:    v_writelane_b32 v39, s67, 12
+; GCN-NEXT:    v_writelane_b32 v39, s68, 13
+; GCN-NEXT:    v_writelane_b32 v39, s69, 14
+; GCN-NEXT:    v_writelane_b32 v39, s70, 15
+; GCN-NEXT:    v_writelane_b32 v39, s71, 16
+; GCN-NEXT:    v_writelane_b32 v39, s80, 17
+; GCN-NEXT:    v_writelane_b32 v39, s81, 18
+; GCN-NEXT:    v_writelane_b32 v39, s82, 19
+; GCN-NEXT:    v_writelane_b32 v39, s83, 20
+; GCN-NEXT:    v_writelane_b32 v39, s84, 21
+; GCN-NEXT:    v_writelane_b32 v39, s85, 22
+; GCN-NEXT:    v_writelane_b32 v39, s86, 23
+; GCN-NEXT:    v_writelane_b32 v39, s87, 24
+; GCN-NEXT:    v_writelane_b32 v39, s96, 25
+; GCN-NEXT:    v_writelane_b32 v39, s97, 26
+; GCN-NEXT:    v_writelane_b32 v39, s98, 27
+; GCN-NEXT:    v_writelane_b32 v39, s99, 28
+; GCN-NEXT:    v_writelane_b32 v39, s100, 29
+; GCN-NEXT:    v_writelane_b32 v39, s101, 30
 ; GCN-NEXT:    s_addk_i32 s32, 0x6000
+; GCN-NEXT:    v_writelane_b32 v39, s102, 31
 ; GCN-NEXT:    s_mov_b32 s32, s34
+; GCN-NEXT:    v_readlane_b32 s34, v39, 33
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    buffer_store_dword v0, off, s[0:3], s33 offset:128
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
@@ -541,76 +502,39 @@ define void @no_free_regs_spill_bp_to_memory(<32 x i32> %a, i32 %b) #5 {
 ; GCN-NEXT:    ;;#ASMSTART
 ; GCN-NEXT:    ; clobber all VGPRs
 ; GCN-NEXT:    ;;#ASMEND
-; GCN-NEXT:    buffer_load_dword v0, off, s[0:3], s33 offset:136 ; 4-byte Folded Reload
-; GCN-NEXT:    v_readlane_b32 s102, v39, 63
-; GCN-NEXT:    v_readlane_b32 s101, v39, 62
-; GCN-NEXT:    v_readlane_b32 s100, v39, 61
-; GCN-NEXT:    v_readlane_b32 s99, v39, 60
-; GCN-NEXT:    v_readlane_b32 s98, v39, 59
-; GCN-NEXT:    v_readlane_b32 s97, v39, 58
-; GCN-NEXT:    v_readlane_b32 s96, v39, 57
-; GCN-NEXT:    v_readlane_b32 s95, v39, 56
-; GCN-NEXT:    v_readlane_b32 s94, v39, 55
-; GCN-NEXT:    v_readlane_b32 s93, v39, 54
-; GCN-NEXT:    v_readlane_b32 s92, v39, 53
-; GCN-NEXT:    v_readlane_b32 s91, v39, 52
-; GCN-NEXT:    v_readlane_b32 s90, v39, 51
-; GCN-NEXT:    v_readlane_b32 s89, v39, 50
-; GCN-NEXT:    v_readlane_b32 s88, v39, 49
-; GCN-NEXT:    v_readlane_b32 s87, v39, 48
-; GCN-NEXT:    v_readlane_b32 s86, v39, 47
-; GCN-NEXT:    v_readlane_b32 s85, v39, 46
-; GCN-NEXT:    v_readlane_b32 s84, v39, 45
-; GCN-NEXT:    v_readlane_b32 s83, v39, 44
-; GCN-NEXT:    v_readlane_b32 s82, v39, 43
-; GCN-NEXT:    v_readlane_b32 s81, v39, 42
-; GCN-NEXT:    v_readlane_b32 s80, v39, 41
-; GCN-NEXT:    v_readlane_b32 s79, v39, 40
-; GCN-NEXT:    v_readlane_b32 s78, v39, 39
-; GCN-NEXT:    v_readlane_b32 s77, v39, 38
-; GCN-NEXT:    v_readlane_b32 s76, v39, 37
-; GCN-NEXT:    v_readlane_b32 s75, v39, 36
-; GCN-NEXT:    v_readlane_b32 s74, v39, 35
-; GCN-NEXT:    v_readlane_b32 s73, v39, 34
-; GCN-NEXT:    v_readlane_b32 s72, v39, 33
-; GCN-NEXT:    v_readlane_b32 s71, v39, 32
-; GCN-NEXT:    v_readlane_b32 s70, v39, 31
-; GCN-NEXT:    v_readlane_b32 s69, v39, 30
-; GCN-NEXT:    v_readlane_b32 s68, v39, 29
-; GCN-NEXT:    v_readlane_b32 s67, v39, 28
-; GCN-NEXT:    v_readlane_b32 s66, v39, 27
-; GCN-NEXT:    v_readlane_b32 s65, v39, 26
-; GCN-NEXT:    v_readlane_b32 s64, v39, 25
-; GCN-NEXT:    v_readlane_b32 s63, v39, 24
-; GCN-NEXT:    v_readlane_b32 s62, v39, 23
-; GCN-NEXT:    v_readlane_b32 s61, v39, 22
-; GCN-NEXT:    v_readlane_b32 s60, v39, 21
-; GCN-NEXT:    v_readlane_b32 s59, v39, 20
-; GCN-NEXT:    v_readlane_b32 s58, v39, 19
-; GCN-NEXT:    v_readlane_b32 s57, v39, 18
-; GCN-NEXT:    v_readlane_b32 s56, v39, 17
-; GCN-NEXT:    v_readlane_b32 s55, v39, 16
-; GCN-NEXT:    v_readlane_b32 s54, v39, 15
-; GCN-NEXT:    v_readlane_b32 s53, v39, 14
-; GCN-NEXT:    v_readlane_b32 s52, v39, 13
-; GCN-NEXT:    v_readlane_b32 s51, v39, 12
-; GCN-NEXT:    v_readlane_b32 s50, v39, 11
-; GCN-NEXT:    v_readlane_b32 s49, v39, 10
-; GCN-NEXT:    v_readlane_b32 s48, v39, 9
-; GCN-NEXT:    v_readlane_b32 s47, v39, 8
-; GCN-NEXT:    v_readlane_b32 s46, v39, 7
-; GCN-NEXT:    v_readlane_b32 s45, v39, 6
-; GCN-NEXT:    v_readlane_b32 s44, v39, 5
-; GCN-NEXT:    v_readlane_b32 s43, v39, 4
-; GCN-NEXT:    v_readlane_b32 s42, v39, 3
-; GCN-NEXT:    v_readlane_b32 s41, v39, 2
-; GCN-NEXT:    v_readlane_b32 s40, v39, 1
+; GCN-NEXT:    v_readlane_b32 s102, v39, 31
+; GCN-NEXT:    v_readlane_b32 s101, v39, 30
+; GCN-NEXT:    v_readlane_b32 s100, v39, 29
+; GCN-NEXT:    v_readlane_b32 s99, v39, 28
+; GCN-NEXT:    v_readlane_b32 s98, v39, 27
+; GCN-NEXT:    v_readlane_b32 s97, v39, 26
+; GCN-NEXT:    v_readlane_b32 s96, v39, 25
+; GCN-NEXT:    v_readlane_b32 s87, v39, 24
+; GCN-NEXT:    v_readlane_b32 s86, v39, 23
+; GCN-NEXT:    v_readlane_b32 s85, v39, 22
+; GCN-NEXT:    v_readlane_b32 s84, v39, 21
+; GCN-NEXT:    v_readlane_b32 s83, v39, 20
+; GCN-NEXT:    v_readlane_b32 s82, v39, 19
+; GCN-NEXT:    v_readlane_b32 s81, v39, 18
+; GCN-NEXT:    v_readlane_b32 s80, v39, 17
+; GCN-NEXT:    v_readlane_b32 s71, v39, 16
+; GCN-NEXT:    v_readlane_b32 s70, v39, 15
+; GCN-NEXT:    v_readlane_b32 s69, v39, 14
+; GCN-NEXT:    v_readlane_b32 s68, v39, 13
+; GCN-NEXT:    v_readlane_b32 s67, v39, 12
+; GCN-NEXT:    v_readlane_b32 s66, v39, 11
+; GCN-NEXT:    v_readlane_b32 s65, v39, 10
+; GCN-NEXT:    v_readlane_b32 s64, v39, 9
+; GCN-NEXT:    v_readlane_b32 s55, v39, 8
+; GCN-NEXT:    v_readlane_b32 s54, v39, 7
+; GCN-NEXT:    v_readlane_b32 s53, v39, 6
+; GCN-NEXT:    v_readlane_b32 s52, v39, 5
+; GCN-NEXT:    v_readlane_b32 s51, v39, 4
+; GCN-NEXT:    v_readlane_b32 s50, v39, 3
+; GCN-NEXT:    v_readlane_b32 s49, v39, 2
+; GCN-NEXT:    v_readlane_b32 s48, v39, 1
 ; GCN-NEXT:    v_readlane_b32 s39, v39, 0
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    v_readfirstlane_b32 s4, v0
-; GCN-NEXT:    buffer_load_dword v0, off, s[0:3], s33 offset:140 ; 4-byte Folded Reload
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    v_readfirstlane_b32 s34, v0
+; GCN-NEXT:    v_readlane_b32 s4, v39, 32
 ; GCN-NEXT:    s_xor_saveexec_b64 s[6:7], -1
 ; GCN-NEXT:    buffer_load_dword v39, off, s[0:3], s33 offset:132 ; 4-byte Folded Reload
 ; GCN-NEXT:    s_mov_b64 exec, s[6:7]
@@ -653,81 +577,46 @@ define void @spill_bp_to_memory_scratch_reg_needed_mubuf_offset(<32 x i32> %a, i
 ; GCN-NEXT:    s_add_i32 s5, s33, 0x42100
 ; GCN-NEXT:    buffer_store_dword v39, off, s[0:3], s5 ; 4-byte Folded Spill
 ; GCN-NEXT:    s_mov_b64 exec, s[6:7]
-; GCN-NEXT:    v_mov_b32_e32 v0, s4
-; GCN-NEXT:    s_add_i32 s5, s33, 0x42200
-; GCN-NEXT:    buffer_store_dword v0, off, s[0:3], s5 ; 4-byte Folded Spill
-; GCN-NEXT:    v_mov_b32_e32 v0, s34
-; GCN-NEXT:    s_add_i32 s5, s33, 0x42300
+; GCN-NEXT:    v_writelane_b32 v39, s4, 32
+; GCN-NEXT:    v_writelane_b32 v39, s34, 33
 ; GCN-NEXT:    s_mov_b32 s34, s32
-; GCN-NEXT:    buffer_store_dword v0, off, s[0:3], s5 ; 4-byte Folded Spill
 ; GCN-NEXT:    buffer_load_dword v0, off, s[0:3], s34 offset:4
 ; GCN-NEXT:    v_writelane_b32 v39, s39, 0
-; GCN-NEXT:    v_writelane_b32 v39, s40, 1
-; GCN-NEXT:    v_writelane_b32 v39, s41, 2
-; GCN-NEXT:    v_writelane_b32 v39, s42, 3
-; GCN-NEXT:    v_writelane_b32 v39, s43, 4
-; GCN-NEXT:    v_writelane_b32 v39, s44, 5
-; GCN-NEXT:    v_writelane_b32 v39, s45, 6
-; GCN-NEXT:    v_writelane_b32 v39, s46, 7
-; GCN-NEXT:    v_writelane_b32 v39, s47, 8
-; GCN-NEXT:    v_writelane_b32 v39, s48, 9
-; GCN-NEXT:    v_writelane_b32 v39, s49, 10
-; GCN-NEXT:    v_writelane_b32 v39, s50, 11
-; GCN-NEXT:    v_writelane_b32 v39, s51, 12
-; GCN-NEXT:    v_writelane_b32 v39, s52, 13
-; GCN-NEXT:    v_writelane_b32 v39, s53, 14
-; GCN-NEXT:    v_writelane_b32 v39, s54, 15
-; GCN-NEXT:    v_writelane_b32 v39, s55, 16
-; GCN-NEXT:    v_writelane_b32 v39, s56, 17
-; GCN-NEXT:    v_writelane_b32 v39, s57, 18
-; GCN-NEXT:    v_writelane_b32 v39, s58, 19
-; GCN-NEXT:    v_writelane_b32 v39, s59, 20
-; GCN-NEXT:    v_writelane_b32 v39, s60, 21
-; GCN-NEXT:    v_writelane_b32 v39, s61, 22
-; GCN-NEXT:    v_writelane_b32 v39, s62, 23
-; GCN-NEXT:    v_writelane_b32 v39, s63, 24
-; GCN-NEXT:    v_writelane_b32 v39, s64, 25
-; GCN-NEXT:    v_writelane_b32 v39, s65, 26
-; GCN-NEXT:    v_writelane_b32 v39, s66, 27
-; GCN-NEXT:    v_writelane_b32 v39, s67, 28
-; GCN-NEXT:    v_writelane_b32 v39, s68, 29
-; GCN-NEXT:    v_writelane_b32 v39, s69, 30
-; GCN-NEXT:    v_writelane_b32 v39, s70, 31
-; GCN-NEXT:    v_writelane_b32 v39, s71, 32
-; GCN-NEXT:    v_writelane_b32 v39, s72, 33
-; GCN-NEXT:    v_writelane_b32 v39, s73, 34
-; GCN-NEXT:    v_writelane_b32 v39, s74, 35
-; GCN-NEXT:    v_writelane_b32 v39, s75, 36
-; GCN-NEXT:    v_writelane_b32 v39, s76, 37
-; GCN-NEXT:    v_writelane_b32 v39, s77, 38
-; GCN-NEXT:    v_writelane_b32 v39, s78, 39
-; GCN-NEXT:    v_writelane_b32 v39, s79, 40
-; GCN-NEXT:    v_writelane_b32 v39, s80, 41
-; GCN-NEXT:    v_writelane_b32 v39, s81, 42
-; GCN-NEXT:    v_writelane_b32 v39, s82, 43
-; GCN-NEXT:    v_writelane_b32 v39, s83, 44
-; GCN-NEXT:    v_writelane_b32 v39, s84, 45
-; GCN-NEXT:    v_writelane_b32 v39, s85, 46
-; GCN-NEXT:    v_writelane_b32 v39, s86, 47
-; GCN-NEXT:    v_writelane_b32 v39, s87, 48
-; GCN-NEXT:    v_writelane_b32 v39, s88, 49
-; GCN-NEXT:    v_writelane_b32 v39, s89, 50
-; GCN-NEXT:    v_writelane_b32 v39, s90, 51
-; GCN-NEXT:    v_writelane_b32 v39, s91, 52
-; GCN-NEXT:    v_writelane_b32 v39, s92, 53
-; GCN-NEXT:    v_writelane_b32 v39, s93, 54
-; GCN-NEXT:    v_writelane_b32 v39, s94, 55
-; GCN-NEXT:    v_writelane_b32 v39, s95, 56
-; GCN-NEXT:    v_writelane_b32 v39, s96, 57
-; GCN-NEXT:    v_writelane_b32 v39, s97, 58
-; GCN-NEXT:    v_writelane_b32 v39, s98, 59
-; GCN-NEXT:    v_writelane_b32 v39, s99, 60
-; GCN-NEXT:    v_writelane_b32 v39, s100, 61
-; GCN-NEXT:    v_writelane_b32 v39, s101, 62
+; GCN-NEXT:    v_writelane_b32 v39, s48, 1
+; GCN-NEXT:    v_writelane_b32 v39, s49, 2
+; GCN-NEXT:    v_writelane_b32 v39, s50, 3
+; GCN-NEXT:    v_writelane_b32 v39, s51, 4
+; GCN-NEXT:    v_writelane_b32 v39, s52, 5
+; GCN-NEXT:    v_writelane_b32 v39, s53, 6
+; GCN-NEXT:    v_writelane_b32 v39, s54, 7
+; GCN-NEXT:    v_writelane_b32 v39, s55, 8
+; GCN-NEXT:    v_writelane_b32 v39, s64, 9
+; GCN-NEXT:    v_writelane_b32 v39, s65, 10
+; GCN-NEXT:    v_writelane_b32 v39, s66, 11
+; GCN-NEXT:    v_writelane_b32 v39, s67, 12
+; GCN-NEXT:    v_writelane_b32 v39, s68, 13
+; GCN-NEXT:    v_writelane_b32 v39, s69, 14
+; GCN-NEXT:    v_writelane_b32 v39, s70, 15
+; GCN-NEXT:    v_writelane_b32 v39, s71, 16
+; GCN-NEXT:    v_writelane_b32 v39, s80, 17
+; GCN-NEXT:    v_writelane_b32 v39, s81, 18
+; GCN-NEXT:    v_writelane_b32 v39, s82, 19
+; GCN-NEXT:    v_writelane_b32 v39, s83, 20
+; GCN-NEXT:    v_writelane_b32 v39, s84, 21
+; GCN-NEXT:    v_writelane_b32 v39, s85, 22
+; GCN-NEXT:    v_writelane_b32 v39, s86, 23
+; GCN-NEXT:    v_writelane_b32 v39, s87, 24
+; GCN-NEXT:    v_writelane_b32 v39, s96, 25
+; GCN-NEXT:    v_writelane_b32 v39, s97, 26
+; GCN-NEXT:    v_writelane_b32 v39, s98, 27
+; GCN-NEXT:    v_writelane_b32 v39, s99, 28
+; GCN-NEXT:    v_writelane_b32 v39, s100, 29
+; GCN-NEXT:    v_writelane_b32 v39, s101, 30
 ; GCN-NEXT:    v_mov_b32_e32 v1, 0x1080
-; GCN-NEXT:    v_writelane_b32 v39, s102, 63
 ; GCN-NEXT:    s_add_i32 s32, s32, 0x46000
+; GCN-NEXT:    v_writelane_b32 v39, s102, 31
 ; GCN-NEXT:    s_mov_b32 s32, s34
+; GCN-NEXT:    v_readlane_b32 s34, v39, 33
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    buffer_store_dword v0, v1, s[0:3], s33 offen
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
@@ -737,78 +626,39 @@ define void @spill_bp_to_memory_scratch_reg_needed_mubuf_offset(<32 x i32> %a, i
 ; GCN-NEXT:    ;;#ASMSTART
 ; GCN-NEXT:    ; clobber all VGPRs
 ; GCN-NEXT:    ;;#ASMEND
-; GCN-NEXT:    s_add_i32 s5, s33, 0x42200
-; GCN-NEXT:    buffer_load_dword v0, off, s[0:3], s5 ; 4-byte Folded Reload
-; GCN-NEXT:    s_add_i32 s5, s33, 0x42300
-; GCN-NEXT:    v_readlane_b32 s102, v39, 63
-; GCN-NEXT:    v_readlane_b32 s101, v39, 62
-; GCN-NEXT:    v_readlane_b32 s100, v39, 61
-; GCN-NEXT:    v_readlane_b32 s99, v39, 60
-; GCN-NEXT:    v_readlane_b32 s98, v39, 59
-; GCN-NEXT:    v_readlane_b32 s97, v39, 58
-; GCN-NEXT:    v_readlane_b32 s96, v39, 57
-; GCN-NEXT:    v_readlane_b32 s95, v39, 56
-; GCN-NEXT:    v_readlane_b32 s94, v39, 55
-; GCN-NEXT:    v_readlane_b32 s93, v39, 54
-; GCN-NEXT:    v_readlane_b32 s92, v39, 53
-; GCN-NEXT:    v_readlane_b32 s91, v39, 52
-; GCN-NEXT:    v_readlane_b32 s90, v39, 51
-; GCN-NEXT:    v_readlane_b32 s89, v39, 50
-; GCN-NEXT:    v_readlane_b32 s88, v39, 49
-; GCN-NEXT:    v_readlane_b32 s87, v39, 48
-; GCN-NEXT:    v_readlane_b32 s86, v39, 47
-; GCN-NEXT:    v_readlane_b32 s85, v39, 46
-; GCN-NEXT:    v_readlane_b32 s84, v39, 45
-; GCN-NEXT:    v_readlane_b32 s83, v39, 44
-; GCN-NEXT:    v_readlane_b32 s82, v39, 43
-; GCN-NEXT:    v_readlane_b32 s81, v39, 42
-; GCN-NEXT:    v_readlane_b32 s80, v39, 41
-; GCN-NEXT:    v_readlane_b32 s79, v39, 40
-; GCN-NEXT:    v_readlane_b32 s78, v39, 39
-; GCN-NEXT:    v_readlane_b32 s77, v39, 38
-; GCN-NEXT:    v_readlane_b32 s76, v39, 37
-; GCN-NEXT:    v_readlane_b32 s75, v39, 36
-; GCN-NEXT:    v_readlane_b32 s74, v39, 35
-; GCN-NEXT:    v_readlane_b32 s73, v39, 34
-; GCN-NEXT:    v_readlane_b32 s72, v39, 33
-; GCN-NEXT:    v_readlane_b32 s71, v39, 32
-; GCN-NEXT:    v_readlane_b32 s70, v39, 31
-; GCN-NEXT:    v_readlane_b32 s69, v39, 30
-; GCN-NEXT:    v_readlane_b32 s68, v39, 29
-; GCN-NEXT:    v_readlane_b32 s67, v39, 28
-; GCN-NEXT:    v_readlane_b32 s66, v39, 27
-; GCN-NEXT:    v_readlane_b32 s65, v39, 26
-; GCN-NEXT:    v_readlane_b32 s64, v39, 25
-; GCN-NEXT:    v_readlane_b32 s63, v39, 24
-; GCN-NEXT:    v_readlane_b32 s62, v39, 23
-; GCN-NEXT:    v_readlane_b32 s61, v39, 22
-; GCN-NEXT:    v_readlane_b32 s60, v39, 21
-; GCN-NEXT:    v_readlane_b32 s59, v39, 20
-; GCN-NEXT:    v_readlane_b32 s58, v39, 19
-; GCN-NEXT:    v_readlane_b32 s57, v39, 18
-; GCN-NEXT:    v_readlane_b32 s56, v39, 17
-; GCN-NEXT:    v_readlane_b32 s55, v39, 16
-; GCN-NEXT:    v_readlane_b32 s54, v39, 15
-; GCN-NEXT:    v_readlane_b32 s53, v39, 14
-; GCN-NEXT:    v_readlane_b32 s52, v39, 13
-; GCN-NEXT:    v_readlane_b32 s51, v39, 12
-; GCN-NEXT:    v_readlane_b32 s50, v39, 11
-; GCN-NEXT:    v_readlane_b32 s49, v39, 10
-; GCN-NEXT:    v_readlane_b32 s48, v39, 9
-; GCN-NEXT:    v_readlane_b32 s47, v39, 8
-; GCN-NEXT:    v_readlane_b32 s46, v39, 7
-; GCN-NEXT:    v_readlane_b32 s45, v39, 6
-; GCN-NEXT:    v_readlane_b32 s44, v39, 5
-; GCN-NEXT:    v_readlane_b32 s43, v39, 4
-; GCN-NEXT:    v_readlane_b32 s42, v39, 3
-; GCN-NEXT:    v_readlane_b32 s41, v39, 2
-; GCN-NEXT:    v_readlane_b32 s40, v39, 1
+; GCN-NEXT:    v_readlane_b32 s102, v39, 31
+; GCN-NEXT:    v_readlane_b32 s101, v39, 30
+; GCN-NEXT:    v_readlane_b32 s100, v39, 29
+; GCN-NEXT:    v_readlane_b32 s99, v39, 28
+; GCN-NEXT:    v_readlane_b32 s98, v39, 27
+; GCN-NEXT:    v_readlane_b32 s97, v39, 26
+; GCN-NEXT:    v_readlane_b32 s96, v39, 25
+; GCN-NEXT:    v_readlane_b32 s87, v39, 24
+; GCN-NEXT:    v_readlane_b32 s86, v39, 23
+; GCN-NEXT:    v_readlane_b32 s85, v39, 22
+; GCN-NEXT:    v_readlane_b32 s84, v39, 21
+; GCN-NEXT:    v_readlane_b32 s83, v39, 20
+; GCN-NEXT:    v_readlane_b32 s82, v39, 19
+; GCN-NEXT:    v_readlane_b32 s81, v39, 18
+; GCN-NEXT:    v_readlane_b32 s80, v39, 17
+; GCN-NEXT:    v_readlane_b32 s71, v39, 16
+; GCN-NEXT:    v_readlane_b32 s70, v39, 15
+; GCN-NEXT:    v_readlane_b32 s69, v39, 14
+; GCN-NEXT:    v_readlane_b32 s68, v39, 13
+; GCN-NEXT:    v_readlane_b32 s67, v39, 12
+; GCN-NEXT:    v_readlane_b32 s66, v39, 11
+; GCN-NEXT:    v_readlane_b32 s65, v39, 10
+; GCN-NEXT:    v_readlane_b32 s64, v39, 9
+; GCN-NEXT:    v_readlane_b32 s55, v39, 8
+; GCN-NEXT:    v_readlane_b32 s54, v39, 7
+; GCN-NEXT:    v_readlane_b32 s53, v39, 6
+; GCN-NEXT:    v_readlane_b32 s52, v39, 5
+; GCN-NEXT:    v_readlane_b32 s51, v39, 4
+; GCN-NEXT:    v_readlane_b32 s50, v39, 3
+; GCN-NEXT:    v_readlane_b32 s49, v39, 2
+; GCN-NEXT:    v_readlane_b32 s48, v39, 1
 ; GCN-NEXT:    v_readlane_b32 s39, v39, 0
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    v_readfirstlane_b32 s4, v0
-; GCN-NEXT:    buffer_load_dword v0, off, s[0:3], s5 ; 4-byte Folded Reload
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    v_readfirstlane_b32 s34, v0
+; GCN-NEXT:    v_readlane_b32 s4, v39, 32
 ; GCN-NEXT:    s_xor_saveexec_b64 s[6:7], -1
 ; GCN-NEXT:    s_add_i32 s5, s33, 0x42100
 ; GCN-NEXT:    buffer_load_dword v39, off, s[0:3], s5 ; 4-byte Folded Reload

diff  --git a/llvm/test/CodeGen/AMDGPU/tuple-allocation-failure.ll b/llvm/test/CodeGen/AMDGPU/tuple-allocation-failure.ll
index e3a7f5eee74cb..0ad9573ff27cd 100644
--- a/llvm/test/CodeGen/AMDGPU/tuple-allocation-failure.ll
+++ b/llvm/test/CodeGen/AMDGPU/tuple-allocation-failure.ll
@@ -32,14 +32,14 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
 ; GLOBALNESS1-LABEL: kernel:
 ; GLOBALNESS1:       ; %bb.0: ; %bb
 ; GLOBALNESS1-NEXT:    s_mov_b64 s[36:37], s[6:7]
-; GLOBALNESS1-NEXT:    s_load_dwordx4 s[76:79], s[8:9], 0x0
+; GLOBALNESS1-NEXT:    s_load_dwordx4 s[52:55], s[8:9], 0x0
 ; GLOBALNESS1-NEXT:    s_load_dword s6, s[8:9], 0x14
 ; GLOBALNESS1-NEXT:    v_mov_b32_e32 v42, 0
 ; GLOBALNESS1-NEXT:    v_pk_mov_b32 v[44:45], 0, 0
 ; GLOBALNESS1-NEXT:    global_store_dword v[44:45], v42, off
 ; GLOBALNESS1-NEXT:    s_waitcnt lgkmcnt(0)
-; GLOBALNESS1-NEXT:    global_load_dword v2, v42, s[76:77]
-; GLOBALNESS1-NEXT:    s_mov_b64 s[40:41], s[4:5]
+; GLOBALNESS1-NEXT:    global_load_dword v2, v42, s[52:53]
+; GLOBALNESS1-NEXT:    s_mov_b64 s[48:49], s[4:5]
 ; GLOBALNESS1-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x18
 ; GLOBALNESS1-NEXT:    s_load_dword s7, s[8:9], 0x20
 ; GLOBALNESS1-NEXT:    s_add_u32 flat_scratch_lo, s12, s17
@@ -49,7 +49,7 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
 ; GLOBALNESS1-NEXT:    v_mov_b32_e32 v0, 0
 ; GLOBALNESS1-NEXT:    s_addc_u32 s1, s1, 0
 ; GLOBALNESS1-NEXT:    v_mov_b32_e32 v1, 0x40994400
-; GLOBALNESS1-NEXT:    s_bitcmp1_b32 s78, 0
+; GLOBALNESS1-NEXT:    s_bitcmp1_b32 s54, 0
 ; GLOBALNESS1-NEXT:    s_waitcnt lgkmcnt(0)
 ; GLOBALNESS1-NEXT:    v_cmp_ngt_f64_e32 vcc, s[4:5], v[0:1]
 ; GLOBALNESS1-NEXT:    v_cmp_ngt_f64_e64 s[4:5], s[4:5], 0
@@ -59,25 +59,28 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
 ; GLOBALNESS1-NEXT:    s_xor_b64 s[4:5], s[4:5], -1
 ; GLOBALNESS1-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
 ; GLOBALNESS1-NEXT:    s_bitcmp1_b32 s6, 0
-; GLOBALNESS1-NEXT:    v_cmp_ne_u32_e64 s[42:43], 1, v0
+; GLOBALNESS1-NEXT:    v_cmp_ne_u32_e64 s[50:51], 1, v0
 ; GLOBALNESS1-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
 ; GLOBALNESS1-NEXT:    s_cselect_b64 s[4:5], -1, 0
 ; GLOBALNESS1-NEXT:    s_xor_b64 s[4:5], s[4:5], -1
 ; GLOBALNESS1-NEXT:    s_bitcmp1_b32 s7, 0
-; GLOBALNESS1-NEXT:    v_cmp_ne_u32_e64 s[48:49], 1, v0
+; GLOBALNESS1-NEXT:    v_cmp_ne_u32_e64 s[64:65], 1, v0
 ; GLOBALNESS1-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
 ; GLOBALNESS1-NEXT:    s_cselect_b64 s[4:5], -1, 0
 ; GLOBALNESS1-NEXT:    s_xor_b64 s[4:5], s[4:5], -1
-; GLOBALNESS1-NEXT:    v_cmp_ne_u32_e64 s[50:51], 1, v0
+; GLOBALNESS1-NEXT:    s_mov_b64 s[38:39], s[8:9]
+; GLOBALNESS1-NEXT:    v_cmp_ne_u32_e64 s[8:9], 1, v1
+; GLOBALNESS1-NEXT:    ; implicit-def: $vgpr59 : SGPR spill to VGPR lane
+; GLOBALNESS1-NEXT:    v_cmp_ne_u32_e64 s[66:67], 1, v0
 ; GLOBALNESS1-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
-; GLOBALNESS1-NEXT:    v_cmp_ne_u32_e64 s[52:53], 1, v0
-; GLOBALNESS1-NEXT:    v_cmp_ne_u32_e64 s[44:45], 1, v1
-; GLOBALNESS1-NEXT:    v_cmp_ne_u32_e64 s[46:47], 1, v3
+; GLOBALNESS1-NEXT:    v_writelane_b32 v59, s8, 0
+; GLOBALNESS1-NEXT:    v_cmp_ne_u32_e64 s[68:69], 1, v0
+; GLOBALNESS1-NEXT:    v_writelane_b32 v59, s9, 1
+; GLOBALNESS1-NEXT:    v_cmp_ne_u32_e64 s[70:71], 1, v3
 ; GLOBALNESS1-NEXT:    v_mov_b32_e32 v46, 0x80
-; GLOBALNESS1-NEXT:    s_mov_b32 s70, s16
-; GLOBALNESS1-NEXT:    s_mov_b64 s[38:39], s[8:9]
-; GLOBALNESS1-NEXT:    s_mov_b32 s71, s15
-; GLOBALNESS1-NEXT:    s_mov_b32 s72, s14
+; GLOBALNESS1-NEXT:    s_mov_b32 s82, s16
+; GLOBALNESS1-NEXT:    s_mov_b32 s83, s15
+; GLOBALNESS1-NEXT:    s_mov_b32 s84, s14
 ; GLOBALNESS1-NEXT:    s_mov_b64 s[34:35], s[10:11]
 ; GLOBALNESS1-NEXT:    v_mov_b32_e32 v47, 0
 ; GLOBALNESS1-NEXT:    s_mov_b32 s32, 0
@@ -88,17 +91,27 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
 ; GLOBALNESS1-NEXT:    v_cmp_gt_i32_e32 vcc, 1, v2
 ; GLOBALNESS1-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
 ; GLOBALNESS1-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v2
+; GLOBALNESS1-NEXT:    v_cmp_ne_u32_e64 s[4:5], 1, v0
 ; GLOBALNESS1-NEXT:    v_cndmask_b32_e64 v3, 0, 1, vcc
+; GLOBALNESS1-NEXT:    v_writelane_b32 v59, s4, 2
 ; GLOBALNESS1-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
+; GLOBALNESS1-NEXT:    v_writelane_b32 v59, s5, 3
+; GLOBALNESS1-NEXT:    v_cmp_ne_u32_e64 s[4:5], 1, v3
 ; GLOBALNESS1-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
-; GLOBALNESS1-NEXT:    v_cmp_ne_u32_e64 s[54:55], 1, v0
-; GLOBALNESS1-NEXT:    v_cmp_ne_u32_e64 s[56:57], 1, v1
-; GLOBALNESS1-NEXT:    v_cmp_ne_u32_e64 s[58:59], 1, v3
-; GLOBALNESS1-NEXT:    v_cmp_ne_u32_e64 s[60:61], 1, v2
+; GLOBALNESS1-NEXT:    v_writelane_b32 v59, s4, 4
+; GLOBALNESS1-NEXT:    v_writelane_b32 v59, s5, 5
+; GLOBALNESS1-NEXT:    v_cmp_ne_u32_e64 s[4:5], 1, v2
+; GLOBALNESS1-NEXT:    v_writelane_b32 v59, s4, 6
+; GLOBALNESS1-NEXT:    v_writelane_b32 v59, s5, 7
+; GLOBALNESS1-NEXT:    v_cmp_ne_u32_e64 s[80:81], 1, v1
+; GLOBALNESS1-NEXT:    v_writelane_b32 v59, s70, 8
+; GLOBALNESS1-NEXT:    v_writelane_b32 v59, s71, 9
 ; GLOBALNESS1-NEXT:    s_branch .LBB1_4
 ; GLOBALNESS1-NEXT:  .LBB1_1: ; %bb70.i
 ; GLOBALNESS1-NEXT:    ; in Loop: Header=BB1_4 Depth=1
-; GLOBALNESS1-NEXT:    s_and_b64 vcc, exec, s[60:61]
+; GLOBALNESS1-NEXT:    v_readlane_b32 s6, v59, 6
+; GLOBALNESS1-NEXT:    v_readlane_b32 s7, v59, 7
+; GLOBALNESS1-NEXT:    s_and_b64 vcc, exec, s[6:7]
 ; GLOBALNESS1-NEXT:    s_cbranch_vccz .LBB1_28
 ; GLOBALNESS1-NEXT:  .LBB1_2: ; %Flow15
 ; GLOBALNESS1-NEXT:    ; in Loop: Header=BB1_4 Depth=1
@@ -122,26 +135,26 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
 ; GLOBALNESS1-NEXT:    s_add_u32 s4, s4, wobble at gotpcrel32@lo+4
 ; GLOBALNESS1-NEXT:    s_addc_u32 s5, s5, wobble at gotpcrel32@hi+12
 ; GLOBALNESS1-NEXT:    s_load_dwordx2 s[16:17], s[4:5], 0x0
-; GLOBALNESS1-NEXT:    s_mov_b64 s[4:5], s[40:41]
+; GLOBALNESS1-NEXT:    s_mov_b64 s[4:5], s[48:49]
 ; GLOBALNESS1-NEXT:    s_mov_b64 s[6:7], s[36:37]
 ; GLOBALNESS1-NEXT:    s_mov_b64 s[10:11], s[34:35]
-; GLOBALNESS1-NEXT:    s_mov_b32 s12, s72
-; GLOBALNESS1-NEXT:    s_mov_b32 s13, s71
-; GLOBALNESS1-NEXT:    s_mov_b32 s14, s70
+; GLOBALNESS1-NEXT:    s_mov_b32 s12, s84
+; GLOBALNESS1-NEXT:    s_mov_b32 s13, s83
+; GLOBALNESS1-NEXT:    s_mov_b32 s14, s82
 ; GLOBALNESS1-NEXT:    v_mov_b32_e32 v31, v41
 ; GLOBALNESS1-NEXT:    s_waitcnt lgkmcnt(0)
 ; GLOBALNESS1-NEXT:    s_swappc_b64 s[30:31], s[16:17]
-; GLOBALNESS1-NEXT:    s_and_b64 vcc, exec, s[46:47]
+; GLOBALNESS1-NEXT:    s_and_b64 vcc, exec, s[70:71]
 ; GLOBALNESS1-NEXT:    s_mov_b64 s[6:7], -1
 ; GLOBALNESS1-NEXT:    ; implicit-def: $sgpr4_sgpr5
 ; GLOBALNESS1-NEXT:    s_cbranch_vccnz .LBB1_9
 ; GLOBALNESS1-NEXT:  ; %bb.5: ; %NodeBlock
 ; GLOBALNESS1-NEXT:    ; in Loop: Header=BB1_4 Depth=1
-; GLOBALNESS1-NEXT:    s_cmp_lt_i32 s79, 1
+; GLOBALNESS1-NEXT:    s_cmp_lt_i32 s55, 1
 ; GLOBALNESS1-NEXT:    s_cbranch_scc1 .LBB1_7
 ; GLOBALNESS1-NEXT:  ; %bb.6: ; %LeafBlock12
 ; GLOBALNESS1-NEXT:    ; in Loop: Header=BB1_4 Depth=1
-; GLOBALNESS1-NEXT:    s_cmp_lg_u32 s79, 1
+; GLOBALNESS1-NEXT:    s_cmp_lg_u32 s55, 1
 ; GLOBALNESS1-NEXT:    s_mov_b64 s[4:5], -1
 ; GLOBALNESS1-NEXT:    s_cselect_b64 s[6:7], -1, 0
 ; GLOBALNESS1-NEXT:    s_cbranch_execz .LBB1_8
@@ -151,7 +164,7 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
 ; GLOBALNESS1-NEXT:    ; implicit-def: $sgpr4_sgpr5
 ; GLOBALNESS1-NEXT:  .LBB1_8: ; %LeafBlock
 ; GLOBALNESS1-NEXT:    ; in Loop: Header=BB1_4 Depth=1
-; GLOBALNESS1-NEXT:    s_cmp_lg_u32 s79, 0
+; GLOBALNESS1-NEXT:    s_cmp_lg_u32 s55, 0
 ; GLOBALNESS1-NEXT:    s_mov_b64 s[4:5], 0
 ; GLOBALNESS1-NEXT:    s_cselect_b64 s[6:7], -1, 0
 ; GLOBALNESS1-NEXT:  .LBB1_9: ; %Flow25
@@ -164,15 +177,17 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
 ; GLOBALNESS1-NEXT:    ; in Loop: Header=BB1_4 Depth=1
 ; GLOBALNESS1-NEXT:    flat_load_dword v0, v[44:45]
 ; GLOBALNESS1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GLOBALNESS1-NEXT:    v_cmp_gt_i32_e64 s[62:63], 0, v0
+; GLOBALNESS1-NEXT:    v_cmp_gt_i32_e64 s[86:87], 0, v0
 ; GLOBALNESS1-NEXT:    v_mov_b32_e32 v0, 0
 ; GLOBALNESS1-NEXT:    v_mov_b32_e32 v1, 0x3ff00000
-; GLOBALNESS1-NEXT:    s_and_saveexec_b64 s[74:75], s[62:63]
+; GLOBALNESS1-NEXT:    s_and_saveexec_b64 s[52:53], s[86:87]
 ; GLOBALNESS1-NEXT:    s_cbranch_execz .LBB1_25
 ; GLOBALNESS1-NEXT:  ; %bb.11: ; %bb33.i
 ; GLOBALNESS1-NEXT:    ; in Loop: Header=BB1_4 Depth=1
 ; GLOBALNESS1-NEXT:    global_load_dwordx2 v[0:1], v[44:45], off
-; GLOBALNESS1-NEXT:    s_and_b64 vcc, exec, s[54:55]
+; GLOBALNESS1-NEXT:    v_readlane_b32 s4, v59, 2
+; GLOBALNESS1-NEXT:    v_readlane_b32 s5, v59, 3
+; GLOBALNESS1-NEXT:    s_and_b64 vcc, exec, s[4:5]
 ; GLOBALNESS1-NEXT:    s_cbranch_vccnz .LBB1_13
 ; GLOBALNESS1-NEXT:  ; %bb.12: ; %bb39.i
 ; GLOBALNESS1-NEXT:    ; in Loop: Header=BB1_4 Depth=1
@@ -185,69 +200,71 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
 ; GLOBALNESS1-NEXT:    s_waitcnt vmcnt(0)
 ; GLOBALNESS1-NEXT:    v_cmp_nlt_f64_e32 vcc, 0, v[0:1]
 ; GLOBALNESS1-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
-; GLOBALNESS1-NEXT:    v_cmp_eq_u32_e64 s[64:65], 0, v2
-; GLOBALNESS1-NEXT:    v_cmp_ne_u32_e64 s[66:67], 1, v0
+; GLOBALNESS1-NEXT:    v_cmp_eq_u32_e64 s[96:97], 0, v2
+; GLOBALNESS1-NEXT:    v_cmp_ne_u32_e64 s[98:99], 1, v0
 ; GLOBALNESS1-NEXT:    s_branch .LBB1_16
 ; GLOBALNESS1-NEXT:  .LBB1_14: ; %Flow16
 ; GLOBALNESS1-NEXT:    ; in Loop: Header=BB1_16 Depth=2
 ; GLOBALNESS1-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GLOBALNESS1-NEXT:  .LBB1_15: ; %bb63.i
 ; GLOBALNESS1-NEXT:    ; in Loop: Header=BB1_16 Depth=2
-; GLOBALNESS1-NEXT:    s_and_b64 vcc, exec, s[52:53]
+; GLOBALNESS1-NEXT:    s_and_b64 vcc, exec, s[68:69]
 ; GLOBALNESS1-NEXT:    s_cbranch_vccz .LBB1_24
 ; GLOBALNESS1-NEXT:  .LBB1_16: ; %bb44.i
 ; GLOBALNESS1-NEXT:    ; Parent Loop BB1_4 Depth=1
 ; GLOBALNESS1-NEXT:    ; => This Inner Loop Header: Depth=2
-; GLOBALNESS1-NEXT:    s_and_b64 vcc, exec, s[48:49]
+; GLOBALNESS1-NEXT:    s_and_b64 vcc, exec, s[64:65]
 ; GLOBALNESS1-NEXT:    s_cbranch_vccnz .LBB1_15
 ; GLOBALNESS1-NEXT:  ; %bb.17: ; %bb46.i
 ; GLOBALNESS1-NEXT:    ; in Loop: Header=BB1_16 Depth=2
-; GLOBALNESS1-NEXT:    s_and_b64 vcc, exec, s[50:51]
+; GLOBALNESS1-NEXT:    s_and_b64 vcc, exec, s[66:67]
 ; GLOBALNESS1-NEXT:    s_cbranch_vccnz .LBB1_15
 ; GLOBALNESS1-NEXT:  ; %bb.18: ; %bb50.i
 ; GLOBALNESS1-NEXT:    ; in Loop: Header=BB1_16 Depth=2
-; GLOBALNESS1-NEXT:    s_and_b64 vcc, exec, s[42:43]
+; GLOBALNESS1-NEXT:    s_and_b64 vcc, exec, s[50:51]
 ; GLOBALNESS1-NEXT:    s_cbranch_vccnz .LBB1_21
 ; GLOBALNESS1-NEXT:  ; %bb.19: ; %bb3.i.i
 ; GLOBALNESS1-NEXT:    ; in Loop: Header=BB1_16 Depth=2
-; GLOBALNESS1-NEXT:    s_and_b64 vcc, exec, s[44:45]
+; GLOBALNESS1-NEXT:    v_readlane_b32 s4, v59, 0
+; GLOBALNESS1-NEXT:    v_readlane_b32 s5, v59, 1
+; GLOBALNESS1-NEXT:    s_and_b64 vcc, exec, s[4:5]
 ; GLOBALNESS1-NEXT:    s_cbranch_vccnz .LBB1_21
 ; GLOBALNESS1-NEXT:  ; %bb.20: ; %bb6.i.i
 ; GLOBALNESS1-NEXT:    ; in Loop: Header=BB1_16 Depth=2
-; GLOBALNESS1-NEXT:    s_and_b64 vcc, exec, s[66:67]
+; GLOBALNESS1-NEXT:    s_and_b64 vcc, exec, s[98:99]
 ; GLOBALNESS1-NEXT:  .LBB1_21: ; %spam.exit.i
 ; GLOBALNESS1-NEXT:    ; in Loop: Header=BB1_16 Depth=2
-; GLOBALNESS1-NEXT:    s_and_b64 vcc, exec, s[56:57]
+; GLOBALNESS1-NEXT:    s_and_b64 vcc, exec, s[80:81]
 ; GLOBALNESS1-NEXT:    s_cbranch_vccnz .LBB1_15
 ; GLOBALNESS1-NEXT:  ; %bb.22: ; %bb55.i
 ; GLOBALNESS1-NEXT:    ; in Loop: Header=BB1_16 Depth=2
-; GLOBALNESS1-NEXT:    s_add_u32 s68, s38, 40
-; GLOBALNESS1-NEXT:    s_addc_u32 s69, s39, 0
+; GLOBALNESS1-NEXT:    s_add_u32 s70, s38, 40
+; GLOBALNESS1-NEXT:    s_addc_u32 s71, s39, 0
 ; GLOBALNESS1-NEXT:    s_getpc_b64 s[4:5]
 ; GLOBALNESS1-NEXT:    s_add_u32 s4, s4, wobble at gotpcrel32@lo+4
 ; GLOBALNESS1-NEXT:    s_addc_u32 s5, s5, wobble at gotpcrel32@hi+12
-; GLOBALNESS1-NEXT:    s_load_dwordx2 s[76:77], s[4:5], 0x0
-; GLOBALNESS1-NEXT:    s_mov_b64 s[4:5], s[40:41]
+; GLOBALNESS1-NEXT:    s_load_dwordx2 s[54:55], s[4:5], 0x0
+; GLOBALNESS1-NEXT:    s_mov_b64 s[4:5], s[48:49]
 ; GLOBALNESS1-NEXT:    s_mov_b64 s[6:7], s[36:37]
-; GLOBALNESS1-NEXT:    s_mov_b64 s[8:9], s[68:69]
+; GLOBALNESS1-NEXT:    s_mov_b64 s[8:9], s[70:71]
 ; GLOBALNESS1-NEXT:    s_mov_b64 s[10:11], s[34:35]
-; GLOBALNESS1-NEXT:    s_mov_b32 s12, s72
-; GLOBALNESS1-NEXT:    s_mov_b32 s13, s71
-; GLOBALNESS1-NEXT:    s_mov_b32 s14, s70
+; GLOBALNESS1-NEXT:    s_mov_b32 s12, s84
+; GLOBALNESS1-NEXT:    s_mov_b32 s13, s83
+; GLOBALNESS1-NEXT:    s_mov_b32 s14, s82
 ; GLOBALNESS1-NEXT:    v_mov_b32_e32 v31, v41
 ; GLOBALNESS1-NEXT:    s_waitcnt lgkmcnt(0)
-; GLOBALNESS1-NEXT:    s_swappc_b64 s[30:31], s[76:77]
-; GLOBALNESS1-NEXT:    s_mov_b64 s[4:5], s[40:41]
+; GLOBALNESS1-NEXT:    s_swappc_b64 s[30:31], s[54:55]
+; GLOBALNESS1-NEXT:    s_mov_b64 s[4:5], s[48:49]
 ; GLOBALNESS1-NEXT:    s_mov_b64 s[6:7], s[36:37]
-; GLOBALNESS1-NEXT:    s_mov_b64 s[8:9], s[68:69]
+; GLOBALNESS1-NEXT:    s_mov_b64 s[8:9], s[70:71]
 ; GLOBALNESS1-NEXT:    s_mov_b64 s[10:11], s[34:35]
-; GLOBALNESS1-NEXT:    s_mov_b32 s12, s72
-; GLOBALNESS1-NEXT:    s_mov_b32 s13, s71
-; GLOBALNESS1-NEXT:    s_mov_b32 s14, s70
+; GLOBALNESS1-NEXT:    s_mov_b32 s12, s84
+; GLOBALNESS1-NEXT:    s_mov_b32 s13, s83
+; GLOBALNESS1-NEXT:    s_mov_b32 s14, s82
 ; GLOBALNESS1-NEXT:    v_mov_b32_e32 v31, v41
 ; GLOBALNESS1-NEXT:    global_store_dwordx2 v[44:45], v[56:57], off
-; GLOBALNESS1-NEXT:    s_swappc_b64 s[30:31], s[76:77]
-; GLOBALNESS1-NEXT:    s_and_saveexec_b64 s[4:5], s[64:65]
+; GLOBALNESS1-NEXT:    s_swappc_b64 s[30:31], s[54:55]
+; GLOBALNESS1-NEXT:    s_and_saveexec_b64 s[4:5], s[96:97]
 ; GLOBALNESS1-NEXT:    s_cbranch_execz .LBB1_14
 ; GLOBALNESS1-NEXT:  ; %bb.23: ; %bb62.i
 ; GLOBALNESS1-NEXT:    ; in Loop: Header=BB1_16 Depth=2
@@ -256,15 +273,22 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
 ; GLOBALNESS1-NEXT:    s_branch .LBB1_14
 ; GLOBALNESS1-NEXT:  .LBB1_24: ; %Flow23
 ; GLOBALNESS1-NEXT:    ; in Loop: Header=BB1_4 Depth=1
+; GLOBALNESS1-NEXT:    s_load_dwordx4 s[4:7], s[38:39], 0x0
+; GLOBALNESS1-NEXT:    v_readlane_b32 s70, v59, 8
 ; GLOBALNESS1-NEXT:    v_pk_mov_b32 v[0:1], 0, 0
+; GLOBALNESS1-NEXT:    v_readlane_b32 s71, v59, 9
+; GLOBALNESS1-NEXT:    s_waitcnt lgkmcnt(0)
+; GLOBALNESS1-NEXT:    s_mov_b32 s55, s7
 ; GLOBALNESS1-NEXT:  .LBB1_25: ; %Flow24
 ; GLOBALNESS1-NEXT:    ; in Loop: Header=BB1_4 Depth=1
-; GLOBALNESS1-NEXT:    s_or_b64 exec, exec, s[74:75]
-; GLOBALNESS1-NEXT:    s_and_saveexec_b64 s[4:5], s[62:63]
+; GLOBALNESS1-NEXT:    s_or_b64 exec, exec, s[52:53]
+; GLOBALNESS1-NEXT:    s_and_saveexec_b64 s[4:5], s[86:87]
 ; GLOBALNESS1-NEXT:    s_cbranch_execz .LBB1_2
 ; GLOBALNESS1-NEXT:  ; %bb.26: ; %bb67.i
 ; GLOBALNESS1-NEXT:    ; in Loop: Header=BB1_4 Depth=1
-; GLOBALNESS1-NEXT:    s_and_b64 vcc, exec, s[58:59]
+; GLOBALNESS1-NEXT:    v_readlane_b32 s6, v59, 4
+; GLOBALNESS1-NEXT:    v_readlane_b32 s7, v59, 5
+; GLOBALNESS1-NEXT:    s_and_b64 vcc, exec, s[6:7]
 ; GLOBALNESS1-NEXT:    s_cbranch_vccnz .LBB1_1
 ; GLOBALNESS1-NEXT:  ; %bb.27: ; %bb69.i
 ; GLOBALNESS1-NEXT:    ; in Loop: Header=BB1_4 Depth=1
@@ -286,12 +310,12 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
 ; GLOBALNESS1-NEXT:    s_getpc_b64 s[16:17]
 ; GLOBALNESS1-NEXT:    s_add_u32 s16, s16, widget at rel32@lo+4
 ; GLOBALNESS1-NEXT:    s_addc_u32 s17, s17, widget at rel32@hi+12
-; GLOBALNESS1-NEXT:    s_mov_b64 s[4:5], s[40:41]
+; GLOBALNESS1-NEXT:    s_mov_b64 s[4:5], s[48:49]
 ; GLOBALNESS1-NEXT:    s_mov_b64 s[6:7], s[36:37]
 ; GLOBALNESS1-NEXT:    s_mov_b64 s[10:11], s[34:35]
-; GLOBALNESS1-NEXT:    s_mov_b32 s12, s72
-; GLOBALNESS1-NEXT:    s_mov_b32 s13, s71
-; GLOBALNESS1-NEXT:    s_mov_b32 s14, s70
+; GLOBALNESS1-NEXT:    s_mov_b32 s12, s84
+; GLOBALNESS1-NEXT:    s_mov_b32 s13, s83
+; GLOBALNESS1-NEXT:    s_mov_b32 s14, s82
 ; GLOBALNESS1-NEXT:    v_mov_b32_e32 v31, v41
 ; GLOBALNESS1-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GLOBALNESS1-NEXT:    s_mov_b64 s[4:5], 0
@@ -304,12 +328,12 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
 ; GLOBALNESS1-NEXT:    s_getpc_b64 s[16:17]
 ; GLOBALNESS1-NEXT:    s_add_u32 s16, s16, widget at rel32@lo+4
 ; GLOBALNESS1-NEXT:    s_addc_u32 s17, s17, widget at rel32@hi+12
-; GLOBALNESS1-NEXT:    s_mov_b64 s[4:5], s[40:41]
+; GLOBALNESS1-NEXT:    s_mov_b64 s[4:5], s[48:49]
 ; GLOBALNESS1-NEXT:    s_mov_b64 s[6:7], s[36:37]
 ; GLOBALNESS1-NEXT:    s_mov_b64 s[10:11], s[34:35]
-; GLOBALNESS1-NEXT:    s_mov_b32 s12, s72
-; GLOBALNESS1-NEXT:    s_mov_b32 s13, s71
-; GLOBALNESS1-NEXT:    s_mov_b32 s14, s70
+; GLOBALNESS1-NEXT:    s_mov_b32 s12, s84
+; GLOBALNESS1-NEXT:    s_mov_b32 s13, s83
+; GLOBALNESS1-NEXT:    s_mov_b32 s14, s82
 ; GLOBALNESS1-NEXT:    v_mov_b32_e32 v31, v41
 ; GLOBALNESS1-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GLOBALNESS1-NEXT:  .LBB1_33: ; %UnifiedUnreachableBlock
@@ -317,14 +341,14 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
 ; GLOBALNESS0-LABEL: kernel:
 ; GLOBALNESS0:       ; %bb.0: ; %bb
 ; GLOBALNESS0-NEXT:    s_mov_b64 s[36:37], s[6:7]
-; GLOBALNESS0-NEXT:    s_load_dwordx4 s[72:75], s[8:9], 0x0
+; GLOBALNESS0-NEXT:    s_load_dwordx4 s[52:55], s[8:9], 0x0
 ; GLOBALNESS0-NEXT:    s_load_dword s6, s[8:9], 0x14
 ; GLOBALNESS0-NEXT:    v_mov_b32_e32 v42, 0
 ; GLOBALNESS0-NEXT:    v_pk_mov_b32 v[44:45], 0, 0
 ; GLOBALNESS0-NEXT:    global_store_dword v[44:45], v42, off
 ; GLOBALNESS0-NEXT:    s_waitcnt lgkmcnt(0)
-; GLOBALNESS0-NEXT:    global_load_dword v2, v42, s[72:73]
-; GLOBALNESS0-NEXT:    s_mov_b64 s[40:41], s[4:5]
+; GLOBALNESS0-NEXT:    global_load_dword v2, v42, s[52:53]
+; GLOBALNESS0-NEXT:    s_mov_b64 s[48:49], s[4:5]
 ; GLOBALNESS0-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x18
 ; GLOBALNESS0-NEXT:    s_load_dword s7, s[8:9], 0x20
 ; GLOBALNESS0-NEXT:    s_add_u32 flat_scratch_lo, s12, s17
@@ -334,7 +358,7 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
 ; GLOBALNESS0-NEXT:    v_mov_b32_e32 v0, 0
 ; GLOBALNESS0-NEXT:    s_addc_u32 s1, s1, 0
 ; GLOBALNESS0-NEXT:    v_mov_b32_e32 v1, 0x40994400
-; GLOBALNESS0-NEXT:    s_bitcmp1_b32 s74, 0
+; GLOBALNESS0-NEXT:    s_bitcmp1_b32 s54, 0
 ; GLOBALNESS0-NEXT:    s_waitcnt lgkmcnt(0)
 ; GLOBALNESS0-NEXT:    v_cmp_ngt_f64_e32 vcc, s[4:5], v[0:1]
 ; GLOBALNESS0-NEXT:    v_cmp_ngt_f64_e64 s[4:5], s[4:5], 0
@@ -344,25 +368,28 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
 ; GLOBALNESS0-NEXT:    s_xor_b64 s[4:5], s[4:5], -1
 ; GLOBALNESS0-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
 ; GLOBALNESS0-NEXT:    s_bitcmp1_b32 s6, 0
-; GLOBALNESS0-NEXT:    v_cmp_ne_u32_e64 s[42:43], 1, v0
+; GLOBALNESS0-NEXT:    v_cmp_ne_u32_e64 s[50:51], 1, v0
 ; GLOBALNESS0-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
 ; GLOBALNESS0-NEXT:    s_cselect_b64 s[4:5], -1, 0
 ; GLOBALNESS0-NEXT:    s_xor_b64 s[4:5], s[4:5], -1
 ; GLOBALNESS0-NEXT:    s_bitcmp1_b32 s7, 0
-; GLOBALNESS0-NEXT:    v_cmp_ne_u32_e64 s[48:49], 1, v0
+; GLOBALNESS0-NEXT:    v_cmp_ne_u32_e64 s[64:65], 1, v0
 ; GLOBALNESS0-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
 ; GLOBALNESS0-NEXT:    s_cselect_b64 s[4:5], -1, 0
 ; GLOBALNESS0-NEXT:    s_xor_b64 s[4:5], s[4:5], -1
-; GLOBALNESS0-NEXT:    v_cmp_ne_u32_e64 s[50:51], 1, v0
+; GLOBALNESS0-NEXT:    s_mov_b64 s[38:39], s[8:9]
+; GLOBALNESS0-NEXT:    v_cmp_ne_u32_e64 s[8:9], 1, v1
+; GLOBALNESS0-NEXT:    ; implicit-def: $vgpr59 : SGPR spill to VGPR lane
+; GLOBALNESS0-NEXT:    v_cmp_ne_u32_e64 s[66:67], 1, v0
 ; GLOBALNESS0-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
-; GLOBALNESS0-NEXT:    v_cmp_ne_u32_e64 s[52:53], 1, v0
-; GLOBALNESS0-NEXT:    v_cmp_ne_u32_e64 s[44:45], 1, v1
-; GLOBALNESS0-NEXT:    v_cmp_ne_u32_e64 s[46:47], 1, v3
+; GLOBALNESS0-NEXT:    v_writelane_b32 v59, s8, 0
+; GLOBALNESS0-NEXT:    v_cmp_ne_u32_e64 s[68:69], 1, v0
+; GLOBALNESS0-NEXT:    v_writelane_b32 v59, s9, 1
+; GLOBALNESS0-NEXT:    v_cmp_ne_u32_e64 s[84:85], 1, v3
 ; GLOBALNESS0-NEXT:    v_mov_b32_e32 v46, 0x80
-; GLOBALNESS0-NEXT:    s_mov_b32 s68, s16
-; GLOBALNESS0-NEXT:    s_mov_b64 s[38:39], s[8:9]
-; GLOBALNESS0-NEXT:    s_mov_b32 s69, s15
-; GLOBALNESS0-NEXT:    s_mov_b32 s70, s14
+; GLOBALNESS0-NEXT:    s_mov_b32 s70, s16
+; GLOBALNESS0-NEXT:    s_mov_b32 s71, s15
+; GLOBALNESS0-NEXT:    s_mov_b32 s82, s14
 ; GLOBALNESS0-NEXT:    s_mov_b64 s[34:35], s[10:11]
 ; GLOBALNESS0-NEXT:    v_mov_b32_e32 v47, 0
 ; GLOBALNESS0-NEXT:    s_mov_b32 s32, 0
@@ -373,17 +400,27 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
 ; GLOBALNESS0-NEXT:    v_cmp_gt_i32_e32 vcc, 1, v2
 ; GLOBALNESS0-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
 ; GLOBALNESS0-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v2
+; GLOBALNESS0-NEXT:    v_cmp_ne_u32_e64 s[4:5], 1, v0
 ; GLOBALNESS0-NEXT:    v_cndmask_b32_e64 v3, 0, 1, vcc
+; GLOBALNESS0-NEXT:    v_writelane_b32 v59, s4, 2
 ; GLOBALNESS0-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
+; GLOBALNESS0-NEXT:    v_writelane_b32 v59, s5, 3
+; GLOBALNESS0-NEXT:    v_cmp_ne_u32_e64 s[4:5], 1, v3
 ; GLOBALNESS0-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
-; GLOBALNESS0-NEXT:    v_cmp_ne_u32_e64 s[54:55], 1, v0
-; GLOBALNESS0-NEXT:    v_cmp_ne_u32_e64 s[56:57], 1, v1
-; GLOBALNESS0-NEXT:    v_cmp_ne_u32_e64 s[58:59], 1, v3
-; GLOBALNESS0-NEXT:    v_cmp_ne_u32_e64 s[60:61], 1, v2
+; GLOBALNESS0-NEXT:    v_writelane_b32 v59, s4, 4
+; GLOBALNESS0-NEXT:    v_writelane_b32 v59, s5, 5
+; GLOBALNESS0-NEXT:    v_cmp_ne_u32_e64 s[4:5], 1, v2
+; GLOBALNESS0-NEXT:    v_writelane_b32 v59, s4, 6
+; GLOBALNESS0-NEXT:    v_writelane_b32 v59, s5, 7
+; GLOBALNESS0-NEXT:    v_cmp_ne_u32_e64 s[80:81], 1, v1
+; GLOBALNESS0-NEXT:    v_writelane_b32 v59, s84, 8
+; GLOBALNESS0-NEXT:    v_writelane_b32 v59, s85, 9
 ; GLOBALNESS0-NEXT:    s_branch .LBB1_4
 ; GLOBALNESS0-NEXT:  .LBB1_1: ; %bb70.i
 ; GLOBALNESS0-NEXT:    ; in Loop: Header=BB1_4 Depth=1
-; GLOBALNESS0-NEXT:    s_and_b64 vcc, exec, s[60:61]
+; GLOBALNESS0-NEXT:    v_readlane_b32 s6, v59, 6
+; GLOBALNESS0-NEXT:    v_readlane_b32 s7, v59, 7
+; GLOBALNESS0-NEXT:    s_and_b64 vcc, exec, s[6:7]
 ; GLOBALNESS0-NEXT:    s_cbranch_vccz .LBB1_28
 ; GLOBALNESS0-NEXT:  .LBB1_2: ; %Flow15
 ; GLOBALNESS0-NEXT:    ; in Loop: Header=BB1_4 Depth=1
@@ -407,26 +444,26 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
 ; GLOBALNESS0-NEXT:    s_add_u32 s4, s4, wobble at gotpcrel32@lo+4
 ; GLOBALNESS0-NEXT:    s_addc_u32 s5, s5, wobble at gotpcrel32@hi+12
 ; GLOBALNESS0-NEXT:    s_load_dwordx2 s[16:17], s[4:5], 0x0
-; GLOBALNESS0-NEXT:    s_mov_b64 s[4:5], s[40:41]
+; GLOBALNESS0-NEXT:    s_mov_b64 s[4:5], s[48:49]
 ; GLOBALNESS0-NEXT:    s_mov_b64 s[6:7], s[36:37]
 ; GLOBALNESS0-NEXT:    s_mov_b64 s[10:11], s[34:35]
-; GLOBALNESS0-NEXT:    s_mov_b32 s12, s70
-; GLOBALNESS0-NEXT:    s_mov_b32 s13, s69
-; GLOBALNESS0-NEXT:    s_mov_b32 s14, s68
+; GLOBALNESS0-NEXT:    s_mov_b32 s12, s82
+; GLOBALNESS0-NEXT:    s_mov_b32 s13, s71
+; GLOBALNESS0-NEXT:    s_mov_b32 s14, s70
 ; GLOBALNESS0-NEXT:    v_mov_b32_e32 v31, v41
 ; GLOBALNESS0-NEXT:    s_waitcnt lgkmcnt(0)
 ; GLOBALNESS0-NEXT:    s_swappc_b64 s[30:31], s[16:17]
-; GLOBALNESS0-NEXT:    s_and_b64 vcc, exec, s[46:47]
+; GLOBALNESS0-NEXT:    s_and_b64 vcc, exec, s[84:85]
 ; GLOBALNESS0-NEXT:    s_mov_b64 s[6:7], -1
 ; GLOBALNESS0-NEXT:    ; implicit-def: $sgpr4_sgpr5
 ; GLOBALNESS0-NEXT:    s_cbranch_vccnz .LBB1_9
 ; GLOBALNESS0-NEXT:  ; %bb.5: ; %NodeBlock
 ; GLOBALNESS0-NEXT:    ; in Loop: Header=BB1_4 Depth=1
-; GLOBALNESS0-NEXT:    s_cmp_lt_i32 s75, 1
+; GLOBALNESS0-NEXT:    s_cmp_lt_i32 s55, 1
 ; GLOBALNESS0-NEXT:    s_cbranch_scc1 .LBB1_7
 ; GLOBALNESS0-NEXT:  ; %bb.6: ; %LeafBlock12
 ; GLOBALNESS0-NEXT:    ; in Loop: Header=BB1_4 Depth=1
-; GLOBALNESS0-NEXT:    s_cmp_lg_u32 s75, 1
+; GLOBALNESS0-NEXT:    s_cmp_lg_u32 s55, 1
 ; GLOBALNESS0-NEXT:    s_mov_b64 s[4:5], -1
 ; GLOBALNESS0-NEXT:    s_cselect_b64 s[6:7], -1, 0
 ; GLOBALNESS0-NEXT:    s_cbranch_execz .LBB1_8
@@ -436,7 +473,7 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
 ; GLOBALNESS0-NEXT:    ; implicit-def: $sgpr4_sgpr5
 ; GLOBALNESS0-NEXT:  .LBB1_8: ; %LeafBlock
 ; GLOBALNESS0-NEXT:    ; in Loop: Header=BB1_4 Depth=1
-; GLOBALNESS0-NEXT:    s_cmp_lg_u32 s75, 0
+; GLOBALNESS0-NEXT:    s_cmp_lg_u32 s55, 0
 ; GLOBALNESS0-NEXT:    s_mov_b64 s[4:5], 0
 ; GLOBALNESS0-NEXT:    s_cselect_b64 s[6:7], -1, 0
 ; GLOBALNESS0-NEXT:  .LBB1_9: ; %Flow25
@@ -449,15 +486,18 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
 ; GLOBALNESS0-NEXT:    ; in Loop: Header=BB1_4 Depth=1
 ; GLOBALNESS0-NEXT:    flat_load_dword v0, v[44:45]
 ; GLOBALNESS0-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GLOBALNESS0-NEXT:    v_cmp_gt_i32_e64 s[62:63], 0, v0
+; GLOBALNESS0-NEXT:    v_cmp_gt_i32_e64 s[86:87], 0, v0
 ; GLOBALNESS0-NEXT:    v_mov_b32_e32 v0, 0
 ; GLOBALNESS0-NEXT:    v_mov_b32_e32 v1, 0x3ff00000
-; GLOBALNESS0-NEXT:    s_and_saveexec_b64 s[76:77], s[62:63]
+; GLOBALNESS0-NEXT:    s_and_saveexec_b64 s[52:53], s[86:87]
 ; GLOBALNESS0-NEXT:    s_cbranch_execz .LBB1_25
 ; GLOBALNESS0-NEXT:  ; %bb.11: ; %bb33.i
 ; GLOBALNESS0-NEXT:    ; in Loop: Header=BB1_4 Depth=1
 ; GLOBALNESS0-NEXT:    global_load_dwordx2 v[0:1], v[44:45], off
-; GLOBALNESS0-NEXT:    s_and_b64 vcc, exec, s[54:55]
+; GLOBALNESS0-NEXT:    v_readlane_b32 s4, v59, 2
+; GLOBALNESS0-NEXT:    v_readlane_b32 s5, v59, 3
+; GLOBALNESS0-NEXT:    s_mov_b32 s83, s55
+; GLOBALNESS0-NEXT:    s_and_b64 vcc, exec, s[4:5]
 ; GLOBALNESS0-NEXT:    s_cbranch_vccnz .LBB1_13
 ; GLOBALNESS0-NEXT:  ; %bb.12: ; %bb39.i
 ; GLOBALNESS0-NEXT:    ; in Loop: Header=BB1_4 Depth=1
@@ -470,69 +510,71 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
 ; GLOBALNESS0-NEXT:    s_waitcnt vmcnt(0)
 ; GLOBALNESS0-NEXT:    v_cmp_nlt_f64_e32 vcc, 0, v[0:1]
 ; GLOBALNESS0-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
-; GLOBALNESS0-NEXT:    v_cmp_eq_u32_e64 s[64:65], 0, v2
-; GLOBALNESS0-NEXT:    v_cmp_ne_u32_e64 s[66:67], 1, v0
+; GLOBALNESS0-NEXT:    v_cmp_eq_u32_e64 s[96:97], 0, v2
+; GLOBALNESS0-NEXT:    v_cmp_ne_u32_e64 s[98:99], 1, v0
 ; GLOBALNESS0-NEXT:    s_branch .LBB1_16
 ; GLOBALNESS0-NEXT:  .LBB1_14: ; %Flow16
 ; GLOBALNESS0-NEXT:    ; in Loop: Header=BB1_16 Depth=2
 ; GLOBALNESS0-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GLOBALNESS0-NEXT:  .LBB1_15: ; %bb63.i
 ; GLOBALNESS0-NEXT:    ; in Loop: Header=BB1_16 Depth=2
-; GLOBALNESS0-NEXT:    s_and_b64 vcc, exec, s[52:53]
+; GLOBALNESS0-NEXT:    s_and_b64 vcc, exec, s[68:69]
 ; GLOBALNESS0-NEXT:    s_cbranch_vccz .LBB1_24
 ; GLOBALNESS0-NEXT:  .LBB1_16: ; %bb44.i
 ; GLOBALNESS0-NEXT:    ; Parent Loop BB1_4 Depth=1
 ; GLOBALNESS0-NEXT:    ; => This Inner Loop Header: Depth=2
-; GLOBALNESS0-NEXT:    s_and_b64 vcc, exec, s[48:49]
+; GLOBALNESS0-NEXT:    s_and_b64 vcc, exec, s[64:65]
 ; GLOBALNESS0-NEXT:    s_cbranch_vccnz .LBB1_15
 ; GLOBALNESS0-NEXT:  ; %bb.17: ; %bb46.i
 ; GLOBALNESS0-NEXT:    ; in Loop: Header=BB1_16 Depth=2
-; GLOBALNESS0-NEXT:    s_and_b64 vcc, exec, s[50:51]
+; GLOBALNESS0-NEXT:    s_and_b64 vcc, exec, s[66:67]
 ; GLOBALNESS0-NEXT:    s_cbranch_vccnz .LBB1_15
 ; GLOBALNESS0-NEXT:  ; %bb.18: ; %bb50.i
 ; GLOBALNESS0-NEXT:    ; in Loop: Header=BB1_16 Depth=2
-; GLOBALNESS0-NEXT:    s_and_b64 vcc, exec, s[42:43]
+; GLOBALNESS0-NEXT:    s_and_b64 vcc, exec, s[50:51]
 ; GLOBALNESS0-NEXT:    s_cbranch_vccnz .LBB1_21
 ; GLOBALNESS0-NEXT:  ; %bb.19: ; %bb3.i.i
 ; GLOBALNESS0-NEXT:    ; in Loop: Header=BB1_16 Depth=2
-; GLOBALNESS0-NEXT:    s_and_b64 vcc, exec, s[44:45]
+; GLOBALNESS0-NEXT:    v_readlane_b32 s4, v59, 0
+; GLOBALNESS0-NEXT:    v_readlane_b32 s5, v59, 1
+; GLOBALNESS0-NEXT:    s_and_b64 vcc, exec, s[4:5]
 ; GLOBALNESS0-NEXT:    s_cbranch_vccnz .LBB1_21
 ; GLOBALNESS0-NEXT:  ; %bb.20: ; %bb6.i.i
 ; GLOBALNESS0-NEXT:    ; in Loop: Header=BB1_16 Depth=2
-; GLOBALNESS0-NEXT:    s_and_b64 vcc, exec, s[66:67]
+; GLOBALNESS0-NEXT:    s_and_b64 vcc, exec, s[98:99]
 ; GLOBALNESS0-NEXT:  .LBB1_21: ; %spam.exit.i
 ; GLOBALNESS0-NEXT:    ; in Loop: Header=BB1_16 Depth=2
-; GLOBALNESS0-NEXT:    s_and_b64 vcc, exec, s[56:57]
+; GLOBALNESS0-NEXT:    s_and_b64 vcc, exec, s[80:81]
 ; GLOBALNESS0-NEXT:    s_cbranch_vccnz .LBB1_15
 ; GLOBALNESS0-NEXT:  ; %bb.22: ; %bb55.i
 ; GLOBALNESS0-NEXT:    ; in Loop: Header=BB1_16 Depth=2
-; GLOBALNESS0-NEXT:    s_add_u32 s72, s38, 40
-; GLOBALNESS0-NEXT:    s_addc_u32 s73, s39, 0
+; GLOBALNESS0-NEXT:    s_add_u32 s84, s38, 40
+; GLOBALNESS0-NEXT:    s_addc_u32 s85, s39, 0
 ; GLOBALNESS0-NEXT:    s_getpc_b64 s[4:5]
 ; GLOBALNESS0-NEXT:    s_add_u32 s4, s4, wobble at gotpcrel32@lo+4
 ; GLOBALNESS0-NEXT:    s_addc_u32 s5, s5, wobble at gotpcrel32@hi+12
-; GLOBALNESS0-NEXT:    s_load_dwordx2 s[78:79], s[4:5], 0x0
-; GLOBALNESS0-NEXT:    s_mov_b64 s[4:5], s[40:41]
+; GLOBALNESS0-NEXT:    s_load_dwordx2 s[54:55], s[4:5], 0x0
+; GLOBALNESS0-NEXT:    s_mov_b64 s[4:5], s[48:49]
 ; GLOBALNESS0-NEXT:    s_mov_b64 s[6:7], s[36:37]
-; GLOBALNESS0-NEXT:    s_mov_b64 s[8:9], s[72:73]
+; GLOBALNESS0-NEXT:    s_mov_b64 s[8:9], s[84:85]
 ; GLOBALNESS0-NEXT:    s_mov_b64 s[10:11], s[34:35]
-; GLOBALNESS0-NEXT:    s_mov_b32 s12, s70
-; GLOBALNESS0-NEXT:    s_mov_b32 s13, s69
-; GLOBALNESS0-NEXT:    s_mov_b32 s14, s68
+; GLOBALNESS0-NEXT:    s_mov_b32 s12, s82
+; GLOBALNESS0-NEXT:    s_mov_b32 s13, s71
+; GLOBALNESS0-NEXT:    s_mov_b32 s14, s70
 ; GLOBALNESS0-NEXT:    v_mov_b32_e32 v31, v41
 ; GLOBALNESS0-NEXT:    s_waitcnt lgkmcnt(0)
-; GLOBALNESS0-NEXT:    s_swappc_b64 s[30:31], s[78:79]
-; GLOBALNESS0-NEXT:    s_mov_b64 s[4:5], s[40:41]
+; GLOBALNESS0-NEXT:    s_swappc_b64 s[30:31], s[54:55]
+; GLOBALNESS0-NEXT:    s_mov_b64 s[4:5], s[48:49]
 ; GLOBALNESS0-NEXT:    s_mov_b64 s[6:7], s[36:37]
-; GLOBALNESS0-NEXT:    s_mov_b64 s[8:9], s[72:73]
+; GLOBALNESS0-NEXT:    s_mov_b64 s[8:9], s[84:85]
 ; GLOBALNESS0-NEXT:    s_mov_b64 s[10:11], s[34:35]
-; GLOBALNESS0-NEXT:    s_mov_b32 s12, s70
-; GLOBALNESS0-NEXT:    s_mov_b32 s13, s69
-; GLOBALNESS0-NEXT:    s_mov_b32 s14, s68
+; GLOBALNESS0-NEXT:    s_mov_b32 s12, s82
+; GLOBALNESS0-NEXT:    s_mov_b32 s13, s71
+; GLOBALNESS0-NEXT:    s_mov_b32 s14, s70
 ; GLOBALNESS0-NEXT:    v_mov_b32_e32 v31, v41
 ; GLOBALNESS0-NEXT:    global_store_dwordx2 v[44:45], v[56:57], off
-; GLOBALNESS0-NEXT:    s_swappc_b64 s[30:31], s[78:79]
-; GLOBALNESS0-NEXT:    s_and_saveexec_b64 s[4:5], s[64:65]
+; GLOBALNESS0-NEXT:    s_swappc_b64 s[30:31], s[54:55]
+; GLOBALNESS0-NEXT:    s_and_saveexec_b64 s[4:5], s[96:97]
 ; GLOBALNESS0-NEXT:    s_cbranch_execz .LBB1_14
 ; GLOBALNESS0-NEXT:  ; %bb.23: ; %bb62.i
 ; GLOBALNESS0-NEXT:    ; in Loop: Header=BB1_16 Depth=2
@@ -541,15 +583,20 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
 ; GLOBALNESS0-NEXT:    s_branch .LBB1_14
 ; GLOBALNESS0-NEXT:  .LBB1_24: ; %Flow23
 ; GLOBALNESS0-NEXT:    ; in Loop: Header=BB1_4 Depth=1
+; GLOBALNESS0-NEXT:    v_readlane_b32 s84, v59, 8
 ; GLOBALNESS0-NEXT:    v_pk_mov_b32 v[0:1], 0, 0
+; GLOBALNESS0-NEXT:    s_mov_b32 s55, s83
+; GLOBALNESS0-NEXT:    v_readlane_b32 s85, v59, 9
 ; GLOBALNESS0-NEXT:  .LBB1_25: ; %Flow24
 ; GLOBALNESS0-NEXT:    ; in Loop: Header=BB1_4 Depth=1
-; GLOBALNESS0-NEXT:    s_or_b64 exec, exec, s[76:77]
-; GLOBALNESS0-NEXT:    s_and_saveexec_b64 s[4:5], s[62:63]
+; GLOBALNESS0-NEXT:    s_or_b64 exec, exec, s[52:53]
+; GLOBALNESS0-NEXT:    s_and_saveexec_b64 s[4:5], s[86:87]
 ; GLOBALNESS0-NEXT:    s_cbranch_execz .LBB1_2
 ; GLOBALNESS0-NEXT:  ; %bb.26: ; %bb67.i
 ; GLOBALNESS0-NEXT:    ; in Loop: Header=BB1_4 Depth=1
-; GLOBALNESS0-NEXT:    s_and_b64 vcc, exec, s[58:59]
+; GLOBALNESS0-NEXT:    v_readlane_b32 s6, v59, 4
+; GLOBALNESS0-NEXT:    v_readlane_b32 s7, v59, 5
+; GLOBALNESS0-NEXT:    s_and_b64 vcc, exec, s[6:7]
 ; GLOBALNESS0-NEXT:    s_cbranch_vccnz .LBB1_1
 ; GLOBALNESS0-NEXT:  ; %bb.27: ; %bb69.i
 ; GLOBALNESS0-NEXT:    ; in Loop: Header=BB1_4 Depth=1
@@ -571,12 +618,12 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
 ; GLOBALNESS0-NEXT:    s_getpc_b64 s[16:17]
 ; GLOBALNESS0-NEXT:    s_add_u32 s16, s16, widget at rel32@lo+4
 ; GLOBALNESS0-NEXT:    s_addc_u32 s17, s17, widget at rel32@hi+12
-; GLOBALNESS0-NEXT:    s_mov_b64 s[4:5], s[40:41]
+; GLOBALNESS0-NEXT:    s_mov_b64 s[4:5], s[48:49]
 ; GLOBALNESS0-NEXT:    s_mov_b64 s[6:7], s[36:37]
 ; GLOBALNESS0-NEXT:    s_mov_b64 s[10:11], s[34:35]
-; GLOBALNESS0-NEXT:    s_mov_b32 s12, s70
-; GLOBALNESS0-NEXT:    s_mov_b32 s13, s69
-; GLOBALNESS0-NEXT:    s_mov_b32 s14, s68
+; GLOBALNESS0-NEXT:    s_mov_b32 s12, s82
+; GLOBALNESS0-NEXT:    s_mov_b32 s13, s71
+; GLOBALNESS0-NEXT:    s_mov_b32 s14, s70
 ; GLOBALNESS0-NEXT:    v_mov_b32_e32 v31, v41
 ; GLOBALNESS0-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GLOBALNESS0-NEXT:    s_mov_b64 s[4:5], 0
@@ -589,12 +636,12 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
 ; GLOBALNESS0-NEXT:    s_getpc_b64 s[16:17]
 ; GLOBALNESS0-NEXT:    s_add_u32 s16, s16, widget at rel32@lo+4
 ; GLOBALNESS0-NEXT:    s_addc_u32 s17, s17, widget at rel32@hi+12
-; GLOBALNESS0-NEXT:    s_mov_b64 s[4:5], s[40:41]
+; GLOBALNESS0-NEXT:    s_mov_b64 s[4:5], s[48:49]
 ; GLOBALNESS0-NEXT:    s_mov_b64 s[6:7], s[36:37]
 ; GLOBALNESS0-NEXT:    s_mov_b64 s[10:11], s[34:35]
-; GLOBALNESS0-NEXT:    s_mov_b32 s12, s70
-; GLOBALNESS0-NEXT:    s_mov_b32 s13, s69
-; GLOBALNESS0-NEXT:    s_mov_b32 s14, s68
+; GLOBALNESS0-NEXT:    s_mov_b32 s12, s82
+; GLOBALNESS0-NEXT:    s_mov_b32 s13, s71
+; GLOBALNESS0-NEXT:    s_mov_b32 s14, s70
 ; GLOBALNESS0-NEXT:    v_mov_b32_e32 v31, v41
 ; GLOBALNESS0-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GLOBALNESS0-NEXT:  .LBB1_33: ; %UnifiedUnreachableBlock
@@ -604,7 +651,7 @@ bb:
   br label %bb5
 
 bb5:                                              ; preds = %bb5.backedge, %bb
-  %tmp4.i.sroa.0.0 = phi <9 x double> [ poison, %bb ], [ %tmp4.i.sroa.0.1, %bb5.backedge ]
+  %tmp4.i.sroa.0.0 = phi <9 x double> [ undef, %bb ], [ %tmp4.i.sroa.0.1, %bb5.backedge ]
   %tmp14.1.i = load i32, ptr inttoptr (i64 128 to ptr), align 128
   store i32 0, ptr addrspace(5) null, align 4
   %tmp14.2.i = load i32, ptr inttoptr (i64 128 to ptr), align 128

diff  --git a/llvm/test/CodeGen/AMDGPU/unallocatable-bundle-regression.mir b/llvm/test/CodeGen/AMDGPU/unallocatable-bundle-regression.mir
index b80c478c3761f..0df2e651a15e1 100644
--- a/llvm/test/CodeGen/AMDGPU/unallocatable-bundle-regression.mir
+++ b/llvm/test/CodeGen/AMDGPU/unallocatable-bundle-regression.mir
@@ -27,11 +27,11 @@ body:             |
     ; CHECK-NEXT: renamable $sgpr4 = COPY $sgpr0
     ; CHECK-NEXT: SI_SPILL_S128_SAVE $sgpr0_sgpr1_sgpr2_sgpr3, %stack.0, implicit $exec, implicit $sgpr32 :: (store (s128) into %stack.0, align 4, addrspace 5)
     ; CHECK-NEXT: renamable $sgpr5 = S_MOV_B32 0
-    ; CHECK-NEXT: renamable $sgpr76 = COPY renamable $sgpr5
-    ; CHECK-NEXT: renamable $sgpr77 = COPY renamable $sgpr5
-    ; CHECK-NEXT: renamable $sgpr78 = COPY renamable $sgpr5
+    ; CHECK-NEXT: renamable $sgpr36 = COPY renamable $sgpr5
+    ; CHECK-NEXT: renamable $sgpr37 = COPY renamable $sgpr5
+    ; CHECK-NEXT: renamable $sgpr38 = COPY renamable $sgpr5
     ; CHECK-NEXT: renamable $sgpr0 = S_MOV_B32 1056964608
-    ; CHECK-NEXT: renamable $sgpr79 = COPY renamable $sgpr5
+    ; CHECK-NEXT: renamable $sgpr39 = COPY renamable $sgpr5
     ; CHECK-NEXT: renamable $sgpr1 = COPY renamable $sgpr0
     ; CHECK-NEXT: renamable $sgpr8 = COPY renamable $sgpr5
     ; CHECK-NEXT: renamable $sgpr9 = COPY renamable $sgpr5
@@ -43,46 +43,46 @@ body:             |
     ; CHECK-NEXT: renamable $sgpr15 = COPY renamable $sgpr5
     ; CHECK-NEXT: renamable $vgpr5_vgpr6 = COPY killed renamable $sgpr0_sgpr1
     ; CHECK-NEXT: renamable $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23 = S_LOAD_DWORDX8_IMM renamable $sgpr4_sgpr5, 1088, 0 :: (dereferenceable load (s256), addrspace 6)
-    ; CHECK-NEXT: renamable $sgpr80_sgpr81_sgpr82_sgpr83 = S_LOAD_DWORDX4_IMM renamable $sgpr4_sgpr5, 0, 0 :: (load (s128), addrspace 6)
+    ; CHECK-NEXT: renamable $sgpr64_sgpr65_sgpr66_sgpr67 = S_LOAD_DWORDX4_IMM renamable $sgpr4_sgpr5, 0, 0 :: (load (s128), addrspace 6)
     ; CHECK-NEXT: renamable $sgpr0 = S_MOV_B32 1200
     ; CHECK-NEXT: renamable $sgpr1 = COPY renamable $sgpr5
-    ; CHECK-NEXT: renamable $sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 = S_LOAD_DWORDX8_IMM renamable $sgpr4_sgpr5, 1152, 0 :: (dereferenceable load (s256), addrspace 6)
-    ; CHECK-NEXT: renamable $sgpr84_sgpr85_sgpr86_sgpr87 = S_LOAD_DWORDX4_IMM renamable $sgpr0_sgpr1, 0, 0 :: (load (s128), addrspace 6)
+    ; CHECK-NEXT: renamable $sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47 = S_LOAD_DWORDX8_IMM renamable $sgpr4_sgpr5, 1152, 0 :: (dereferenceable load (s256), addrspace 6)
+    ; CHECK-NEXT: renamable $sgpr68_sgpr69_sgpr70_sgpr71 = S_LOAD_DWORDX4_IMM renamable $sgpr0_sgpr1, 0, 0 :: (load (s128), addrspace 6)
     ; CHECK-NEXT: KILL killed renamable $sgpr0, renamable $sgpr1
     ; CHECK-NEXT: renamable $sgpr0 = S_MOV_B32 1264
     ; CHECK-NEXT: renamable $sgpr1 = COPY renamable $sgpr5
-    ; CHECK-NEXT: renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43 = S_LOAD_DWORDX8_IMM renamable $sgpr4_sgpr5, 1216, 0 :: (dereferenceable load (s256), addrspace 6)
-    ; CHECK-NEXT: renamable $sgpr88_sgpr89_sgpr90_sgpr91 = S_LOAD_DWORDX4_IMM killed renamable $sgpr0_sgpr1, 0, 0 :: (load (s128), addrspace 6)
+    ; CHECK-NEXT: renamable $sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63 = S_LOAD_DWORDX8_IMM renamable $sgpr4_sgpr5, 1216, 0 :: (dereferenceable load (s256), addrspace 6)
+    ; CHECK-NEXT: renamable $sgpr80_sgpr81_sgpr82_sgpr83 = S_LOAD_DWORDX4_IMM killed renamable $sgpr0_sgpr1, 0, 0 :: (load (s128), addrspace 6)
     ; CHECK-NEXT: renamable $sgpr0 = S_MOV_B32 1328
     ; CHECK-NEXT: renamable $sgpr1 = COPY renamable $sgpr5
-    ; CHECK-NEXT: renamable $sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51 = S_LOAD_DWORDX8_IMM renamable $sgpr4_sgpr5, 1280, 0 :: (dereferenceable load (s256), addrspace 6)
-    ; CHECK-NEXT: renamable $sgpr92_sgpr93_sgpr94_sgpr95 = S_LOAD_DWORDX4_IMM killed renamable $sgpr0_sgpr1, 0, 0 :: (load (s128), addrspace 6)
-    ; CHECK-NEXT: renamable $sgpr52_sgpr53_sgpr54_sgpr55_sgpr56_sgpr57_sgpr58_sgpr59 = S_LOAD_DWORDX8_IMM renamable $sgpr4_sgpr5, 1344, 0 :: (dereferenceable load (s256), addrspace 6)
+    ; CHECK-NEXT: renamable $sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79 = S_LOAD_DWORDX8_IMM renamable $sgpr4_sgpr5, 1280, 0 :: (dereferenceable load (s256), addrspace 6)
+    ; CHECK-NEXT: renamable $sgpr84_sgpr85_sgpr86_sgpr87 = S_LOAD_DWORDX4_IMM killed renamable $sgpr0_sgpr1, 0, 0 :: (load (s128), addrspace 6)
+    ; CHECK-NEXT: renamable $sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95 = S_LOAD_DWORDX8_IMM renamable $sgpr4_sgpr5, 1344, 0 :: (dereferenceable load (s256), addrspace 6)
     ; CHECK-NEXT: renamable $sgpr0 = S_MOV_B32 1392
     ; CHECK-NEXT: renamable $sgpr1 = COPY renamable $sgpr5
-    ; CHECK-NEXT: renamable $sgpr60_sgpr61_sgpr62_sgpr63_sgpr64_sgpr65_sgpr66_sgpr67 = S_LOAD_DWORDX8_IMM renamable $sgpr4_sgpr5, 0, 0 :: (load (s256), addrspace 6)
+    ; CHECK-NEXT: renamable $sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 = S_LOAD_DWORDX8_IMM renamable $sgpr4_sgpr5, 0, 0 :: (load (s256), addrspace 6)
     ; CHECK-NEXT: renamable $sgpr2 = S_MOV_B32 1456
     ; CHECK-NEXT: renamable $sgpr3 = COPY renamable $sgpr5
-    ; CHECK-NEXT: renamable $sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75 = S_LOAD_DWORDX8_IMM renamable $sgpr4_sgpr5, 1472, 0 :: (dereferenceable load (s256), addrspace 6)
+    ; CHECK-NEXT: renamable $sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55 = S_LOAD_DWORDX8_IMM renamable $sgpr4_sgpr5, 1472, 0 :: (dereferenceable load (s256), addrspace 6)
     ; CHECK-NEXT: renamable $sgpr4 = S_MOV_B32 1520
     ; CHECK-NEXT: renamable $sgpr96_sgpr97_sgpr98_sgpr99 = S_LOAD_DWORDX4_IMM killed renamable $sgpr2_sgpr3, 0, 0 :: (load (s128), addrspace 6)
     ; CHECK-NEXT: renamable $sgpr4_sgpr5_sgpr6_sgpr7 = S_LOAD_DWORDX4_IMM killed renamable $sgpr4_sgpr5, 0, 0 :: (load (s128), addrspace 6)
     ; CHECK-NEXT: renamable $sgpr0_sgpr1_sgpr2_sgpr3 = S_LOAD_DWORDX4_IMM killed renamable $sgpr0_sgpr1, 0, 0 :: (load (s128), addrspace 6)
-    ; CHECK-NEXT: renamable $vgpr7 = IMAGE_SAMPLE_LZ_V1_V2_gfx10 renamable $vgpr5_vgpr6, killed renamable $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15, killed renamable $sgpr76_sgpr77_sgpr78_sgpr79, 1, 1, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s128))
-    ; CHECK-NEXT: renamable $vgpr8 = IMAGE_SAMPLE_LZ_V1_V2_gfx10 renamable $vgpr5_vgpr6, killed renamable $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23, killed renamable $sgpr80_sgpr81_sgpr82_sgpr83, 1, 1, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s128))
-    ; CHECK-NEXT: renamable $vgpr9 = IMAGE_SAMPLE_LZ_V1_V2_gfx10 renamable $vgpr5_vgpr6, killed renamable $sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31, killed renamable $sgpr84_sgpr85_sgpr86_sgpr87, 1, 1, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s128))
-    ; CHECK-NEXT: renamable $vgpr10 = IMAGE_SAMPLE_LZ_V1_V2_gfx10 renamable $vgpr5_vgpr6, killed renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43, killed renamable $sgpr88_sgpr89_sgpr90_sgpr91, 1, 1, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s128))
-    ; CHECK-NEXT: renamable $vgpr11 = IMAGE_SAMPLE_LZ_V1_V2_gfx10 renamable $vgpr5_vgpr6, killed renamable $sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51, killed renamable $sgpr92_sgpr93_sgpr94_sgpr95, 1, 1, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s128))
-    ; CHECK-NEXT: renamable $vgpr12 = IMAGE_SAMPLE_LZ_V1_V2_gfx10 renamable $vgpr5_vgpr6, killed renamable $sgpr60_sgpr61_sgpr62_sgpr63_sgpr64_sgpr65_sgpr66_sgpr67, killed renamable $sgpr96_sgpr97_sgpr98_sgpr99, 1, 1, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s128))
-    ; CHECK-NEXT: renamable $vgpr13 = IMAGE_SAMPLE_LZ_V1_V2_gfx10 renamable $vgpr5_vgpr6, renamable $sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75, renamable $sgpr4_sgpr5_sgpr6_sgpr7, 1, 1, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s128))
-    ; CHECK-NEXT: renamable $vgpr14 = IMAGE_SAMPLE_LZ_V1_V2_gfx10 renamable $vgpr5_vgpr6, renamable $sgpr52_sgpr53_sgpr54_sgpr55_sgpr56_sgpr57_sgpr58_sgpr59, renamable $sgpr0_sgpr1_sgpr2_sgpr3, 1, 1, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s128))
+    ; CHECK-NEXT: renamable $vgpr7 = IMAGE_SAMPLE_LZ_V1_V2_gfx10 renamable $vgpr5_vgpr6, killed renamable $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15, killed renamable $sgpr36_sgpr37_sgpr38_sgpr39, 1, 1, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s128))
+    ; CHECK-NEXT: renamable $vgpr8 = IMAGE_SAMPLE_LZ_V1_V2_gfx10 renamable $vgpr5_vgpr6, killed renamable $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23, killed renamable $sgpr64_sgpr65_sgpr66_sgpr67, 1, 1, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s128))
+    ; CHECK-NEXT: renamable $vgpr9 = IMAGE_SAMPLE_LZ_V1_V2_gfx10 renamable $vgpr5_vgpr6, killed renamable $sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47, killed renamable $sgpr68_sgpr69_sgpr70_sgpr71, 1, 1, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s128))
+    ; CHECK-NEXT: renamable $vgpr10 = IMAGE_SAMPLE_LZ_V1_V2_gfx10 renamable $vgpr5_vgpr6, killed renamable $sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63, killed renamable $sgpr80_sgpr81_sgpr82_sgpr83, 1, 1, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s128))
+    ; CHECK-NEXT: renamable $vgpr11 = IMAGE_SAMPLE_LZ_V1_V2_gfx10 renamable $vgpr5_vgpr6, killed renamable $sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79, killed renamable $sgpr84_sgpr85_sgpr86_sgpr87, 1, 1, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s128))
+    ; CHECK-NEXT: renamable $vgpr12 = IMAGE_SAMPLE_LZ_V1_V2_gfx10 renamable $vgpr5_vgpr6, killed renamable $sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31, killed renamable $sgpr96_sgpr97_sgpr98_sgpr99, 1, 1, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s128))
+    ; CHECK-NEXT: renamable $vgpr13 = IMAGE_SAMPLE_LZ_V1_V2_gfx10 renamable $vgpr5_vgpr6, renamable $sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55, renamable $sgpr4_sgpr5_sgpr6_sgpr7, 1, 1, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s128))
+    ; CHECK-NEXT: renamable $vgpr14 = IMAGE_SAMPLE_LZ_V1_V2_gfx10 renamable $vgpr5_vgpr6, renamable $sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95, renamable $sgpr0_sgpr1_sgpr2_sgpr3, 1, 1, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s128))
     ; CHECK-NEXT: renamable $sgpr8_sgpr9_sgpr10_sgpr11 = SI_SPILL_S128_RESTORE %stack.0, implicit $exec, implicit $sgpr32 :: (load (s128) from %stack.0, align 4, addrspace 5)
     ; CHECK-NEXT: renamable $vgpr1_vgpr2_vgpr3_vgpr4 = BUFFER_LOAD_FORMAT_XYZW_IDXEN renamable $vgpr0, renamable $sgpr8_sgpr9_sgpr10_sgpr11, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 7)
     ; CHECK-NEXT: KILL killed renamable $sgpr4_sgpr5_sgpr6_sgpr7
-    ; CHECK-NEXT: KILL killed renamable $sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75
+    ; CHECK-NEXT: KILL killed renamable $sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55
     ; CHECK-NEXT: KILL killed renamable $vgpr5_vgpr6
     ; CHECK-NEXT: KILL killed renamable $sgpr0_sgpr1_sgpr2_sgpr3
-    ; CHECK-NEXT: KILL killed renamable $sgpr52_sgpr53_sgpr54_sgpr55_sgpr56_sgpr57_sgpr58_sgpr59
+    ; CHECK-NEXT: KILL killed renamable $sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95
     ; CHECK-NEXT: KILL killed renamable $sgpr8_sgpr9_sgpr10_sgpr11
     ; CHECK-NEXT: KILL killed renamable $vgpr0
     ; CHECK-NEXT: renamable $vgpr0 = nofpexcept V_MAX_F32_e32 killed $vgpr7, killed $vgpr8, implicit $mode, implicit $exec

diff  --git a/llvm/test/CodeGen/AMDGPU/unstructured-cfg-def-use-issue.ll b/llvm/test/CodeGen/AMDGPU/unstructured-cfg-def-use-issue.ll
index d9df80ce6c1c0..f08e5be0fd742 100644
--- a/llvm/test/CodeGen/AMDGPU/unstructured-cfg-def-use-issue.ll
+++ b/llvm/test/CodeGen/AMDGPU/unstructured-cfg-def-use-issue.ll
@@ -22,14 +22,14 @@ define hidden void @widget() {
 ; GCN-NEXT:    v_writelane_b32 v41, s37, 5
 ; GCN-NEXT:    v_writelane_b32 v41, s38, 6
 ; GCN-NEXT:    v_writelane_b32 v41, s39, 7
-; GCN-NEXT:    v_writelane_b32 v41, s40, 8
-; GCN-NEXT:    v_writelane_b32 v41, s41, 9
-; GCN-NEXT:    v_writelane_b32 v41, s42, 10
-; GCN-NEXT:    v_writelane_b32 v41, s43, 11
-; GCN-NEXT:    v_writelane_b32 v41, s44, 12
-; GCN-NEXT:    v_writelane_b32 v41, s45, 13
-; GCN-NEXT:    v_writelane_b32 v41, s46, 14
-; GCN-NEXT:    v_writelane_b32 v41, s47, 15
+; GCN-NEXT:    v_writelane_b32 v41, s48, 8
+; GCN-NEXT:    v_writelane_b32 v41, s49, 9
+; GCN-NEXT:    v_writelane_b32 v41, s50, 10
+; GCN-NEXT:    v_writelane_b32 v41, s51, 11
+; GCN-NEXT:    v_writelane_b32 v41, s52, 12
+; GCN-NEXT:    v_writelane_b32 v41, s53, 13
+; GCN-NEXT:    v_writelane_b32 v41, s54, 14
+; GCN-NEXT:    v_writelane_b32 v41, s55, 15
 ; GCN-NEXT:    v_mov_b32_e32 v0, 0
 ; GCN-NEXT:    v_mov_b32_e32 v1, 0
 ; GCN-NEXT:    flat_load_dword v0, v[0:1]
@@ -37,7 +37,7 @@ define hidden void @widget() {
 ; GCN-NEXT:    s_mov_b64 s[16:17], 0
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    v_cmp_gt_i32_e32 vcc, 21, v0
-; GCN-NEXT:    s_mov_b64 s[46:47], 0
+; GCN-NEXT:    s_mov_b64 s[54:55], 0
 ; GCN-NEXT:    s_mov_b64 s[18:19], 0
 ; GCN-NEXT:    s_cbranch_vccz .LBB0_9
 ; GCN-NEXT:  ; %bb.1: ; %Flow
@@ -53,29 +53,29 @@ define hidden void @widget() {
 ; GCN-NEXT:    s_mov_b64 s[34:35], s[4:5]
 ; GCN-NEXT:    s_mov_b64 s[36:37], s[6:7]
 ; GCN-NEXT:    s_mov_b64 s[38:39], s[8:9]
-; GCN-NEXT:    s_mov_b64 s[40:41], s[10:11]
-; GCN-NEXT:    s_mov_b32 s42, s12
-; GCN-NEXT:    s_mov_b32 s43, s13
-; GCN-NEXT:    s_mov_b32 s44, s14
-; GCN-NEXT:    s_mov_b32 s45, s15
+; GCN-NEXT:    s_mov_b64 s[48:49], s[10:11]
+; GCN-NEXT:    s_mov_b32 s50, s12
+; GCN-NEXT:    s_mov_b32 s51, s13
+; GCN-NEXT:    s_mov_b32 s52, s14
+; GCN-NEXT:    s_mov_b32 s53, s15
 ; GCN-NEXT:    v_mov_b32_e32 v40, v31
 ; GCN-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GCN-NEXT:    v_mov_b32_e32 v31, v40
-; GCN-NEXT:    s_mov_b32 s12, s42
-; GCN-NEXT:    s_mov_b32 s13, s43
-; GCN-NEXT:    s_mov_b32 s14, s44
-; GCN-NEXT:    s_mov_b32 s15, s45
+; GCN-NEXT:    s_mov_b32 s12, s50
+; GCN-NEXT:    s_mov_b32 s13, s51
+; GCN-NEXT:    s_mov_b32 s14, s52
+; GCN-NEXT:    s_mov_b32 s15, s53
 ; GCN-NEXT:    s_mov_b64 s[4:5], s[34:35]
 ; GCN-NEXT:    s_mov_b64 s[6:7], s[36:37]
 ; GCN-NEXT:    s_mov_b64 s[8:9], s[38:39]
-; GCN-NEXT:    s_mov_b64 s[10:11], s[40:41]
+; GCN-NEXT:    s_mov_b64 s[10:11], s[48:49]
 ; GCN-NEXT:    v_cmp_nlt_f32_e32 vcc, 0, v0
 ; GCN-NEXT:    s_mov_b64 s[16:17], 0
-; GCN-NEXT:    s_andn2_b64 s[18:19], s[46:47], exec
+; GCN-NEXT:    s_andn2_b64 s[18:19], s[54:55], exec
 ; GCN-NEXT:    s_and_b64 s[20:21], vcc, exec
-; GCN-NEXT:    s_or_b64 s[46:47], s[18:19], s[20:21]
+; GCN-NEXT:    s_or_b64 s[54:55], s[18:19], s[20:21]
 ; GCN-NEXT:  .LBB0_4: ; %Flow2
-; GCN-NEXT:    s_and_saveexec_b64 s[18:19], s[46:47]
+; GCN-NEXT:    s_and_saveexec_b64 s[18:19], s[54:55]
 ; GCN-NEXT:    s_xor_b64 s[18:19], exec, s[18:19]
 ; GCN-NEXT:    s_cbranch_execz .LBB0_6
 ; GCN-NEXT:  ; %bb.5: ; %bb12
@@ -93,14 +93,14 @@ define hidden void @widget() {
 ; GCN-NEXT:    s_addc_u32 s17, s17, wibble at rel32@hi+12
 ; GCN-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GCN-NEXT:  .LBB0_8: ; %UnifiedReturnBlock
-; GCN-NEXT:    v_readlane_b32 s47, v41, 15
-; GCN-NEXT:    v_readlane_b32 s46, v41, 14
-; GCN-NEXT:    v_readlane_b32 s45, v41, 13
-; GCN-NEXT:    v_readlane_b32 s44, v41, 12
-; GCN-NEXT:    v_readlane_b32 s43, v41, 11
-; GCN-NEXT:    v_readlane_b32 s42, v41, 10
-; GCN-NEXT:    v_readlane_b32 s41, v41, 9
-; GCN-NEXT:    v_readlane_b32 s40, v41, 8
+; GCN-NEXT:    v_readlane_b32 s55, v41, 15
+; GCN-NEXT:    v_readlane_b32 s54, v41, 14
+; GCN-NEXT:    v_readlane_b32 s53, v41, 13
+; GCN-NEXT:    v_readlane_b32 s52, v41, 12
+; GCN-NEXT:    v_readlane_b32 s51, v41, 11
+; GCN-NEXT:    v_readlane_b32 s50, v41, 10
+; GCN-NEXT:    v_readlane_b32 s49, v41, 9
+; GCN-NEXT:    v_readlane_b32 s48, v41, 8
 ; GCN-NEXT:    v_readlane_b32 s39, v41, 7
 ; GCN-NEXT:    v_readlane_b32 s38, v41, 6
 ; GCN-NEXT:    v_readlane_b32 s37, v41, 5
@@ -119,7 +119,7 @@ define hidden void @widget() {
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ; GCN-NEXT:  .LBB0_9: ; %bb2
-; GCN-NEXT:    v_cmp_eq_u32_e64 s[46:47], 21, v0
+; GCN-NEXT:    v_cmp_eq_u32_e64 s[54:55], 21, v0
 ; GCN-NEXT:    v_cmp_ne_u32_e64 s[18:19], 21, v0
 ; GCN-NEXT:    s_mov_b64 vcc, exec
 ; GCN-NEXT:    s_cbranch_execnz .LBB0_2
@@ -274,51 +274,51 @@ define hidden void @blam() {
 ; GCN-NEXT:    v_writelane_b32 v45, s37, 5
 ; GCN-NEXT:    v_writelane_b32 v45, s38, 6
 ; GCN-NEXT:    v_writelane_b32 v45, s39, 7
-; GCN-NEXT:    v_writelane_b32 v45, s40, 8
-; GCN-NEXT:    v_writelane_b32 v45, s41, 9
-; GCN-NEXT:    v_writelane_b32 v45, s42, 10
-; GCN-NEXT:    v_writelane_b32 v45, s43, 11
-; GCN-NEXT:    v_writelane_b32 v45, s44, 12
-; GCN-NEXT:    v_writelane_b32 v45, s45, 13
-; GCN-NEXT:    v_writelane_b32 v45, s46, 14
-; GCN-NEXT:    v_writelane_b32 v45, s47, 15
-; GCN-NEXT:    v_writelane_b32 v45, s48, 16
-; GCN-NEXT:    v_writelane_b32 v45, s49, 17
-; GCN-NEXT:    v_writelane_b32 v45, s50, 18
-; GCN-NEXT:    v_writelane_b32 v45, s51, 19
-; GCN-NEXT:    v_writelane_b32 v45, s52, 20
-; GCN-NEXT:    v_writelane_b32 v45, s53, 21
-; GCN-NEXT:    v_writelane_b32 v45, s54, 22
-; GCN-NEXT:    v_writelane_b32 v45, s55, 23
-; GCN-NEXT:    v_writelane_b32 v45, s56, 24
-; GCN-NEXT:    v_writelane_b32 v45, s57, 25
+; GCN-NEXT:    v_writelane_b32 v45, s48, 8
+; GCN-NEXT:    v_writelane_b32 v45, s49, 9
+; GCN-NEXT:    v_writelane_b32 v45, s50, 10
+; GCN-NEXT:    v_writelane_b32 v45, s51, 11
+; GCN-NEXT:    v_writelane_b32 v45, s52, 12
+; GCN-NEXT:    v_writelane_b32 v45, s53, 13
+; GCN-NEXT:    v_writelane_b32 v45, s54, 14
+; GCN-NEXT:    v_writelane_b32 v45, s55, 15
+; GCN-NEXT:    v_writelane_b32 v45, s64, 16
+; GCN-NEXT:    v_writelane_b32 v45, s65, 17
+; GCN-NEXT:    v_writelane_b32 v45, s66, 18
+; GCN-NEXT:    v_writelane_b32 v45, s67, 19
+; GCN-NEXT:    v_writelane_b32 v45, s68, 20
+; GCN-NEXT:    v_writelane_b32 v45, s69, 21
+; GCN-NEXT:    v_writelane_b32 v45, s70, 22
+; GCN-NEXT:    v_writelane_b32 v45, s71, 23
+; GCN-NEXT:    v_writelane_b32 v45, s80, 24
+; GCN-NEXT:    v_writelane_b32 v45, s81, 25
 ; GCN-NEXT:    v_mov_b32_e32 v40, v31
-; GCN-NEXT:    s_mov_b32 s46, s15
-; GCN-NEXT:    s_mov_b32 s47, s14
-; GCN-NEXT:    s_mov_b32 s48, s13
-; GCN-NEXT:    s_mov_b32 s49, s12
+; GCN-NEXT:    s_mov_b32 s54, s15
+; GCN-NEXT:    s_mov_b32 s55, s14
+; GCN-NEXT:    s_mov_b32 s64, s13
+; GCN-NEXT:    s_mov_b32 s65, s12
 ; GCN-NEXT:    s_mov_b64 s[34:35], s[10:11]
 ; GCN-NEXT:    s_mov_b64 s[36:37], s[8:9]
 ; GCN-NEXT:    s_mov_b64 s[38:39], s[6:7]
-; GCN-NEXT:    s_mov_b64 s[40:41], s[4:5]
+; GCN-NEXT:    s_mov_b64 s[48:49], s[4:5]
 ; GCN-NEXT:    v_mov_b32_e32 v0, 0
 ; GCN-NEXT:    v_mov_b32_e32 v1, 0
 ; GCN-NEXT:    v_and_b32_e32 v2, 0x3ff, v40
 ; GCN-NEXT:    flat_load_dword v43, v[0:1]
 ; GCN-NEXT:    v_mov_b32_e32 v42, 0
-; GCN-NEXT:    s_mov_b64 s[50:51], 0
+; GCN-NEXT:    s_mov_b64 s[66:67], 0
 ; GCN-NEXT:    v_lshlrev_b32_e32 v41, 2, v2
 ; GCN-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_cmp_eq_f32_e64 s[52:53], 0, v43
-; GCN-NEXT:    v_cmp_neq_f32_e64 s[42:43], 0, v43
+; GCN-NEXT:    v_cmp_eq_f32_e64 s[68:69], 0, v43
+; GCN-NEXT:    v_cmp_neq_f32_e64 s[50:51], 0, v43
 ; GCN-NEXT:    v_mov_b32_e32 v44, 0x7fc00000
 ; GCN-NEXT:    s_branch .LBB1_2
 ; GCN-NEXT:  .LBB1_1: ; %Flow7
 ; GCN-NEXT:    ; in Loop: Header=BB1_2 Depth=1
 ; GCN-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GCN-NEXT:    s_and_b64 s[4:5], exec, s[4:5]
-; GCN-NEXT:    s_or_b64 s[50:51], s[4:5], s[50:51]
-; GCN-NEXT:    s_andn2_b64 exec, exec, s[50:51]
+; GCN-NEXT:    s_or_b64 s[66:67], s[4:5], s[66:67]
+; GCN-NEXT:    s_andn2_b64 exec, exec, s[66:67]
 ; GCN-NEXT:    s_cbranch_execz .LBB1_18
 ; GCN-NEXT:  .LBB1_2: ; %bb2
 ; GCN-NEXT:    ; =>This Inner Loop Header: Depth=1
@@ -329,26 +329,26 @@ define hidden void @blam() {
 ; GCN-NEXT:    v_cmp_lt_i32_e32 vcc, 2, v0
 ; GCN-NEXT:    s_mov_b64 s[4:5], -1
 ; GCN-NEXT:    s_and_saveexec_b64 s[8:9], vcc
-; GCN-NEXT:    s_xor_b64 s[54:55], exec, s[8:9]
+; GCN-NEXT:    s_xor_b64 s[70:71], exec, s[8:9]
 ; GCN-NEXT:    s_cbranch_execz .LBB1_12
 ; GCN-NEXT:  ; %bb.3: ; %bb6
 ; GCN-NEXT:    ; in Loop: Header=BB1_2 Depth=1
-; GCN-NEXT:    v_cmp_eq_u32_e64 s[44:45], 3, v0
-; GCN-NEXT:    s_and_saveexec_b64 s[56:57], s[44:45]
+; GCN-NEXT:    v_cmp_eq_u32_e64 s[52:53], 3, v0
+; GCN-NEXT:    s_and_saveexec_b64 s[80:81], s[52:53]
 ; GCN-NEXT:    s_cbranch_execz .LBB1_11
 ; GCN-NEXT:  ; %bb.4: ; %bb11
 ; GCN-NEXT:    ; in Loop: Header=BB1_2 Depth=1
 ; GCN-NEXT:    s_getpc_b64 s[16:17]
 ; GCN-NEXT:    s_add_u32 s16, s16, spam at rel32@lo+4
 ; GCN-NEXT:    s_addc_u32 s17, s17, spam at rel32@hi+12
-; GCN-NEXT:    s_mov_b64 s[4:5], s[40:41]
+; GCN-NEXT:    s_mov_b64 s[4:5], s[48:49]
 ; GCN-NEXT:    s_mov_b64 s[6:7], s[38:39]
 ; GCN-NEXT:    s_mov_b64 s[8:9], s[36:37]
 ; GCN-NEXT:    s_mov_b64 s[10:11], s[34:35]
-; GCN-NEXT:    s_mov_b32 s12, s49
-; GCN-NEXT:    s_mov_b32 s13, s48
-; GCN-NEXT:    s_mov_b32 s14, s47
-; GCN-NEXT:    s_mov_b32 s15, s46
+; GCN-NEXT:    s_mov_b32 s12, s65
+; GCN-NEXT:    s_mov_b32 s13, s64
+; GCN-NEXT:    s_mov_b32 s14, s55
+; GCN-NEXT:    s_mov_b32 s15, s54
 ; GCN-NEXT:    v_mov_b32_e32 v31, v40
 ; GCN-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GCN-NEXT:    v_cmp_neq_f32_e32 vcc, 0, v0
@@ -357,13 +357,13 @@ define hidden void @blam() {
 ; GCN-NEXT:    s_cbranch_execz .LBB1_10
 ; GCN-NEXT:  ; %bb.5: ; %bb14
 ; GCN-NEXT:    ; in Loop: Header=BB1_2 Depth=1
-; GCN-NEXT:    s_mov_b64 s[8:9], s[52:53]
-; GCN-NEXT:    s_and_saveexec_b64 s[6:7], s[42:43]
+; GCN-NEXT:    s_mov_b64 s[8:9], s[68:69]
+; GCN-NEXT:    s_and_saveexec_b64 s[6:7], s[50:51]
 ; GCN-NEXT:    s_cbranch_execz .LBB1_7
 ; GCN-NEXT:  ; %bb.6: ; %bb16
 ; GCN-NEXT:    ; in Loop: Header=BB1_2 Depth=1
 ; GCN-NEXT:    buffer_store_dword v44, off, s[0:3], 0
-; GCN-NEXT:    s_or_b64 s[8:9], s[52:53], exec
+; GCN-NEXT:    s_or_b64 s[8:9], s[68:69], exec
 ; GCN-NEXT:  .LBB1_7: ; %Flow3
 ; GCN-NEXT:    ; in Loop: Header=BB1_2 Depth=1
 ; GCN-NEXT:    s_or_b64 exec, exec, s[6:7]
@@ -382,19 +382,19 @@ define hidden void @blam() {
 ; GCN-NEXT:  .LBB1_10: ; %Flow2
 ; GCN-NEXT:    ; in Loop: Header=BB1_2 Depth=1
 ; GCN-NEXT:    s_or_b64 exec, exec, s[4:5]
-; GCN-NEXT:    s_andn2_b64 s[4:5], s[44:45], exec
+; GCN-NEXT:    s_andn2_b64 s[4:5], s[52:53], exec
 ; GCN-NEXT:    s_and_b64 s[8:9], vcc, exec
-; GCN-NEXT:    s_or_b64 s[44:45], s[4:5], s[8:9]
+; GCN-NEXT:    s_or_b64 s[52:53], s[4:5], s[8:9]
 ; GCN-NEXT:    s_and_b64 s[6:7], s[6:7], exec
 ; GCN-NEXT:  .LBB1_11: ; %Flow1
 ; GCN-NEXT:    ; in Loop: Header=BB1_2 Depth=1
-; GCN-NEXT:    s_or_b64 exec, exec, s[56:57]
-; GCN-NEXT:    s_orn2_b64 s[4:5], s[44:45], exec
+; GCN-NEXT:    s_or_b64 exec, exec, s[80:81]
+; GCN-NEXT:    s_orn2_b64 s[4:5], s[52:53], exec
 ; GCN-NEXT:    s_and_b64 s[6:7], s[6:7], exec
 ; GCN-NEXT:    ; implicit-def: $vgpr0
 ; GCN-NEXT:  .LBB1_12: ; %Flow
 ; GCN-NEXT:    ; in Loop: Header=BB1_2 Depth=1
-; GCN-NEXT:    s_andn2_saveexec_b64 s[8:9], s[54:55]
+; GCN-NEXT:    s_andn2_saveexec_b64 s[8:9], s[70:71]
 ; GCN-NEXT:    s_cbranch_execz .LBB1_16
 ; GCN-NEXT:  ; %bb.13: ; %bb8
 ; GCN-NEXT:    ; in Loop: Header=BB1_2 Depth=1
@@ -426,25 +426,25 @@ define hidden void @blam() {
 ; GCN-NEXT:    s_andn2_b64 s[4:5], s[4:5], exec
 ; GCN-NEXT:    s_branch .LBB1_1
 ; GCN-NEXT:  .LBB1_18: ; %DummyReturnBlock
-; GCN-NEXT:    s_or_b64 exec, exec, s[50:51]
-; GCN-NEXT:    v_readlane_b32 s57, v45, 25
-; GCN-NEXT:    v_readlane_b32 s56, v45, 24
-; GCN-NEXT:    v_readlane_b32 s55, v45, 23
-; GCN-NEXT:    v_readlane_b32 s54, v45, 22
-; GCN-NEXT:    v_readlane_b32 s53, v45, 21
-; GCN-NEXT:    v_readlane_b32 s52, v45, 20
-; GCN-NEXT:    v_readlane_b32 s51, v45, 19
-; GCN-NEXT:    v_readlane_b32 s50, v45, 18
-; GCN-NEXT:    v_readlane_b32 s49, v45, 17
-; GCN-NEXT:    v_readlane_b32 s48, v45, 16
-; GCN-NEXT:    v_readlane_b32 s47, v45, 15
-; GCN-NEXT:    v_readlane_b32 s46, v45, 14
-; GCN-NEXT:    v_readlane_b32 s45, v45, 13
-; GCN-NEXT:    v_readlane_b32 s44, v45, 12
-; GCN-NEXT:    v_readlane_b32 s43, v45, 11
-; GCN-NEXT:    v_readlane_b32 s42, v45, 10
-; GCN-NEXT:    v_readlane_b32 s41, v45, 9
-; GCN-NEXT:    v_readlane_b32 s40, v45, 8
+; GCN-NEXT:    s_or_b64 exec, exec, s[66:67]
+; GCN-NEXT:    v_readlane_b32 s81, v45, 25
+; GCN-NEXT:    v_readlane_b32 s80, v45, 24
+; GCN-NEXT:    v_readlane_b32 s71, v45, 23
+; GCN-NEXT:    v_readlane_b32 s70, v45, 22
+; GCN-NEXT:    v_readlane_b32 s69, v45, 21
+; GCN-NEXT:    v_readlane_b32 s68, v45, 20
+; GCN-NEXT:    v_readlane_b32 s67, v45, 19
+; GCN-NEXT:    v_readlane_b32 s66, v45, 18
+; GCN-NEXT:    v_readlane_b32 s65, v45, 17
+; GCN-NEXT:    v_readlane_b32 s64, v45, 16
+; GCN-NEXT:    v_readlane_b32 s55, v45, 15
+; GCN-NEXT:    v_readlane_b32 s54, v45, 14
+; GCN-NEXT:    v_readlane_b32 s53, v45, 13
+; GCN-NEXT:    v_readlane_b32 s52, v45, 12
+; GCN-NEXT:    v_readlane_b32 s51, v45, 11
+; GCN-NEXT:    v_readlane_b32 s50, v45, 10
+; GCN-NEXT:    v_readlane_b32 s49, v45, 9
+; GCN-NEXT:    v_readlane_b32 s48, v45, 8
 ; GCN-NEXT:    v_readlane_b32 s39, v45, 7
 ; GCN-NEXT:    v_readlane_b32 s38, v45, 6
 ; GCN-NEXT:    v_readlane_b32 s37, v45, 5

diff  --git a/llvm/test/CodeGen/AMDGPU/use_restore_frame_reg.mir b/llvm/test/CodeGen/AMDGPU/use_restore_frame_reg.mir
index 8a0bf26f81d22..1e815f76ee149 100644
--- a/llvm/test/CodeGen/AMDGPU/use_restore_frame_reg.mir
+++ b/llvm/test/CodeGen/AMDGPU/use_restore_frame_reg.mir
@@ -37,100 +37,74 @@ body:             |
   ; MUBUF-LABEL: name: use_restore_frame_reg
   ; MUBUF: bb.0:
   ; MUBUF-NEXT:   successors: %bb.2(0x40000000), %bb.1(0x40000000)
-  ; MUBUF-NEXT:   liveins: $vgpr1, $vgpr2
+  ; MUBUF-NEXT:   liveins: $sgpr40, $sgpr41, $vgpr1
   ; MUBUF-NEXT: {{  $}}
-  ; MUBUF-NEXT:   $sgpr4 = COPY $sgpr33
+  ; MUBUF-NEXT:   $sgpr40 = frame-setup COPY $sgpr33
   ; MUBUF-NEXT:   $sgpr33 = frame-setup S_ADD_I32 $sgpr32, 524224, implicit-def $scc
   ; MUBUF-NEXT:   $sgpr33 = frame-setup S_AND_B32 killed $sgpr33, 4294443008, implicit-def dead $scc
-  ; MUBUF-NEXT:   $sgpr6_sgpr7 = S_XOR_SAVEEXEC_B64 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec
-  ; MUBUF-NEXT:   $sgpr5 = S_ADD_I32 $sgpr33, 9961728, implicit-def dead $scc
-  ; MUBUF-NEXT:   BUFFER_STORE_DWORD_OFFSET $vgpr2, $sgpr0_sgpr1_sgpr2_sgpr3, killed $sgpr5, 0, 0, 0, implicit $exec :: (store (s32) into %stack.20, addrspace 5)
-  ; MUBUF-NEXT:   $exec = S_MOV_B64 killed $sgpr6_sgpr7
-  ; MUBUF-NEXT:   $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr4, 0, undef $vgpr2
-  ; MUBUF-NEXT:   $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr34, 1, undef $vgpr2
+  ; MUBUF-NEXT:   $sgpr41 = frame-setup COPY $sgpr34
   ; MUBUF-NEXT:   $sgpr34 = frame-setup COPY $sgpr32
   ; MUBUF-NEXT:   $sgpr32 = frame-setup S_ADD_I32 $sgpr32, 11010048, implicit-def dead $scc
   ; MUBUF-NEXT:   S_CMP_EQ_U32 0, 0, implicit-def $scc
   ; MUBUF-NEXT:   S_NOP 0, implicit-def $sgpr4, implicit-def $sgpr5, implicit-def $sgpr6, implicit-def $sgpr7, implicit-def $sgpr8, implicit-def $sgpr9, implicit-def $sgpr10, implicit-def $sgpr11, implicit-def $sgpr12, implicit-def $sgpr13, implicit-def $sgpr14, implicit-def $sgpr15, implicit-def $sgpr16, implicit-def $sgpr17, implicit-def $sgpr18, implicit-def $sgpr19, implicit-def $sgpr20, implicit-def $sgpr21, implicit-def $sgpr22, implicit-def $sgpr23, implicit-def $sgpr24, implicit-def $sgpr25, implicit-def $sgpr26, implicit-def $sgpr27, implicit-def $sgpr28, implicit-def $sgpr29, implicit-def $sgpr30, implicit-def $sgpr31, implicit-def $vcc
   ; MUBUF-NEXT:   $vgpr0 = V_LSHRREV_B32_e64 6, $sgpr33, implicit $exec
   ; MUBUF-NEXT:   $vgpr0 = V_ADD_U32_e32 8192, killed $vgpr0, implicit $exec
-  ; MUBUF-NEXT:   $vgpr3 = V_LSHRREV_B32_e64 6, $sgpr33, implicit $exec
-  ; MUBUF-NEXT:   $vgpr3 = V_ADD_U32_e32 155648, killed $vgpr3, implicit $exec
-  ; MUBUF-NEXT:   $vgpr0 = V_OR_B32_e32 killed $vgpr3, $vgpr1, implicit $exec, implicit $sgpr4, implicit $sgpr5, implicit $sgpr6, implicit $sgpr7, implicit $sgpr8, implicit $sgpr9, implicit $sgpr10, implicit $sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $sgpr16, implicit $sgpr17, implicit $sgpr18, implicit $sgpr19, implicit $sgpr20, implicit $sgpr21, implicit $sgpr22, implicit $sgpr23, implicit $sgpr24, implicit $sgpr25, implicit $sgpr26, implicit $sgpr27, implicit $sgpr28, implicit $sgpr29, implicit $sgpr30, implicit $sgpr31
+  ; MUBUF-NEXT:   $vgpr2 = V_LSHRREV_B32_e64 6, $sgpr33, implicit $exec
+  ; MUBUF-NEXT:   $vgpr2 = V_ADD_U32_e32 155648, killed $vgpr2, implicit $exec
+  ; MUBUF-NEXT:   $vgpr0 = V_OR_B32_e32 killed $vgpr2, $vgpr1, implicit $exec, implicit $sgpr4, implicit $sgpr5, implicit $sgpr6, implicit $sgpr7, implicit $sgpr8, implicit $sgpr9, implicit $sgpr10, implicit $sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $sgpr16, implicit $sgpr17, implicit $sgpr18, implicit $sgpr19, implicit $sgpr20, implicit $sgpr21, implicit $sgpr22, implicit $sgpr23, implicit $sgpr24, implicit $sgpr25, implicit $sgpr26, implicit $sgpr27, implicit $sgpr28, implicit $sgpr29, implicit $sgpr30, implicit $sgpr31
   ; MUBUF-NEXT:   S_CBRANCH_VCCNZ %bb.2, implicit $vcc
   ; MUBUF-NEXT:   S_CBRANCH_SCC1 %bb.2, implicit $scc
   ; MUBUF-NEXT: {{  $}}
   ; MUBUF-NEXT: bb.1:
   ; MUBUF-NEXT:   successors: %bb.2(0x80000000)
-  ; MUBUF-NEXT:   liveins: $vgpr2
+  ; MUBUF-NEXT:   liveins: $sgpr40, $sgpr41
   ; MUBUF-NEXT: {{  $}}
   ; MUBUF-NEXT:   S_NOP 0
   ; MUBUF-NEXT: {{  $}}
   ; MUBUF-NEXT: bb.2:
-  ; MUBUF-NEXT:   liveins: $vgpr2
+  ; MUBUF-NEXT:   liveins: $sgpr40, $sgpr41
   ; MUBUF-NEXT: {{  $}}
   ; MUBUF-NEXT:   $sgpr32 = frame-destroy COPY $sgpr34
-  ; MUBUF-NEXT:   $sgpr4 = SI_RESTORE_S32_FROM_VGPR $vgpr2, 0
-  ; MUBUF-NEXT:   $sgpr34 = SI_RESTORE_S32_FROM_VGPR $vgpr2, 1
-  ; MUBUF-NEXT:   $sgpr6_sgpr7 = S_XOR_SAVEEXEC_B64 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec
-  ; MUBUF-NEXT:   $sgpr5 = S_ADD_I32 $sgpr33, 9961728, implicit-def dead $scc
-  ; MUBUF-NEXT:   $vgpr2 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, killed $sgpr5, 0, 0, 0, implicit $exec :: (load (s32) from %stack.20, addrspace 5)
-  ; MUBUF-NEXT:   $exec = S_MOV_B64 killed $sgpr6_sgpr7
-  ; MUBUF-NEXT:   $sgpr33 = COPY $sgpr4
+  ; MUBUF-NEXT:   $sgpr34 = frame-destroy COPY $sgpr41
+  ; MUBUF-NEXT:   $sgpr33 = frame-destroy COPY $sgpr40
   ; MUBUF-NEXT:   S_ENDPGM 0
   ;
   ; FLATSCR-LABEL: name: use_restore_frame_reg
   ; FLATSCR: bb.0:
   ; FLATSCR-NEXT:   successors: %bb.2(0x40000000), %bb.1(0x40000000)
-  ; FLATSCR-NEXT:   liveins: $vgpr1, $vgpr2
+  ; FLATSCR-NEXT:   liveins: $sgpr40, $sgpr41, $vgpr1
   ; FLATSCR-NEXT: {{  $}}
-  ; FLATSCR-NEXT:   $sgpr4 = COPY $sgpr33
+  ; FLATSCR-NEXT:   $sgpr40 = frame-setup COPY $sgpr33
   ; FLATSCR-NEXT:   $sgpr33 = frame-setup S_ADD_I32 $sgpr32, 8191, implicit-def $scc
   ; FLATSCR-NEXT:   $sgpr33 = frame-setup S_AND_B32 killed $sgpr33, 4294959104, implicit-def dead $scc
-  ; FLATSCR-NEXT:   $sgpr6_sgpr7 = S_XOR_SAVEEXEC_B64 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec
-  ; FLATSCR-NEXT:   $sgpr5 = S_ADD_I32 $sgpr33, 155652, implicit-def dead $scc
-  ; FLATSCR-NEXT:   SCRATCH_STORE_DWORD_SADDR $vgpr2, killed $sgpr5, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.20, addrspace 5)
-  ; FLATSCR-NEXT:   $exec = S_MOV_B64 killed $sgpr6_sgpr7
-  ; FLATSCR-NEXT:   $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr4, 0, undef $vgpr2
-  ; FLATSCR-NEXT:   $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr34, 1, undef $vgpr2
+  ; FLATSCR-NEXT:   $sgpr41 = frame-setup COPY $sgpr34
   ; FLATSCR-NEXT:   $sgpr34 = frame-setup COPY $sgpr32
   ; FLATSCR-NEXT:   $sgpr32 = frame-setup S_ADD_I32 $sgpr32, 172032, implicit-def dead $scc
   ; FLATSCR-NEXT:   S_CMP_EQ_U32 0, 0, implicit-def $scc
   ; FLATSCR-NEXT:   S_NOP 0, implicit-def $sgpr4, implicit-def $sgpr5, implicit-def $sgpr6, implicit-def $sgpr7, implicit-def $sgpr8, implicit-def $sgpr9, implicit-def $sgpr10, implicit-def $sgpr11, implicit-def $sgpr12, implicit-def $sgpr13, implicit-def $sgpr14, implicit-def $sgpr15, implicit-def $sgpr16, implicit-def $sgpr17, implicit-def $sgpr18, implicit-def $sgpr19, implicit-def $sgpr20, implicit-def $sgpr21, implicit-def $sgpr22, implicit-def $sgpr23, implicit-def $sgpr24, implicit-def $sgpr25, implicit-def $sgpr26, implicit-def $sgpr27, implicit-def $sgpr28, implicit-def $sgpr29, implicit-def $sgpr30, implicit-def $sgpr31, implicit-def $vcc
-  ; FLATSCR-NEXT:   $sgpr33 = S_ADDC_U32 $sgpr33, 8192, implicit-def $scc, implicit $scc
-  ; FLATSCR-NEXT:   S_BITCMP1_B32 $sgpr33, 0, implicit-def $scc
-  ; FLATSCR-NEXT:   $sgpr33 = S_BITSET0_B32 0, $sgpr33
-  ; FLATSCR-NEXT:   $vgpr0 = V_MOV_B32_e32 $sgpr33, implicit $exec
-  ; FLATSCR-NEXT:   $sgpr33 = S_ADDC_U32 $sgpr33, -8192, implicit-def $scc, implicit $scc
-  ; FLATSCR-NEXT:   S_BITCMP1_B32 $sgpr33, 0, implicit-def $scc
-  ; FLATSCR-NEXT:   $sgpr33 = S_BITSET0_B32 0, $sgpr33
-  ; FLATSCR-NEXT:   $sgpr33 = S_ADDC_U32 $sgpr33, 155648, implicit-def $scc, implicit $scc
-  ; FLATSCR-NEXT:   S_BITCMP1_B32 $sgpr33, 0, implicit-def $scc
-  ; FLATSCR-NEXT:   $sgpr33 = S_BITSET0_B32 0, $sgpr33
-  ; FLATSCR-NEXT:   $vgpr0 = V_OR_B32_e32 $sgpr33, $vgpr1, implicit $exec, implicit $sgpr4, implicit $sgpr5, implicit $sgpr6, implicit $sgpr7, implicit $sgpr8, implicit $sgpr9, implicit $sgpr10, implicit $sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $sgpr16, implicit $sgpr17, implicit $sgpr18, implicit $sgpr19, implicit $sgpr20, implicit $sgpr21, implicit $sgpr22, implicit $sgpr23, implicit $sgpr24, implicit $sgpr25, implicit $sgpr26, implicit $sgpr27, implicit $sgpr28, implicit $sgpr29, implicit $sgpr30, implicit $sgpr31
-  ; FLATSCR-NEXT:   $sgpr33 = S_ADDC_U32 $sgpr33, -155648, implicit-def $scc, implicit $scc
-  ; FLATSCR-NEXT:   S_BITCMP1_B32 $sgpr33, 0, implicit-def $scc
-  ; FLATSCR-NEXT:   $sgpr33 = S_BITSET0_B32 0, $sgpr33
+  ; FLATSCR-NEXT:   $sgpr42 = S_ADDC_U32 $sgpr33, 8192, implicit-def $scc, implicit $scc
+  ; FLATSCR-NEXT:   S_BITCMP1_B32 $sgpr42, 0, implicit-def $scc
+  ; FLATSCR-NEXT:   $sgpr42 = S_BITSET0_B32 0, $sgpr42
+  ; FLATSCR-NEXT:   $vgpr0 = V_MOV_B32_e32 killed $sgpr42, implicit $exec
+  ; FLATSCR-NEXT:   $sgpr42 = S_ADDC_U32 $sgpr33, 155648, implicit-def $scc, implicit $scc
+  ; FLATSCR-NEXT:   S_BITCMP1_B32 $sgpr42, 0, implicit-def $scc
+  ; FLATSCR-NEXT:   $sgpr42 = S_BITSET0_B32 0, $sgpr42
+  ; FLATSCR-NEXT:   $vgpr0 = V_OR_B32_e32 killed $sgpr42, $vgpr1, implicit $exec, implicit $sgpr4, implicit $sgpr5, implicit $sgpr6, implicit $sgpr7, implicit $sgpr8, implicit $sgpr9, implicit $sgpr10, implicit $sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $sgpr16, implicit $sgpr17, implicit $sgpr18, implicit $sgpr19, implicit $sgpr20, implicit $sgpr21, implicit $sgpr22, implicit $sgpr23, implicit $sgpr24, implicit $sgpr25, implicit $sgpr26, implicit $sgpr27, implicit $sgpr28, implicit $sgpr29, implicit $sgpr30, implicit $sgpr31
   ; FLATSCR-NEXT:   S_CBRANCH_VCCNZ %bb.2, implicit $vcc
   ; FLATSCR-NEXT:   S_CBRANCH_SCC1 %bb.2, implicit $scc
   ; FLATSCR-NEXT: {{  $}}
   ; FLATSCR-NEXT: bb.1:
   ; FLATSCR-NEXT:   successors: %bb.2(0x80000000)
-  ; FLATSCR-NEXT:   liveins: $vgpr2
+  ; FLATSCR-NEXT:   liveins: $sgpr40, $sgpr41
   ; FLATSCR-NEXT: {{  $}}
   ; FLATSCR-NEXT:   S_NOP 0
   ; FLATSCR-NEXT: {{  $}}
   ; FLATSCR-NEXT: bb.2:
-  ; FLATSCR-NEXT:   liveins: $vgpr2
+  ; FLATSCR-NEXT:   liveins: $sgpr40, $sgpr41
   ; FLATSCR-NEXT: {{  $}}
   ; FLATSCR-NEXT:   $sgpr32 = frame-destroy COPY $sgpr34
-  ; FLATSCR-NEXT:   $sgpr4 = SI_RESTORE_S32_FROM_VGPR $vgpr2, 0
-  ; FLATSCR-NEXT:   $sgpr34 = SI_RESTORE_S32_FROM_VGPR $vgpr2, 1
-  ; FLATSCR-NEXT:   $sgpr6_sgpr7 = S_XOR_SAVEEXEC_B64 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec
-  ; FLATSCR-NEXT:   $sgpr5 = S_ADD_I32 $sgpr33, 155652, implicit-def dead $scc
-  ; FLATSCR-NEXT:   $vgpr2 = SCRATCH_LOAD_DWORD_SADDR killed $sgpr5, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.20, addrspace 5)
-  ; FLATSCR-NEXT:   $exec = S_MOV_B64 killed $sgpr6_sgpr7
-  ; FLATSCR-NEXT:   $sgpr33 = COPY $sgpr4
+  ; FLATSCR-NEXT:   $sgpr34 = frame-destroy COPY $sgpr41
+  ; FLATSCR-NEXT:   $sgpr33 = frame-destroy COPY $sgpr40
   ; FLATSCR-NEXT:   S_ENDPGM 0
   bb.0:
     liveins: $vgpr1

diff  --git a/llvm/test/CodeGen/AMDGPU/vgpr-large-tuple-alloc-error.ll b/llvm/test/CodeGen/AMDGPU/vgpr-large-tuple-alloc-error.ll
index 5ced02f28c977..2ee62d13fcc51 100644
--- a/llvm/test/CodeGen/AMDGPU/vgpr-large-tuple-alloc-error.ll
+++ b/llvm/test/CodeGen/AMDGPU/vgpr-large-tuple-alloc-error.ll
@@ -35,34 +35,18 @@ define i32 @test_tuple(<16 x i64> %0) {
 ; GFX900-NEXT:    v_writelane_b32 v63, s37, 1
 ; GFX900-NEXT:    v_writelane_b32 v63, s38, 2
 ; GFX900-NEXT:    v_writelane_b32 v63, s39, 3
-; GFX900-NEXT:    v_writelane_b32 v63, s40, 4
-; GFX900-NEXT:    v_writelane_b32 v63, s41, 5
-; GFX900-NEXT:    v_writelane_b32 v63, s42, 6
-; GFX900-NEXT:    v_writelane_b32 v63, s43, 7
-; GFX900-NEXT:    v_writelane_b32 v63, s44, 8
-; GFX900-NEXT:    v_writelane_b32 v63, s45, 9
-; GFX900-NEXT:    v_writelane_b32 v63, s46, 10
-; GFX900-NEXT:    v_writelane_b32 v63, s47, 11
-; GFX900-NEXT:    v_writelane_b32 v63, s48, 12
-; GFX900-NEXT:    v_writelane_b32 v63, s49, 13
-; GFX900-NEXT:    v_writelane_b32 v63, s50, 14
-; GFX900-NEXT:    v_writelane_b32 v63, s51, 15
-; GFX900-NEXT:    v_writelane_b32 v63, s52, 16
-; GFX900-NEXT:    v_writelane_b32 v63, s53, 17
-; GFX900-NEXT:    v_writelane_b32 v63, s54, 18
-; GFX900-NEXT:    v_writelane_b32 v63, s55, 19
-; GFX900-NEXT:    v_writelane_b32 v63, s56, 20
-; GFX900-NEXT:    v_writelane_b32 v63, s57, 21
-; GFX900-NEXT:    v_writelane_b32 v63, s58, 22
-; GFX900-NEXT:    v_writelane_b32 v63, s59, 23
-; GFX900-NEXT:    v_writelane_b32 v63, s60, 24
-; GFX900-NEXT:    v_writelane_b32 v63, s61, 25
-; GFX900-NEXT:    v_writelane_b32 v63, s62, 26
-; GFX900-NEXT:    v_writelane_b32 v63, s63, 27
-; GFX900-NEXT:    v_writelane_b32 v63, s64, 28
-; GFX900-NEXT:    v_writelane_b32 v63, s65, 29
-; GFX900-NEXT:    v_writelane_b32 v63, s66, 30
-; GFX900-NEXT:    v_writelane_b32 v63, s67, 31
+; GFX900-NEXT:    v_writelane_b32 v63, s48, 4
+; GFX900-NEXT:    v_writelane_b32 v63, s49, 5
+; GFX900-NEXT:    v_writelane_b32 v63, s50, 6
+; GFX900-NEXT:    v_writelane_b32 v63, s51, 7
+; GFX900-NEXT:    v_writelane_b32 v63, s52, 8
+; GFX900-NEXT:    v_writelane_b32 v63, s53, 9
+; GFX900-NEXT:    v_writelane_b32 v63, s54, 10
+; GFX900-NEXT:    v_writelane_b32 v63, s55, 11
+; GFX900-NEXT:    v_writelane_b32 v63, s64, 12
+; GFX900-NEXT:    v_writelane_b32 v63, s65, 13
+; GFX900-NEXT:    v_writelane_b32 v63, s66, 14
+; GFX900-NEXT:    v_writelane_b32 v63, s67, 15
 ; GFX900-NEXT:    v_mov_b32_e32 v33, v30
 ; GFX900-NEXT:    v_mov_b32_e32 v34, v29
 ; GFX900-NEXT:    v_mov_b32_e32 v35, v28
@@ -160,34 +144,18 @@ define i32 @test_tuple(<16 x i64> %0) {
 ; GFX900-NEXT:    ; kill: def $vgpr31 killed $vgpr32 killed $exec
 ; GFX900-NEXT:    ; implicit-def: $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55_sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63_sgpr64_sgpr65_sgpr66_sgpr67
 ; GFX900-NEXT:    v_mov_b32_e32 v0, 0
-; GFX900-NEXT:    v_readlane_b32 s67, v63, 31
-; GFX900-NEXT:    v_readlane_b32 s66, v63, 30
-; GFX900-NEXT:    v_readlane_b32 s65, v63, 29
-; GFX900-NEXT:    v_readlane_b32 s64, v63, 28
-; GFX900-NEXT:    v_readlane_b32 s63, v63, 27
-; GFX900-NEXT:    v_readlane_b32 s62, v63, 26
-; GFX900-NEXT:    v_readlane_b32 s61, v63, 25
-; GFX900-NEXT:    v_readlane_b32 s60, v63, 24
-; GFX900-NEXT:    v_readlane_b32 s59, v63, 23
-; GFX900-NEXT:    v_readlane_b32 s58, v63, 22
-; GFX900-NEXT:    v_readlane_b32 s57, v63, 21
-; GFX900-NEXT:    v_readlane_b32 s56, v63, 20
-; GFX900-NEXT:    v_readlane_b32 s55, v63, 19
-; GFX900-NEXT:    v_readlane_b32 s54, v63, 18
-; GFX900-NEXT:    v_readlane_b32 s53, v63, 17
-; GFX900-NEXT:    v_readlane_b32 s52, v63, 16
-; GFX900-NEXT:    v_readlane_b32 s51, v63, 15
-; GFX900-NEXT:    v_readlane_b32 s50, v63, 14
-; GFX900-NEXT:    v_readlane_b32 s49, v63, 13
-; GFX900-NEXT:    v_readlane_b32 s48, v63, 12
-; GFX900-NEXT:    v_readlane_b32 s47, v63, 11
-; GFX900-NEXT:    v_readlane_b32 s46, v63, 10
-; GFX900-NEXT:    v_readlane_b32 s45, v63, 9
-; GFX900-NEXT:    v_readlane_b32 s44, v63, 8
-; GFX900-NEXT:    v_readlane_b32 s43, v63, 7
-; GFX900-NEXT:    v_readlane_b32 s42, v63, 6
-; GFX900-NEXT:    v_readlane_b32 s41, v63, 5
-; GFX900-NEXT:    v_readlane_b32 s40, v63, 4
+; GFX900-NEXT:    v_readlane_b32 s67, v63, 15
+; GFX900-NEXT:    v_readlane_b32 s66, v63, 14
+; GFX900-NEXT:    v_readlane_b32 s65, v63, 13
+; GFX900-NEXT:    v_readlane_b32 s64, v63, 12
+; GFX900-NEXT:    v_readlane_b32 s55, v63, 11
+; GFX900-NEXT:    v_readlane_b32 s54, v63, 10
+; GFX900-NEXT:    v_readlane_b32 s53, v63, 9
+; GFX900-NEXT:    v_readlane_b32 s52, v63, 8
+; GFX900-NEXT:    v_readlane_b32 s51, v63, 7
+; GFX900-NEXT:    v_readlane_b32 s50, v63, 6
+; GFX900-NEXT:    v_readlane_b32 s49, v63, 5
+; GFX900-NEXT:    v_readlane_b32 s48, v63, 4
 ; GFX900-NEXT:    v_readlane_b32 s39, v63, 3
 ; GFX900-NEXT:    v_readlane_b32 s38, v63, 2
 ; GFX900-NEXT:    v_readlane_b32 s37, v63, 1
@@ -238,34 +206,18 @@ define i32 @test_tuple(<16 x i64> %0) {
 ; GFX906-NEXT:    v_writelane_b32 v63, s37, 1
 ; GFX906-NEXT:    v_writelane_b32 v63, s38, 2
 ; GFX906-NEXT:    v_writelane_b32 v63, s39, 3
-; GFX906-NEXT:    v_writelane_b32 v63, s40, 4
-; GFX906-NEXT:    v_writelane_b32 v63, s41, 5
-; GFX906-NEXT:    v_writelane_b32 v63, s42, 6
-; GFX906-NEXT:    v_writelane_b32 v63, s43, 7
-; GFX906-NEXT:    v_writelane_b32 v63, s44, 8
-; GFX906-NEXT:    v_writelane_b32 v63, s45, 9
-; GFX906-NEXT:    v_writelane_b32 v63, s46, 10
-; GFX906-NEXT:    v_writelane_b32 v63, s47, 11
-; GFX906-NEXT:    v_writelane_b32 v63, s48, 12
-; GFX906-NEXT:    v_writelane_b32 v63, s49, 13
-; GFX906-NEXT:    v_writelane_b32 v63, s50, 14
-; GFX906-NEXT:    v_writelane_b32 v63, s51, 15
-; GFX906-NEXT:    v_writelane_b32 v63, s52, 16
-; GFX906-NEXT:    v_writelane_b32 v63, s53, 17
-; GFX906-NEXT:    v_writelane_b32 v63, s54, 18
-; GFX906-NEXT:    v_writelane_b32 v63, s55, 19
-; GFX906-NEXT:    v_writelane_b32 v63, s56, 20
-; GFX906-NEXT:    v_writelane_b32 v63, s57, 21
-; GFX906-NEXT:    v_writelane_b32 v63, s58, 22
-; GFX906-NEXT:    v_writelane_b32 v63, s59, 23
-; GFX906-NEXT:    v_writelane_b32 v63, s60, 24
-; GFX906-NEXT:    v_writelane_b32 v63, s61, 25
-; GFX906-NEXT:    v_writelane_b32 v63, s62, 26
-; GFX906-NEXT:    v_writelane_b32 v63, s63, 27
-; GFX906-NEXT:    v_writelane_b32 v63, s64, 28
-; GFX906-NEXT:    v_writelane_b32 v63, s65, 29
-; GFX906-NEXT:    v_writelane_b32 v63, s66, 30
-; GFX906-NEXT:    v_writelane_b32 v63, s67, 31
+; GFX906-NEXT:    v_writelane_b32 v63, s48, 4
+; GFX906-NEXT:    v_writelane_b32 v63, s49, 5
+; GFX906-NEXT:    v_writelane_b32 v63, s50, 6
+; GFX906-NEXT:    v_writelane_b32 v63, s51, 7
+; GFX906-NEXT:    v_writelane_b32 v63, s52, 8
+; GFX906-NEXT:    v_writelane_b32 v63, s53, 9
+; GFX906-NEXT:    v_writelane_b32 v63, s54, 10
+; GFX906-NEXT:    v_writelane_b32 v63, s55, 11
+; GFX906-NEXT:    v_writelane_b32 v63, s64, 12
+; GFX906-NEXT:    v_writelane_b32 v63, s65, 13
+; GFX906-NEXT:    v_writelane_b32 v63, s66, 14
+; GFX906-NEXT:    v_writelane_b32 v63, s67, 15
 ; GFX906-NEXT:    v_mov_b32_e32 v33, v30
 ; GFX906-NEXT:    v_mov_b32_e32 v34, v29
 ; GFX906-NEXT:    v_mov_b32_e32 v35, v28
@@ -363,34 +315,18 @@ define i32 @test_tuple(<16 x i64> %0) {
 ; GFX906-NEXT:    ; kill: def $vgpr31 killed $vgpr32 killed $exec
 ; GFX906-NEXT:    ; implicit-def: $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55_sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63_sgpr64_sgpr65_sgpr66_sgpr67
 ; GFX906-NEXT:    v_mov_b32_e32 v0, 0
-; GFX906-NEXT:    v_readlane_b32 s67, v63, 31
-; GFX906-NEXT:    v_readlane_b32 s66, v63, 30
-; GFX906-NEXT:    v_readlane_b32 s65, v63, 29
-; GFX906-NEXT:    v_readlane_b32 s64, v63, 28
-; GFX906-NEXT:    v_readlane_b32 s63, v63, 27
-; GFX906-NEXT:    v_readlane_b32 s62, v63, 26
-; GFX906-NEXT:    v_readlane_b32 s61, v63, 25
-; GFX906-NEXT:    v_readlane_b32 s60, v63, 24
-; GFX906-NEXT:    v_readlane_b32 s59, v63, 23
-; GFX906-NEXT:    v_readlane_b32 s58, v63, 22
-; GFX906-NEXT:    v_readlane_b32 s57, v63, 21
-; GFX906-NEXT:    v_readlane_b32 s56, v63, 20
-; GFX906-NEXT:    v_readlane_b32 s55, v63, 19
-; GFX906-NEXT:    v_readlane_b32 s54, v63, 18
-; GFX906-NEXT:    v_readlane_b32 s53, v63, 17
-; GFX906-NEXT:    v_readlane_b32 s52, v63, 16
-; GFX906-NEXT:    v_readlane_b32 s51, v63, 15
-; GFX906-NEXT:    v_readlane_b32 s50, v63, 14
-; GFX906-NEXT:    v_readlane_b32 s49, v63, 13
-; GFX906-NEXT:    v_readlane_b32 s48, v63, 12
-; GFX906-NEXT:    v_readlane_b32 s47, v63, 11
-; GFX906-NEXT:    v_readlane_b32 s46, v63, 10
-; GFX906-NEXT:    v_readlane_b32 s45, v63, 9
-; GFX906-NEXT:    v_readlane_b32 s44, v63, 8
-; GFX906-NEXT:    v_readlane_b32 s43, v63, 7
-; GFX906-NEXT:    v_readlane_b32 s42, v63, 6
-; GFX906-NEXT:    v_readlane_b32 s41, v63, 5
-; GFX906-NEXT:    v_readlane_b32 s40, v63, 4
+; GFX906-NEXT:    v_readlane_b32 s67, v63, 15
+; GFX906-NEXT:    v_readlane_b32 s66, v63, 14
+; GFX906-NEXT:    v_readlane_b32 s65, v63, 13
+; GFX906-NEXT:    v_readlane_b32 s64, v63, 12
+; GFX906-NEXT:    v_readlane_b32 s55, v63, 11
+; GFX906-NEXT:    v_readlane_b32 s54, v63, 10
+; GFX906-NEXT:    v_readlane_b32 s53, v63, 9
+; GFX906-NEXT:    v_readlane_b32 s52, v63, 8
+; GFX906-NEXT:    v_readlane_b32 s51, v63, 7
+; GFX906-NEXT:    v_readlane_b32 s50, v63, 6
+; GFX906-NEXT:    v_readlane_b32 s49, v63, 5
+; GFX906-NEXT:    v_readlane_b32 s48, v63, 4
 ; GFX906-NEXT:    v_readlane_b32 s39, v63, 3
 ; GFX906-NEXT:    v_readlane_b32 s38, v63, 2
 ; GFX906-NEXT:    v_readlane_b32 s37, v63, 1
@@ -440,34 +376,18 @@ define i32 @test_tuple(<16 x i64> %0) {
 ; GFX908-NEXT:    v_writelane_b32 v62, s37, 1
 ; GFX908-NEXT:    v_writelane_b32 v62, s38, 2
 ; GFX908-NEXT:    v_writelane_b32 v62, s39, 3
-; GFX908-NEXT:    v_writelane_b32 v62, s40, 4
-; GFX908-NEXT:    v_writelane_b32 v62, s41, 5
-; GFX908-NEXT:    v_writelane_b32 v62, s42, 6
-; GFX908-NEXT:    v_writelane_b32 v62, s43, 7
-; GFX908-NEXT:    v_writelane_b32 v62, s44, 8
-; GFX908-NEXT:    v_writelane_b32 v62, s45, 9
-; GFX908-NEXT:    v_writelane_b32 v62, s46, 10
-; GFX908-NEXT:    v_writelane_b32 v62, s47, 11
-; GFX908-NEXT:    v_writelane_b32 v62, s48, 12
-; GFX908-NEXT:    v_writelane_b32 v62, s49, 13
-; GFX908-NEXT:    v_writelane_b32 v62, s50, 14
-; GFX908-NEXT:    v_writelane_b32 v62, s51, 15
-; GFX908-NEXT:    v_writelane_b32 v62, s52, 16
-; GFX908-NEXT:    v_writelane_b32 v62, s53, 17
-; GFX908-NEXT:    v_writelane_b32 v62, s54, 18
-; GFX908-NEXT:    v_writelane_b32 v62, s55, 19
-; GFX908-NEXT:    v_writelane_b32 v62, s56, 20
-; GFX908-NEXT:    v_writelane_b32 v62, s57, 21
-; GFX908-NEXT:    v_writelane_b32 v62, s58, 22
-; GFX908-NEXT:    v_writelane_b32 v62, s59, 23
-; GFX908-NEXT:    v_writelane_b32 v62, s60, 24
-; GFX908-NEXT:    v_writelane_b32 v62, s61, 25
-; GFX908-NEXT:    v_writelane_b32 v62, s62, 26
-; GFX908-NEXT:    v_writelane_b32 v62, s63, 27
-; GFX908-NEXT:    v_writelane_b32 v62, s64, 28
-; GFX908-NEXT:    v_writelane_b32 v62, s65, 29
-; GFX908-NEXT:    v_writelane_b32 v62, s66, 30
-; GFX908-NEXT:    v_writelane_b32 v62, s67, 31
+; GFX908-NEXT:    v_writelane_b32 v62, s48, 4
+; GFX908-NEXT:    v_writelane_b32 v62, s49, 5
+; GFX908-NEXT:    v_writelane_b32 v62, s50, 6
+; GFX908-NEXT:    v_writelane_b32 v62, s51, 7
+; GFX908-NEXT:    v_writelane_b32 v62, s52, 8
+; GFX908-NEXT:    v_writelane_b32 v62, s53, 9
+; GFX908-NEXT:    v_writelane_b32 v62, s54, 10
+; GFX908-NEXT:    v_writelane_b32 v62, s55, 11
+; GFX908-NEXT:    v_writelane_b32 v62, s64, 12
+; GFX908-NEXT:    v_writelane_b32 v62, s65, 13
+; GFX908-NEXT:    v_writelane_b32 v62, s66, 14
+; GFX908-NEXT:    v_writelane_b32 v62, s67, 15
 ; GFX908-NEXT:    v_mov_b32_e32 v33, v30
 ; GFX908-NEXT:    v_mov_b32_e32 v34, v29
 ; GFX908-NEXT:    v_mov_b32_e32 v35, v28
@@ -569,34 +489,18 @@ define i32 @test_tuple(<16 x i64> %0) {
 ; GFX908-NEXT:    ; kill: def $vgpr31 killed $vgpr32 killed $exec
 ; GFX908-NEXT:    ; implicit-def: $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55_sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63_sgpr64_sgpr65_sgpr66_sgpr67
 ; GFX908-NEXT:    v_mov_b32_e32 v0, 0
-; GFX908-NEXT:    v_readlane_b32 s67, v62, 31
-; GFX908-NEXT:    v_readlane_b32 s66, v62, 30
-; GFX908-NEXT:    v_readlane_b32 s65, v62, 29
-; GFX908-NEXT:    v_readlane_b32 s64, v62, 28
-; GFX908-NEXT:    v_readlane_b32 s63, v62, 27
-; GFX908-NEXT:    v_readlane_b32 s62, v62, 26
-; GFX908-NEXT:    v_readlane_b32 s61, v62, 25
-; GFX908-NEXT:    v_readlane_b32 s60, v62, 24
-; GFX908-NEXT:    v_readlane_b32 s59, v62, 23
-; GFX908-NEXT:    v_readlane_b32 s58, v62, 22
-; GFX908-NEXT:    v_readlane_b32 s57, v62, 21
-; GFX908-NEXT:    v_readlane_b32 s56, v62, 20
-; GFX908-NEXT:    v_readlane_b32 s55, v62, 19
-; GFX908-NEXT:    v_readlane_b32 s54, v62, 18
-; GFX908-NEXT:    v_readlane_b32 s53, v62, 17
-; GFX908-NEXT:    v_readlane_b32 s52, v62, 16
-; GFX908-NEXT:    v_readlane_b32 s51, v62, 15
-; GFX908-NEXT:    v_readlane_b32 s50, v62, 14
-; GFX908-NEXT:    v_readlane_b32 s49, v62, 13
-; GFX908-NEXT:    v_readlane_b32 s48, v62, 12
-; GFX908-NEXT:    v_readlane_b32 s47, v62, 11
-; GFX908-NEXT:    v_readlane_b32 s46, v62, 10
-; GFX908-NEXT:    v_readlane_b32 s45, v62, 9
-; GFX908-NEXT:    v_readlane_b32 s44, v62, 8
-; GFX908-NEXT:    v_readlane_b32 s43, v62, 7
-; GFX908-NEXT:    v_readlane_b32 s42, v62, 6
-; GFX908-NEXT:    v_readlane_b32 s41, v62, 5
-; GFX908-NEXT:    v_readlane_b32 s40, v62, 4
+; GFX908-NEXT:    v_readlane_b32 s67, v62, 15
+; GFX908-NEXT:    v_readlane_b32 s66, v62, 14
+; GFX908-NEXT:    v_readlane_b32 s65, v62, 13
+; GFX908-NEXT:    v_readlane_b32 s64, v62, 12
+; GFX908-NEXT:    v_readlane_b32 s55, v62, 11
+; GFX908-NEXT:    v_readlane_b32 s54, v62, 10
+; GFX908-NEXT:    v_readlane_b32 s53, v62, 9
+; GFX908-NEXT:    v_readlane_b32 s52, v62, 8
+; GFX908-NEXT:    v_readlane_b32 s51, v62, 7
+; GFX908-NEXT:    v_readlane_b32 s50, v62, 6
+; GFX908-NEXT:    v_readlane_b32 s49, v62, 5
+; GFX908-NEXT:    v_readlane_b32 s48, v62, 4
 ; GFX908-NEXT:    v_readlane_b32 s39, v62, 3
 ; GFX908-NEXT:    v_readlane_b32 s38, v62, 2
 ; GFX908-NEXT:    v_readlane_b32 s37, v62, 1
@@ -646,34 +550,18 @@ define i32 @test_tuple(<16 x i64> %0) {
 ; GFX90a-NEXT:    v_writelane_b32 v63, s37, 1
 ; GFX90a-NEXT:    v_writelane_b32 v63, s38, 2
 ; GFX90a-NEXT:    v_writelane_b32 v63, s39, 3
-; GFX90a-NEXT:    v_writelane_b32 v63, s40, 4
-; GFX90a-NEXT:    v_writelane_b32 v63, s41, 5
-; GFX90a-NEXT:    v_writelane_b32 v63, s42, 6
-; GFX90a-NEXT:    v_writelane_b32 v63, s43, 7
-; GFX90a-NEXT:    v_writelane_b32 v63, s44, 8
-; GFX90a-NEXT:    v_writelane_b32 v63, s45, 9
-; GFX90a-NEXT:    v_writelane_b32 v63, s46, 10
-; GFX90a-NEXT:    v_writelane_b32 v63, s47, 11
-; GFX90a-NEXT:    v_writelane_b32 v63, s48, 12
-; GFX90a-NEXT:    v_writelane_b32 v63, s49, 13
-; GFX90a-NEXT:    v_writelane_b32 v63, s50, 14
-; GFX90a-NEXT:    v_writelane_b32 v63, s51, 15
-; GFX90a-NEXT:    v_writelane_b32 v63, s52, 16
-; GFX90a-NEXT:    v_writelane_b32 v63, s53, 17
-; GFX90a-NEXT:    v_writelane_b32 v63, s54, 18
-; GFX90a-NEXT:    v_writelane_b32 v63, s55, 19
-; GFX90a-NEXT:    v_writelane_b32 v63, s56, 20
-; GFX90a-NEXT:    v_writelane_b32 v63, s57, 21
-; GFX90a-NEXT:    v_writelane_b32 v63, s58, 22
-; GFX90a-NEXT:    v_writelane_b32 v63, s59, 23
-; GFX90a-NEXT:    v_writelane_b32 v63, s60, 24
-; GFX90a-NEXT:    v_writelane_b32 v63, s61, 25
-; GFX90a-NEXT:    v_writelane_b32 v63, s62, 26
-; GFX90a-NEXT:    v_writelane_b32 v63, s63, 27
-; GFX90a-NEXT:    v_writelane_b32 v63, s64, 28
-; GFX90a-NEXT:    v_writelane_b32 v63, s65, 29
-; GFX90a-NEXT:    v_writelane_b32 v63, s66, 30
-; GFX90a-NEXT:    v_writelane_b32 v63, s67, 31
+; GFX90a-NEXT:    v_writelane_b32 v63, s48, 4
+; GFX90a-NEXT:    v_writelane_b32 v63, s49, 5
+; GFX90a-NEXT:    v_writelane_b32 v63, s50, 6
+; GFX90a-NEXT:    v_writelane_b32 v63, s51, 7
+; GFX90a-NEXT:    v_writelane_b32 v63, s52, 8
+; GFX90a-NEXT:    v_writelane_b32 v63, s53, 9
+; GFX90a-NEXT:    v_writelane_b32 v63, s54, 10
+; GFX90a-NEXT:    v_writelane_b32 v63, s55, 11
+; GFX90a-NEXT:    v_writelane_b32 v63, s64, 12
+; GFX90a-NEXT:    v_writelane_b32 v63, s65, 13
+; GFX90a-NEXT:    v_writelane_b32 v63, s66, 14
+; GFX90a-NEXT:    v_writelane_b32 v63, s67, 15
 ; GFX90a-NEXT:    v_mov_b32_e32 v33, v30
 ; GFX90a-NEXT:    v_mov_b32_e32 v34, v29
 ; GFX90a-NEXT:    v_mov_b32_e32 v35, v28
@@ -771,34 +659,18 @@ define i32 @test_tuple(<16 x i64> %0) {
 ; GFX90a-NEXT:    ; kill: def $vgpr31 killed $vgpr32 killed $exec
 ; GFX90a-NEXT:    ; implicit-def: $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55_sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63_sgpr64_sgpr65_sgpr66_sgpr67
 ; GFX90a-NEXT:    v_mov_b32_e32 v0, 0
-; GFX90a-NEXT:    v_readlane_b32 s67, v63, 31
-; GFX90a-NEXT:    v_readlane_b32 s66, v63, 30
-; GFX90a-NEXT:    v_readlane_b32 s65, v63, 29
-; GFX90a-NEXT:    v_readlane_b32 s64, v63, 28
-; GFX90a-NEXT:    v_readlane_b32 s63, v63, 27
-; GFX90a-NEXT:    v_readlane_b32 s62, v63, 26
-; GFX90a-NEXT:    v_readlane_b32 s61, v63, 25
-; GFX90a-NEXT:    v_readlane_b32 s60, v63, 24
-; GFX90a-NEXT:    v_readlane_b32 s59, v63, 23
-; GFX90a-NEXT:    v_readlane_b32 s58, v63, 22
-; GFX90a-NEXT:    v_readlane_b32 s57, v63, 21
-; GFX90a-NEXT:    v_readlane_b32 s56, v63, 20
-; GFX90a-NEXT:    v_readlane_b32 s55, v63, 19
-; GFX90a-NEXT:    v_readlane_b32 s54, v63, 18
-; GFX90a-NEXT:    v_readlane_b32 s53, v63, 17
-; GFX90a-NEXT:    v_readlane_b32 s52, v63, 16
-; GFX90a-NEXT:    v_readlane_b32 s51, v63, 15
-; GFX90a-NEXT:    v_readlane_b32 s50, v63, 14
-; GFX90a-NEXT:    v_readlane_b32 s49, v63, 13
-; GFX90a-NEXT:    v_readlane_b32 s48, v63, 12
-; GFX90a-NEXT:    v_readlane_b32 s47, v63, 11
-; GFX90a-NEXT:    v_readlane_b32 s46, v63, 10
-; GFX90a-NEXT:    v_readlane_b32 s45, v63, 9
-; GFX90a-NEXT:    v_readlane_b32 s44, v63, 8
-; GFX90a-NEXT:    v_readlane_b32 s43, v63, 7
-; GFX90a-NEXT:    v_readlane_b32 s42, v63, 6
-; GFX90a-NEXT:    v_readlane_b32 s41, v63, 5
-; GFX90a-NEXT:    v_readlane_b32 s40, v63, 4
+; GFX90a-NEXT:    v_readlane_b32 s67, v63, 15
+; GFX90a-NEXT:    v_readlane_b32 s66, v63, 14
+; GFX90a-NEXT:    v_readlane_b32 s65, v63, 13
+; GFX90a-NEXT:    v_readlane_b32 s64, v63, 12
+; GFX90a-NEXT:    v_readlane_b32 s55, v63, 11
+; GFX90a-NEXT:    v_readlane_b32 s54, v63, 10
+; GFX90a-NEXT:    v_readlane_b32 s53, v63, 9
+; GFX90a-NEXT:    v_readlane_b32 s52, v63, 8
+; GFX90a-NEXT:    v_readlane_b32 s51, v63, 7
+; GFX90a-NEXT:    v_readlane_b32 s50, v63, 6
+; GFX90a-NEXT:    v_readlane_b32 s49, v63, 5
+; GFX90a-NEXT:    v_readlane_b32 s48, v63, 4
 ; GFX90a-NEXT:    v_readlane_b32 s39, v63, 3
 ; GFX90a-NEXT:    v_readlane_b32 s38, v63, 2
 ; GFX90a-NEXT:    v_readlane_b32 s37, v63, 1

diff  --git a/llvm/test/CodeGen/MIR/AMDGPU/spill-phys-vgprs.mir b/llvm/test/CodeGen/MIR/AMDGPU/spill-phys-vgprs.mir
index 4d6e33cf0b68a..b427c5bdd7229 100644
--- a/llvm/test/CodeGen/MIR/AMDGPU/spill-phys-vgprs.mir
+++ b/llvm/test/CodeGen/MIR/AMDGPU/spill-phys-vgprs.mir
@@ -1,8 +1,7 @@
 # RUN: llc -mtriple=amdgcn-amd-amdhsa --start-before=si-lower-sgpr-spills --stop-after=prologepilog -o - %s | FileCheck %s
 
 # CHECK: csr_sgpr_spill
-# CHECK: spillPhysVGPRs
-# CHECK-NEXT: - '$vgpr0'
+# CHECK-NOT: spillPhysVGPRs
 ---
 name: csr_sgpr_spill
 tracksRegLiveness: true


        


More information about the llvm-commits mailing list