[llvm] 3395676 - [AMDGPU] Fold copies of constant physical registers into their uses (#154183)

via llvm-commits llvm-commits at lists.llvm.org
Mon Aug 18 13:07:40 PDT 2025


Author: Stanislav Mekhanoshin
Date: 2025-08-18T13:07:36-07:00
New Revision: 3395676a18ab580f21ebcd4324feaf1294a8b6d9

URL: https://github.com/llvm/llvm-project/commit/3395676a18ab580f21ebcd4324feaf1294a8b6d9
DIFF: https://github.com/llvm/llvm-project/commit/3395676a18ab580f21ebcd4324feaf1294a8b6d9.diff

LOG: [AMDGPU] Fold copies of constant physical registers into their uses (#154183)

With current codegen this only affects src_flat_scratch_base_lo/hi.

Co-authored-by: Jay Foad <Jay.Foad at amd.com>

Co-authored-by: Jay Foad <Jay.Foad at amd.com>

Added: 
    

Modified: 
    llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
    llvm/test/CodeGen/AMDGPU/addrspacecast-gas.ll
    llvm/test/CodeGen/AMDGPU/atomics-system-scope.ll
    llvm/test/CodeGen/AMDGPU/flat-saddr-atomics.ll
    llvm/test/CodeGen/AMDGPU/llvm.amdgcn.is.private.ll
    llvm/test/CodeGen/AMDGPU/scale-offset-flat.ll

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
index 962c276bc2123..66d1126eb4151 100644
--- a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
@@ -709,7 +709,10 @@ bool SIFoldOperandsImpl::updateOperand(FoldCandidate &Fold) const {
   // 16-bit SGPRs instead of 32-bit ones.
   if (Old.getSubReg() == AMDGPU::lo16 && TRI->isSGPRReg(*MRI, New->getReg()))
     Old.setSubReg(AMDGPU::NoSubRegister);
-  Old.substVirtReg(New->getReg(), New->getSubReg(), *TRI);
+  if (New->getReg().isPhysical())
+    Old.substPhysReg(New->getReg(), *TRI);
+  else
+    Old.substVirtReg(New->getReg(), New->getSubReg(), *TRI);
   Old.setIsUndef(New->isUndef());
   return true;
 }
@@ -1986,7 +1989,9 @@ bool SIFoldOperandsImpl::tryFoldFoldableCopy(
   if (!FoldingImm && !OpToFold.isReg())
     return false;
 
-  if (OpToFold.isReg() && !OpToFold.getReg().isVirtual())
+  // Fold virtual registers and constant physical registers.
+  if (OpToFold.isReg() && OpToFold.getReg().isPhysical() &&
+      !TRI->isConstantPhysReg(OpToFold.getReg()))
     return false;
 
   // Prevent folding operands backwards in the function. For example,

diff  --git a/llvm/test/CodeGen/AMDGPU/addrspacecast-gas.ll b/llvm/test/CodeGen/AMDGPU/addrspacecast-gas.ll
index 4b6375cc60800..b4b49e90dca02 100644
--- a/llvm/test/CodeGen/AMDGPU/addrspacecast-gas.ll
+++ b/llvm/test/CodeGen/AMDGPU/addrspacecast-gas.ll
@@ -9,15 +9,14 @@ target triple = "amdgcn-amd-amdhsa"
 define amdgpu_kernel void @use_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) {
 ; GFX1250-SDAG-LABEL: use_private_to_flat_addrspacecast:
 ; GFX1250-SDAG:       ; %bb.0:
-; GFX1250-SDAG-NEXT:    s_load_b32 s2, s[4:5], 0x24
+; GFX1250-SDAG-NEXT:    s_load_b32 s0, s[4:5], 0x24
 ; GFX1250-SDAG-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, 0
-; GFX1250-SDAG-NEXT:    s_mov_b64 s[0:1], src_flat_scratch_base_lo
 ; GFX1250-SDAG-NEXT:    s_wait_kmcnt 0x0
 ; GFX1250-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX1250-SDAG-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_lshlrev_b32 v1, 20, v0
-; GFX1250-SDAG-NEXT:    s_cmp_lg_u32 s2, -1
+; GFX1250-SDAG-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_lshlrev_b32 v1, 20, v0
+; GFX1250-SDAG-NEXT:    s_cmp_lg_u32 s0, -1
 ; GFX1250-SDAG-NEXT:    s_cselect_b32 vcc_lo, -1, 0
-; GFX1250-SDAG-NEXT:    v_add_nc_u64_e32 v[0:1], s[0:1], v[0:1]
+; GFX1250-SDAG-NEXT:    v_add_nc_u64_e32 v[0:1], src_flat_scratch_base_lo, v[0:1]
 ; GFX1250-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX1250-SDAG-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_cndmask_b32 v1, 0, v1
 ; GFX1250-SDAG-NEXT:    v_cndmask_b32_e32 v0, 0, v0, vcc_lo
@@ -27,20 +26,20 @@ define amdgpu_kernel void @use_private_to_flat_addrspacecast(ptr addrspace(5) %p
 ;
 ; GFX1250-GISEL-LABEL: use_private_to_flat_addrspacecast:
 ; GFX1250-GISEL:       ; %bb.0:
-; GFX1250-GISEL-NEXT:    s_load_b32 s2, s[4:5], 0x24
-; GFX1250-GISEL-NEXT:    s_mov_b64 s[0:1], src_flat_scratch_base_lo
+; GFX1250-GISEL-NEXT:    s_load_b32 s0, s[4:5], 0x24
+; GFX1250-GISEL-NEXT:    v_mov_b64_e32 v[0:1], src_flat_scratch_base_lo
 ; GFX1250-GISEL-NEXT:    v_mbcnt_lo_u32_b32 v2, -1, 0
-; GFX1250-GISEL-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX1250-GISEL-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-GISEL-NEXT:    s_cmp_lg_u32 s2, -1
-; GFX1250-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
-; GFX1250-GISEL-NEXT:    v_add_co_u32 v0, vcc_lo, s2, v0
+; GFX1250-GISEL-NEXT:    s_cmp_lg_u32 s0, -1
+; GFX1250-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1250-GISEL-NEXT:    v_add_co_u32 v0, vcc_lo, s0, v0
 ; GFX1250-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
-; GFX1250-GISEL-NEXT:    s_cselect_b32 s0, 1, 0
-; GFX1250-GISEL-NEXT:    s_and_b32 s0, 1, s0
-; GFX1250-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1250-GISEL-NEXT:    s_cselect_b32 s1, 1, 0
+; GFX1250-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-GISEL-NEXT:    s_and_b32 s1, 1, s1
 ; GFX1250-GISEL-NEXT:    v_add_co_ci_u32_e64 v1, null, v2, v1, vcc_lo
-; GFX1250-GISEL-NEXT:    v_cmp_ne_u32_e64 vcc_lo, 0, s0
+; GFX1250-GISEL-NEXT:    v_cmp_ne_u32_e64 vcc_lo, 0, s1
+; GFX1250-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX1250-GISEL-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_cndmask_b32 v1, 0, v1
 ; GFX1250-GISEL-NEXT:    v_cndmask_b32_e32 v0, 0, v0, vcc_lo
 ; GFX1250-GISEL-NEXT:    flat_store_b32 v[0:1], v2 scope:SCOPE_SYS
@@ -56,27 +55,24 @@ define amdgpu_kernel void @use_private_to_flat_addrspacecast_nonnull(ptr addrspa
 ; GFX1250-SDAG:       ; %bb.0:
 ; GFX1250-SDAG-NEXT:    s_load_b32 s0, s[4:5], 0x24
 ; GFX1250-SDAG-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, 0
-; GFX1250-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
 ; GFX1250-SDAG-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v1, 20, v0
 ; GFX1250-SDAG-NEXT:    s_wait_kmcnt 0x0
 ; GFX1250-SDAG-NEXT:    v_mov_b32_e32 v0, s0
-; GFX1250-SDAG-NEXT:    s_mov_b64 s[0:1], src_flat_scratch_base_lo
-; GFX1250-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
-; GFX1250-SDAG-NEXT:    v_add_nc_u64_e32 v[0:1], s[0:1], v[0:1]
+; GFX1250-SDAG-NEXT:    v_add_nc_u64_e32 v[0:1], src_flat_scratch_base_lo, v[0:1]
 ; GFX1250-SDAG-NEXT:    flat_store_b32 v[0:1], v2 scope:SCOPE_SYS
 ; GFX1250-SDAG-NEXT:    s_wait_storecnt 0x0
 ; GFX1250-SDAG-NEXT:    s_endpgm
 ;
 ; GFX1250-GISEL-LABEL: use_private_to_flat_addrspacecast_nonnull:
 ; GFX1250-GISEL:       ; %bb.0:
-; GFX1250-GISEL-NEXT:    s_load_b32 s2, s[4:5], 0x24
-; GFX1250-GISEL-NEXT:    s_mov_b64 s[0:1], src_flat_scratch_base_lo
+; GFX1250-GISEL-NEXT:    s_load_b32 s0, s[4:5], 0x24
 ; GFX1250-GISEL-NEXT:    v_mbcnt_lo_u32_b32 v2, -1, 0
-; GFX1250-GISEL-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
+; GFX1250-GISEL-NEXT:    v_mov_b64_e32 v[0:1], src_flat_scratch_base_lo
 ; GFX1250-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
 ; GFX1250-GISEL-NEXT:    v_dual_mov_b32 v3, 0 :: v_dual_lshlrev_b32 v2, 20, v2
 ; GFX1250-GISEL-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-GISEL-NEXT:    v_add_co_u32 v0, vcc_lo, s2, v0
+; GFX1250-GISEL-NEXT:    v_add_co_u32 v0, vcc_lo, s0, v0
 ; GFX1250-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1250-GISEL-NEXT:    v_add_co_ci_u32_e64 v1, null, v2, v1, vcc_lo
 ; GFX1250-GISEL-NEXT:    flat_store_b32 v[0:1], v3 scope:SCOPE_SYS
@@ -91,10 +87,9 @@ define amdgpu_kernel void @use_flat_to_private_addrspacecast(ptr %ptr) {
 ; GFX1250-LABEL: use_flat_to_private_addrspacecast:
 ; GFX1250:       ; %bb.0:
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
-; GFX1250-NEXT:    s_mov_b32 s2, src_flat_scratch_base_lo
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    s_sub_co_i32 s2, s0, s2
+; GFX1250-NEXT:    s_sub_co_i32 s2, s0, src_flat_scratch_base_lo
 ; GFX1250-NEXT:    s_cmp_lg_u64 s[0:1], 0
 ; GFX1250-NEXT:    s_cselect_b32 s0, s2, -1
 ; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 scope:SCOPE_SYS
@@ -110,9 +105,8 @@ define amdgpu_kernel void @use_flat_to_private_addrspacecast_nonnull(ptr %ptr) {
 ; GFX1250-SDAG:       ; %bb.0:
 ; GFX1250-SDAG-NEXT:    s_load_b32 s0, s[4:5], 0x24
 ; GFX1250-SDAG-NEXT:    v_mov_b32_e32 v0, 0
-; GFX1250-SDAG-NEXT:    s_mov_b32 s1, src_flat_scratch_base_lo
 ; GFX1250-SDAG-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-SDAG-NEXT:    s_sub_co_i32 s0, s0, s1
+; GFX1250-SDAG-NEXT:    s_sub_co_i32 s0, s0, src_flat_scratch_base_lo
 ; GFX1250-SDAG-NEXT:    scratch_store_b32 off, v0, s0 scope:SCOPE_SYS
 ; GFX1250-SDAG-NEXT:    s_wait_storecnt 0x0
 ; GFX1250-SDAG-NEXT:    s_endpgm
@@ -122,9 +116,7 @@ define amdgpu_kernel void @use_flat_to_private_addrspacecast_nonnull(ptr %ptr) {
 ; GFX1250-GISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
 ; GFX1250-GISEL-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-GISEL-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-GISEL-NEXT:    s_mov_b32 s1, src_flat_scratch_base_lo
-; GFX1250-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1250-GISEL-NEXT:    s_sub_co_i32 s0, s0, s1
+; GFX1250-GISEL-NEXT:    s_sub_co_i32 s0, s0, src_flat_scratch_base_lo
 ; GFX1250-GISEL-NEXT:    scratch_store_b32 off, v0, s0 scope:SCOPE_SYS
 ; GFX1250-GISEL-NEXT:    s_wait_storecnt 0x0
 ; GFX1250-GISEL-NEXT:    s_endpgm

diff  --git a/llvm/test/CodeGen/AMDGPU/atomics-system-scope.ll b/llvm/test/CodeGen/AMDGPU/atomics-system-scope.ll
index 5fc9f4a0f8038..817e3f01c8cdd 100644
--- a/llvm/test/CodeGen/AMDGPU/atomics-system-scope.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomics-system-scope.ll
@@ -534,58 +534,61 @@ define double @flat_system_atomic_fadd_f64(ptr %ptr, double %val) {
 ; GFX1250:       ; %bb.0:
 ; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0
 ; GFX1250-NEXT:    s_mov_b64 s[0:1], src_shared_base
 ; GFX1250-NEXT:    s_mov_b32 s0, exec_lo
-; GFX1250-NEXT:    ; implicit-def: $vgpr4_vgpr5
-; GFX1250-NEXT:    v_cmpx_ne_u32_e64 s1, v1
+; GFX1250-NEXT:    ; implicit-def: $vgpr0_vgpr1
+; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-NEXT:    v_cmpx_ne_u32_e64 s1, v5
 ; GFX1250-NEXT:    s_xor_b32 s0, exec_lo, s0
-; GFX1250-NEXT:    s_cbranch_execz .LBB34_6
-; GFX1250-NEXT:  ; %bb.1: ; %atomicrmw.check.private
-; GFX1250-NEXT:    s_mov_b32 s1, src_flat_scratch_base_hi
-; GFX1250-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-NEXT:    v_xor_b32_e32 v4, s1, v1
-; GFX1250-NEXT:    v_cmp_lt_u32_e32 vcc_lo, 0x3ffffff, v4
-; GFX1250-NEXT:    ; implicit-def: $vgpr4_vgpr5
+; GFX1250-NEXT:    s_cbranch_execnz .LBB34_3
+; GFX1250-NEXT:  ; %bb.1: ; %Flow2
+; GFX1250-NEXT:    s_and_not1_saveexec_b32 s0, s0
+; GFX1250-NEXT:    s_cbranch_execnz .LBB34_8
+; GFX1250-NEXT:  .LBB34_2: ; %atomicrmw.phi
+; GFX1250-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX1250-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-NEXT:    s_set_pc_i64 s[30:31]
+; GFX1250-NEXT:  .LBB34_3: ; %atomicrmw.check.private
+; GFX1250-NEXT:    v_xor_b32_e32 v0, src_flat_scratch_base_hi, v5
+; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1250-NEXT:    v_cmp_lt_u32_e32 vcc_lo, 0x3ffffff, v0
+; GFX1250-NEXT:    ; implicit-def: $vgpr0_vgpr1
 ; GFX1250-NEXT:    s_and_saveexec_b32 s1, vcc_lo
-; GFX1250-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX1250-NEXT:    s_xor_b32 s1, exec_lo, s1
-; GFX1250-NEXT:    s_cbranch_execz .LBB34_3
-; GFX1250-NEXT:  ; %bb.2: ; %atomicrmw.global
-; GFX1250-NEXT:    global_atomic_add_f64 v[4:5], v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX1250-NEXT:    ; implicit-def: $vgpr0_vgpr1
+; GFX1250-NEXT:    s_cbranch_execz .LBB34_5
+; GFX1250-NEXT:  ; %bb.4: ; %atomicrmw.global
+; GFX1250-NEXT:    global_atomic_add_f64 v[0:1], v[4:5], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-NEXT:    ; implicit-def: $vgpr4_vgpr5
 ; GFX1250-NEXT:    ; implicit-def: $vgpr2_vgpr3
-; GFX1250-NEXT:  .LBB34_3: ; %Flow
+; GFX1250-NEXT:  .LBB34_5: ; %Flow
 ; GFX1250-NEXT:    s_wait_xcnt 0x0
 ; GFX1250-NEXT:    s_and_not1_saveexec_b32 s1, s1
-; GFX1250-NEXT:    s_cbranch_execz .LBB34_5
-; GFX1250-NEXT:  ; %bb.4: ; %atomicrmw.private
-; GFX1250-NEXT:    s_mov_b32 s2, src_flat_scratch_base_lo
-; GFX1250-NEXT:    v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1]
+; GFX1250-NEXT:    s_cbranch_execz .LBB34_7
+; GFX1250-NEXT:  ; %bb.6: ; %atomicrmw.private
+; GFX1250-NEXT:    v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5]
 ; GFX1250-NEXT:    s_wait_loadcnt 0x0
-; GFX1250-NEXT:    v_subrev_nc_u32_e32 v4, s2, v0
+; GFX1250-NEXT:    v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v4
 ; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1250-NEXT:    v_cndmask_b32_e32 v6, -1, v4, vcc_lo
-; GFX1250-NEXT:    scratch_load_b64 v[4:5], v6, off
+; GFX1250-NEXT:    v_cndmask_b32_e32 v4, -1, v0, vcc_lo
+; GFX1250-NEXT:    scratch_load_b64 v[0:1], v4, off
 ; GFX1250-NEXT:    s_wait_loadcnt 0x0
-; GFX1250-NEXT:    v_add_f64_e32 v[0:1], v[4:5], v[2:3]
-; GFX1250-NEXT:    scratch_store_b64 v6, v[0:1], off scope:SCOPE_SE
-; GFX1250-NEXT:  .LBB34_5: ; %Flow1
+; GFX1250-NEXT:    v_add_f64_e32 v[2:3], v[0:1], v[2:3]
+; GFX1250-NEXT:    scratch_store_b64 v4, v[2:3], off scope:SCOPE_SE
+; GFX1250-NEXT:  .LBB34_7: ; %Flow1
 ; GFX1250-NEXT:    s_wait_xcnt 0x0
 ; GFX1250-NEXT:    s_or_b32 exec_lo, exec_lo, s1
-; GFX1250-NEXT:    ; implicit-def: $vgpr0_vgpr1
+; GFX1250-NEXT:    ; implicit-def: $vgpr4_vgpr5
 ; GFX1250-NEXT:    ; implicit-def: $vgpr2_vgpr3
-; GFX1250-NEXT:  .LBB34_6: ; %Flow2
 ; GFX1250-NEXT:    s_and_not1_saveexec_b32 s0, s0
-; GFX1250-NEXT:    s_cbranch_execz .LBB34_8
-; GFX1250-NEXT:  ; %bb.7: ; %atomicrmw.shared
-; GFX1250-NEXT:    v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1]
-; GFX1250-NEXT:    v_cndmask_b32_e32 v0, -1, v0, vcc_lo
+; GFX1250-NEXT:    s_cbranch_execz .LBB34_2
+; GFX1250-NEXT:  .LBB34_8: ; %atomicrmw.shared
+; GFX1250-NEXT:    v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5]
 ; GFX1250-NEXT:    s_wait_loadcnt 0x0
-; GFX1250-NEXT:    ds_add_rtn_f64 v[4:5], v0, v[2:3]
-; GFX1250-NEXT:  .LBB34_8: ; %atomicrmw.phi
+; GFX1250-NEXT:    v_cndmask_b32_e32 v0, -1, v4, vcc_lo
+; GFX1250-NEXT:    ds_add_rtn_f64 v[0:1], v0, v[2:3]
 ; GFX1250-NEXT:    s_or_b32 exec_lo, exec_lo, s0
-; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT:    v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5
+; GFX1250-NEXT:    s_wait_dscnt 0x0
 ; GFX1250-NEXT:    s_set_pc_i64 s[30:31]
   %result = atomicrmw fadd ptr %ptr, double %val monotonic
   ret double %result
@@ -596,58 +599,61 @@ define double @flat_one_as_atomic_fadd_f64(ptr %ptr, double %val) {
 ; GFX1250:       ; %bb.0:
 ; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0
 ; GFX1250-NEXT:    s_mov_b64 s[0:1], src_shared_base
 ; GFX1250-NEXT:    s_mov_b32 s0, exec_lo
-; GFX1250-NEXT:    ; implicit-def: $vgpr4_vgpr5
-; GFX1250-NEXT:    v_cmpx_ne_u32_e64 s1, v1
+; GFX1250-NEXT:    ; implicit-def: $vgpr0_vgpr1
+; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-NEXT:    v_cmpx_ne_u32_e64 s1, v5
 ; GFX1250-NEXT:    s_xor_b32 s0, exec_lo, s0
-; GFX1250-NEXT:    s_cbranch_execz .LBB35_6
-; GFX1250-NEXT:  ; %bb.1: ; %atomicrmw.check.private
-; GFX1250-NEXT:    s_mov_b32 s1, src_flat_scratch_base_hi
-; GFX1250-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-NEXT:    v_xor_b32_e32 v4, s1, v1
-; GFX1250-NEXT:    v_cmp_lt_u32_e32 vcc_lo, 0x3ffffff, v4
-; GFX1250-NEXT:    ; implicit-def: $vgpr4_vgpr5
+; GFX1250-NEXT:    s_cbranch_execnz .LBB35_3
+; GFX1250-NEXT:  ; %bb.1: ; %Flow2
+; GFX1250-NEXT:    s_and_not1_saveexec_b32 s0, s0
+; GFX1250-NEXT:    s_cbranch_execnz .LBB35_8
+; GFX1250-NEXT:  .LBB35_2: ; %atomicrmw.phi
+; GFX1250-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX1250-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-NEXT:    s_set_pc_i64 s[30:31]
+; GFX1250-NEXT:  .LBB35_3: ; %atomicrmw.check.private
+; GFX1250-NEXT:    v_xor_b32_e32 v0, src_flat_scratch_base_hi, v5
+; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1250-NEXT:    v_cmp_lt_u32_e32 vcc_lo, 0x3ffffff, v0
+; GFX1250-NEXT:    ; implicit-def: $vgpr0_vgpr1
 ; GFX1250-NEXT:    s_and_saveexec_b32 s1, vcc_lo
-; GFX1250-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX1250-NEXT:    s_xor_b32 s1, exec_lo, s1
-; GFX1250-NEXT:    s_cbranch_execz .LBB35_3
-; GFX1250-NEXT:  ; %bb.2: ; %atomicrmw.global
-; GFX1250-NEXT:    global_atomic_add_f64 v[4:5], v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX1250-NEXT:    ; implicit-def: $vgpr0_vgpr1
+; GFX1250-NEXT:    s_cbranch_execz .LBB35_5
+; GFX1250-NEXT:  ; %bb.4: ; %atomicrmw.global
+; GFX1250-NEXT:    global_atomic_add_f64 v[0:1], v[4:5], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-NEXT:    ; implicit-def: $vgpr4_vgpr5
 ; GFX1250-NEXT:    ; implicit-def: $vgpr2_vgpr3
-; GFX1250-NEXT:  .LBB35_3: ; %Flow
+; GFX1250-NEXT:  .LBB35_5: ; %Flow
 ; GFX1250-NEXT:    s_wait_xcnt 0x0
 ; GFX1250-NEXT:    s_and_not1_saveexec_b32 s1, s1
-; GFX1250-NEXT:    s_cbranch_execz .LBB35_5
-; GFX1250-NEXT:  ; %bb.4: ; %atomicrmw.private
-; GFX1250-NEXT:    s_mov_b32 s2, src_flat_scratch_base_lo
-; GFX1250-NEXT:    v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1]
+; GFX1250-NEXT:    s_cbranch_execz .LBB35_7
+; GFX1250-NEXT:  ; %bb.6: ; %atomicrmw.private
+; GFX1250-NEXT:    v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5]
 ; GFX1250-NEXT:    s_wait_loadcnt 0x0
-; GFX1250-NEXT:    v_subrev_nc_u32_e32 v4, s2, v0
+; GFX1250-NEXT:    v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v4
 ; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1250-NEXT:    v_cndmask_b32_e32 v6, -1, v4, vcc_lo
-; GFX1250-NEXT:    scratch_load_b64 v[4:5], v6, off
+; GFX1250-NEXT:    v_cndmask_b32_e32 v4, -1, v0, vcc_lo
+; GFX1250-NEXT:    scratch_load_b64 v[0:1], v4, off
 ; GFX1250-NEXT:    s_wait_loadcnt 0x0
-; GFX1250-NEXT:    v_add_f64_e32 v[0:1], v[4:5], v[2:3]
-; GFX1250-NEXT:    scratch_store_b64 v6, v[0:1], off scope:SCOPE_SE
-; GFX1250-NEXT:  .LBB35_5: ; %Flow1
+; GFX1250-NEXT:    v_add_f64_e32 v[2:3], v[0:1], v[2:3]
+; GFX1250-NEXT:    scratch_store_b64 v4, v[2:3], off scope:SCOPE_SE
+; GFX1250-NEXT:  .LBB35_7: ; %Flow1
 ; GFX1250-NEXT:    s_wait_xcnt 0x0
 ; GFX1250-NEXT:    s_or_b32 exec_lo, exec_lo, s1
-; GFX1250-NEXT:    ; implicit-def: $vgpr0_vgpr1
+; GFX1250-NEXT:    ; implicit-def: $vgpr4_vgpr5
 ; GFX1250-NEXT:    ; implicit-def: $vgpr2_vgpr3
-; GFX1250-NEXT:  .LBB35_6: ; %Flow2
 ; GFX1250-NEXT:    s_and_not1_saveexec_b32 s0, s0
-; GFX1250-NEXT:    s_cbranch_execz .LBB35_8
-; GFX1250-NEXT:  ; %bb.7: ; %atomicrmw.shared
-; GFX1250-NEXT:    v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1]
-; GFX1250-NEXT:    v_cndmask_b32_e32 v0, -1, v0, vcc_lo
+; GFX1250-NEXT:    s_cbranch_execz .LBB35_2
+; GFX1250-NEXT:  .LBB35_8: ; %atomicrmw.shared
+; GFX1250-NEXT:    v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5]
 ; GFX1250-NEXT:    s_wait_loadcnt 0x0
-; GFX1250-NEXT:    ds_add_rtn_f64 v[4:5], v0, v[2:3]
-; GFX1250-NEXT:  .LBB35_8: ; %atomicrmw.phi
+; GFX1250-NEXT:    v_cndmask_b32_e32 v0, -1, v4, vcc_lo
+; GFX1250-NEXT:    ds_add_rtn_f64 v[0:1], v0, v[2:3]
 ; GFX1250-NEXT:    s_or_b32 exec_lo, exec_lo, s0
-; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT:    v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5
+; GFX1250-NEXT:    s_wait_dscnt 0x0
 ; GFX1250-NEXT:    s_set_pc_i64 s[30:31]
   %result = atomicrmw fadd ptr %ptr, double %val syncscope("one-as") monotonic
   ret double %result
@@ -682,40 +688,42 @@ define double @flat_system_atomic_fmin_f64(ptr %ptr, double %val) {
 ; GFX1250:       ; %bb.0:
 ; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    s_mov_b32 s0, src_flat_scratch_base_hi
-; GFX1250-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-NEXT:    v_xor_b32_e32 v4, s0, v1
-; GFX1250-NEXT:    v_cmp_lt_u32_e32 vcc_lo, 0x3ffffff, v4
-; GFX1250-NEXT:    ; implicit-def: $vgpr4_vgpr5
+; GFX1250-NEXT:    v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0
+; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-NEXT:    v_xor_b32_e32 v0, src_flat_scratch_base_hi, v5
+; GFX1250-NEXT:    v_cmp_lt_u32_e32 vcc_lo, 0x3ffffff, v0
+; GFX1250-NEXT:    ; implicit-def: $vgpr0_vgpr1
 ; GFX1250-NEXT:    s_and_saveexec_b32 s0, vcc_lo
 ; GFX1250-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX1250-NEXT:    s_xor_b32 s0, exec_lo, s0
-; GFX1250-NEXT:    s_cbranch_execz .LBB38_2
-; GFX1250-NEXT:  ; %bb.1: ; %atomicrmw.global
-; GFX1250-NEXT:    flat_atomic_min_num_f64 v[4:5], v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX1250-NEXT:    ; implicit-def: $vgpr0_vgpr1
+; GFX1250-NEXT:    s_cbranch_execnz .LBB38_3
+; GFX1250-NEXT:  ; %bb.1: ; %Flow
+; GFX1250-NEXT:    s_and_not1_saveexec_b32 s0, s0
+; GFX1250-NEXT:    s_cbranch_execnz .LBB38_4
+; GFX1250-NEXT:  .LBB38_2: ; %atomicrmw.phi
+; GFX1250-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    s_set_pc_i64 s[30:31]
+; GFX1250-NEXT:  .LBB38_3: ; %atomicrmw.global
+; GFX1250-NEXT:    flat_atomic_min_num_f64 v[0:1], v[4:5], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-NEXT:    ; implicit-def: $vgpr4_vgpr5
 ; GFX1250-NEXT:    ; implicit-def: $vgpr2_vgpr3
-; GFX1250-NEXT:  .LBB38_2: ; %Flow
 ; GFX1250-NEXT:    s_wait_xcnt 0x0
 ; GFX1250-NEXT:    s_and_not1_saveexec_b32 s0, s0
-; GFX1250-NEXT:    s_cbranch_execz .LBB38_4
-; GFX1250-NEXT:  ; %bb.3: ; %atomicrmw.private
-; GFX1250-NEXT:    s_mov_b32 s1, src_flat_scratch_base_lo
-; GFX1250-NEXT:    v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1]
+; GFX1250-NEXT:    s_cbranch_execz .LBB38_2
+; GFX1250-NEXT:  .LBB38_4: ; %atomicrmw.private
+; GFX1250-NEXT:    v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5]
 ; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT:    v_subrev_nc_u32_e32 v4, s1, v0
+; GFX1250-NEXT:    v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v4
 ; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
-; GFX1250-NEXT:    v_dual_max_num_f64 v[2:3], v[2:3], v[2:3] :: v_dual_cndmask_b32 v6, -1, v4, vcc_lo
-; GFX1250-NEXT:    scratch_load_b64 v[4:5], v6, off
+; GFX1250-NEXT:    v_dual_max_num_f64 v[2:3], v[2:3], v[2:3] :: v_dual_cndmask_b32 v6, -1, v0, vcc_lo
+; GFX1250-NEXT:    scratch_load_b64 v[0:1], v6, off
 ; GFX1250-NEXT:    s_wait_loadcnt 0x0
-; GFX1250-NEXT:    v_max_num_f64_e32 v[0:1], v[4:5], v[4:5]
-; GFX1250-NEXT:    v_min_num_f64_e32 v[0:1], v[0:1], v[2:3]
-; GFX1250-NEXT:    scratch_store_b64 v6, v[0:1], off scope:SCOPE_SE
-; GFX1250-NEXT:  .LBB38_4: ; %atomicrmw.phi
+; GFX1250-NEXT:    v_max_num_f64_e32 v[4:5], v[0:1], v[0:1]
+; GFX1250-NEXT:    v_min_num_f64_e32 v[2:3], v[4:5], v[2:3]
+; GFX1250-NEXT:    scratch_store_b64 v6, v[2:3], off scope:SCOPE_SE
 ; GFX1250-NEXT:    s_wait_xcnt 0x0
 ; GFX1250-NEXT:    s_or_b32 exec_lo, exec_lo, s0
-; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT:    v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5
 ; GFX1250-NEXT:    s_set_pc_i64 s[30:31]
   %result = atomicrmw fmin ptr %ptr, double %val monotonic
   ret double %result
@@ -726,40 +734,42 @@ define double @flat_one_as_atomic_fmin_f64(ptr %ptr, double %val) {
 ; GFX1250:       ; %bb.0:
 ; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    s_mov_b32 s0, src_flat_scratch_base_hi
-; GFX1250-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-NEXT:    v_xor_b32_e32 v4, s0, v1
-; GFX1250-NEXT:    v_cmp_lt_u32_e32 vcc_lo, 0x3ffffff, v4
-; GFX1250-NEXT:    ; implicit-def: $vgpr4_vgpr5
+; GFX1250-NEXT:    v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0
+; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-NEXT:    v_xor_b32_e32 v0, src_flat_scratch_base_hi, v5
+; GFX1250-NEXT:    v_cmp_lt_u32_e32 vcc_lo, 0x3ffffff, v0
+; GFX1250-NEXT:    ; implicit-def: $vgpr0_vgpr1
 ; GFX1250-NEXT:    s_and_saveexec_b32 s0, vcc_lo
 ; GFX1250-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX1250-NEXT:    s_xor_b32 s0, exec_lo, s0
-; GFX1250-NEXT:    s_cbranch_execz .LBB39_2
-; GFX1250-NEXT:  ; %bb.1: ; %atomicrmw.global
-; GFX1250-NEXT:    flat_atomic_min_num_f64 v[4:5], v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX1250-NEXT:    ; implicit-def: $vgpr0_vgpr1
+; GFX1250-NEXT:    s_cbranch_execnz .LBB39_3
+; GFX1250-NEXT:  ; %bb.1: ; %Flow
+; GFX1250-NEXT:    s_and_not1_saveexec_b32 s0, s0
+; GFX1250-NEXT:    s_cbranch_execnz .LBB39_4
+; GFX1250-NEXT:  .LBB39_2: ; %atomicrmw.phi
+; GFX1250-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    s_set_pc_i64 s[30:31]
+; GFX1250-NEXT:  .LBB39_3: ; %atomicrmw.global
+; GFX1250-NEXT:    flat_atomic_min_num_f64 v[0:1], v[4:5], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-NEXT:    ; implicit-def: $vgpr4_vgpr5
 ; GFX1250-NEXT:    ; implicit-def: $vgpr2_vgpr3
-; GFX1250-NEXT:  .LBB39_2: ; %Flow
 ; GFX1250-NEXT:    s_wait_xcnt 0x0
 ; GFX1250-NEXT:    s_and_not1_saveexec_b32 s0, s0
-; GFX1250-NEXT:    s_cbranch_execz .LBB39_4
-; GFX1250-NEXT:  ; %bb.3: ; %atomicrmw.private
-; GFX1250-NEXT:    s_mov_b32 s1, src_flat_scratch_base_lo
-; GFX1250-NEXT:    v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1]
+; GFX1250-NEXT:    s_cbranch_execz .LBB39_2
+; GFX1250-NEXT:  .LBB39_4: ; %atomicrmw.private
+; GFX1250-NEXT:    v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5]
 ; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT:    v_subrev_nc_u32_e32 v4, s1, v0
+; GFX1250-NEXT:    v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v4
 ; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
-; GFX1250-NEXT:    v_dual_max_num_f64 v[2:3], v[2:3], v[2:3] :: v_dual_cndmask_b32 v6, -1, v4, vcc_lo
-; GFX1250-NEXT:    scratch_load_b64 v[4:5], v6, off
+; GFX1250-NEXT:    v_dual_max_num_f64 v[2:3], v[2:3], v[2:3] :: v_dual_cndmask_b32 v6, -1, v0, vcc_lo
+; GFX1250-NEXT:    scratch_load_b64 v[0:1], v6, off
 ; GFX1250-NEXT:    s_wait_loadcnt 0x0
-; GFX1250-NEXT:    v_max_num_f64_e32 v[0:1], v[4:5], v[4:5]
-; GFX1250-NEXT:    v_min_num_f64_e32 v[0:1], v[0:1], v[2:3]
-; GFX1250-NEXT:    scratch_store_b64 v6, v[0:1], off scope:SCOPE_SE
-; GFX1250-NEXT:  .LBB39_4: ; %atomicrmw.phi
+; GFX1250-NEXT:    v_max_num_f64_e32 v[4:5], v[0:1], v[0:1]
+; GFX1250-NEXT:    v_min_num_f64_e32 v[2:3], v[4:5], v[2:3]
+; GFX1250-NEXT:    scratch_store_b64 v6, v[2:3], off scope:SCOPE_SE
 ; GFX1250-NEXT:    s_wait_xcnt 0x0
 ; GFX1250-NEXT:    s_or_b32 exec_lo, exec_lo, s0
-; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT:    v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5
 ; GFX1250-NEXT:    s_set_pc_i64 s[30:31]
   %result = atomicrmw fmin ptr %ptr, double %val syncscope("one-as") monotonic
   ret double %result
@@ -794,40 +804,42 @@ define double @flat_system_atomic_fmax_f64(ptr %ptr, double %val) {
 ; GFX1250:       ; %bb.0:
 ; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    s_mov_b32 s0, src_flat_scratch_base_hi
-; GFX1250-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-NEXT:    v_xor_b32_e32 v4, s0, v1
-; GFX1250-NEXT:    v_cmp_lt_u32_e32 vcc_lo, 0x3ffffff, v4
-; GFX1250-NEXT:    ; implicit-def: $vgpr4_vgpr5
+; GFX1250-NEXT:    v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0
+; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-NEXT:    v_xor_b32_e32 v0, src_flat_scratch_base_hi, v5
+; GFX1250-NEXT:    v_cmp_lt_u32_e32 vcc_lo, 0x3ffffff, v0
+; GFX1250-NEXT:    ; implicit-def: $vgpr0_vgpr1
 ; GFX1250-NEXT:    s_and_saveexec_b32 s0, vcc_lo
 ; GFX1250-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX1250-NEXT:    s_xor_b32 s0, exec_lo, s0
-; GFX1250-NEXT:    s_cbranch_execz .LBB42_2
-; GFX1250-NEXT:  ; %bb.1: ; %atomicrmw.global
-; GFX1250-NEXT:    flat_atomic_max_num_f64 v[4:5], v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX1250-NEXT:    ; implicit-def: $vgpr0_vgpr1
+; GFX1250-NEXT:    s_cbranch_execnz .LBB42_3
+; GFX1250-NEXT:  ; %bb.1: ; %Flow
+; GFX1250-NEXT:    s_and_not1_saveexec_b32 s0, s0
+; GFX1250-NEXT:    s_cbranch_execnz .LBB42_4
+; GFX1250-NEXT:  .LBB42_2: ; %atomicrmw.phi
+; GFX1250-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    s_set_pc_i64 s[30:31]
+; GFX1250-NEXT:  .LBB42_3: ; %atomicrmw.global
+; GFX1250-NEXT:    flat_atomic_max_num_f64 v[0:1], v[4:5], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-NEXT:    ; implicit-def: $vgpr4_vgpr5
 ; GFX1250-NEXT:    ; implicit-def: $vgpr2_vgpr3
-; GFX1250-NEXT:  .LBB42_2: ; %Flow
 ; GFX1250-NEXT:    s_wait_xcnt 0x0
 ; GFX1250-NEXT:    s_and_not1_saveexec_b32 s0, s0
-; GFX1250-NEXT:    s_cbranch_execz .LBB42_4
-; GFX1250-NEXT:  ; %bb.3: ; %atomicrmw.private
-; GFX1250-NEXT:    s_mov_b32 s1, src_flat_scratch_base_lo
-; GFX1250-NEXT:    v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1]
+; GFX1250-NEXT:    s_cbranch_execz .LBB42_2
+; GFX1250-NEXT:  .LBB42_4: ; %atomicrmw.private
+; GFX1250-NEXT:    v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5]
 ; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT:    v_subrev_nc_u32_e32 v4, s1, v0
+; GFX1250-NEXT:    v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v4
 ; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
-; GFX1250-NEXT:    v_dual_max_num_f64 v[2:3], v[2:3], v[2:3] :: v_dual_cndmask_b32 v6, -1, v4, vcc_lo
-; GFX1250-NEXT:    scratch_load_b64 v[4:5], v6, off
+; GFX1250-NEXT:    v_dual_max_num_f64 v[2:3], v[2:3], v[2:3] :: v_dual_cndmask_b32 v6, -1, v0, vcc_lo
+; GFX1250-NEXT:    scratch_load_b64 v[0:1], v6, off
 ; GFX1250-NEXT:    s_wait_loadcnt 0x0
-; GFX1250-NEXT:    v_max_num_f64_e32 v[0:1], v[4:5], v[4:5]
-; GFX1250-NEXT:    v_max_num_f64_e32 v[0:1], v[0:1], v[2:3]
-; GFX1250-NEXT:    scratch_store_b64 v6, v[0:1], off scope:SCOPE_SE
-; GFX1250-NEXT:  .LBB42_4: ; %atomicrmw.phi
+; GFX1250-NEXT:    v_max_num_f64_e32 v[4:5], v[0:1], v[0:1]
+; GFX1250-NEXT:    v_max_num_f64_e32 v[2:3], v[4:5], v[2:3]
+; GFX1250-NEXT:    scratch_store_b64 v6, v[2:3], off scope:SCOPE_SE
 ; GFX1250-NEXT:    s_wait_xcnt 0x0
 ; GFX1250-NEXT:    s_or_b32 exec_lo, exec_lo, s0
-; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT:    v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5
 ; GFX1250-NEXT:    s_set_pc_i64 s[30:31]
   %result = atomicrmw fmax ptr %ptr, double %val monotonic
   ret double %result
@@ -838,40 +850,42 @@ define double @flat_one_as_atomic_fmax_f64(ptr %ptr, double %val) {
 ; GFX1250:       ; %bb.0:
 ; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    s_mov_b32 s0, src_flat_scratch_base_hi
-; GFX1250-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-NEXT:    v_xor_b32_e32 v4, s0, v1
-; GFX1250-NEXT:    v_cmp_lt_u32_e32 vcc_lo, 0x3ffffff, v4
-; GFX1250-NEXT:    ; implicit-def: $vgpr4_vgpr5
+; GFX1250-NEXT:    v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0
+; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-NEXT:    v_xor_b32_e32 v0, src_flat_scratch_base_hi, v5
+; GFX1250-NEXT:    v_cmp_lt_u32_e32 vcc_lo, 0x3ffffff, v0
+; GFX1250-NEXT:    ; implicit-def: $vgpr0_vgpr1
 ; GFX1250-NEXT:    s_and_saveexec_b32 s0, vcc_lo
 ; GFX1250-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX1250-NEXT:    s_xor_b32 s0, exec_lo, s0
-; GFX1250-NEXT:    s_cbranch_execz .LBB43_2
-; GFX1250-NEXT:  ; %bb.1: ; %atomicrmw.global
-; GFX1250-NEXT:    flat_atomic_max_num_f64 v[4:5], v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX1250-NEXT:    ; implicit-def: $vgpr0_vgpr1
+; GFX1250-NEXT:    s_cbranch_execnz .LBB43_3
+; GFX1250-NEXT:  ; %bb.1: ; %Flow
+; GFX1250-NEXT:    s_and_not1_saveexec_b32 s0, s0
+; GFX1250-NEXT:    s_cbranch_execnz .LBB43_4
+; GFX1250-NEXT:  .LBB43_2: ; %atomicrmw.phi
+; GFX1250-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    s_set_pc_i64 s[30:31]
+; GFX1250-NEXT:  .LBB43_3: ; %atomicrmw.global
+; GFX1250-NEXT:    flat_atomic_max_num_f64 v[0:1], v[4:5], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-NEXT:    ; implicit-def: $vgpr4_vgpr5
 ; GFX1250-NEXT:    ; implicit-def: $vgpr2_vgpr3
-; GFX1250-NEXT:  .LBB43_2: ; %Flow
 ; GFX1250-NEXT:    s_wait_xcnt 0x0
 ; GFX1250-NEXT:    s_and_not1_saveexec_b32 s0, s0
-; GFX1250-NEXT:    s_cbranch_execz .LBB43_4
-; GFX1250-NEXT:  ; %bb.3: ; %atomicrmw.private
-; GFX1250-NEXT:    s_mov_b32 s1, src_flat_scratch_base_lo
-; GFX1250-NEXT:    v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1]
+; GFX1250-NEXT:    s_cbranch_execz .LBB43_2
+; GFX1250-NEXT:  .LBB43_4: ; %atomicrmw.private
+; GFX1250-NEXT:    v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5]
 ; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT:    v_subrev_nc_u32_e32 v4, s1, v0
+; GFX1250-NEXT:    v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v4
 ; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
-; GFX1250-NEXT:    v_dual_max_num_f64 v[2:3], v[2:3], v[2:3] :: v_dual_cndmask_b32 v6, -1, v4, vcc_lo
-; GFX1250-NEXT:    scratch_load_b64 v[4:5], v6, off
+; GFX1250-NEXT:    v_dual_max_num_f64 v[2:3], v[2:3], v[2:3] :: v_dual_cndmask_b32 v6, -1, v0, vcc_lo
+; GFX1250-NEXT:    scratch_load_b64 v[0:1], v6, off
 ; GFX1250-NEXT:    s_wait_loadcnt 0x0
-; GFX1250-NEXT:    v_max_num_f64_e32 v[0:1], v[4:5], v[4:5]
-; GFX1250-NEXT:    v_max_num_f64_e32 v[0:1], v[0:1], v[2:3]
-; GFX1250-NEXT:    scratch_store_b64 v6, v[0:1], off scope:SCOPE_SE
-; GFX1250-NEXT:  .LBB43_4: ; %atomicrmw.phi
+; GFX1250-NEXT:    v_max_num_f64_e32 v[4:5], v[0:1], v[0:1]
+; GFX1250-NEXT:    v_max_num_f64_e32 v[2:3], v[4:5], v[2:3]
+; GFX1250-NEXT:    scratch_store_b64 v6, v[2:3], off scope:SCOPE_SE
 ; GFX1250-NEXT:    s_wait_xcnt 0x0
 ; GFX1250-NEXT:    s_or_b32 exec_lo, exec_lo, s0
-; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT:    v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5
 ; GFX1250-NEXT:    s_set_pc_i64 s[30:31]
   %result = atomicrmw fmax ptr %ptr, double %val syncscope("one-as") monotonic
   ret double %result
@@ -978,13 +992,11 @@ define i64 @flat_one_as_atomic_min_i64(ptr %ptr, i64 %val) {
 ; GFX1250:       ; %bb.0:
 ; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    s_mov_b32 s0, src_flat_scratch_base_hi
-; GFX1250-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-NEXT:    v_xor_b32_e32 v4, s0, v1
+; GFX1250-NEXT:    v_xor_b32_e32 v4, src_flat_scratch_base_hi, v1
+; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
 ; GFX1250-NEXT:    v_cmp_lt_u32_e32 vcc_lo, 0x3ffffff, v4
 ; GFX1250-NEXT:    ; implicit-def: $vgpr4_vgpr5
 ; GFX1250-NEXT:    s_and_saveexec_b32 s0, vcc_lo
-; GFX1250-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX1250-NEXT:    s_xor_b32 s0, exec_lo, s0
 ; GFX1250-NEXT:    s_cbranch_execz .LBB52_2
 ; GFX1250-NEXT:  ; %bb.1: ; %atomicrmw.global
@@ -996,10 +1008,9 @@ define i64 @flat_one_as_atomic_min_i64(ptr %ptr, i64 %val) {
 ; GFX1250-NEXT:    s_and_not1_saveexec_b32 s0, s0
 ; GFX1250-NEXT:    s_cbranch_execz .LBB52_4
 ; GFX1250-NEXT:  ; %bb.3: ; %atomicrmw.private
-; GFX1250-NEXT:    s_mov_b32 s1, src_flat_scratch_base_lo
 ; GFX1250-NEXT:    v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1]
 ; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT:    v_subrev_nc_u32_e32 v4, s1, v0
+; GFX1250-NEXT:    v_subrev_nc_u32_e32 v4, src_flat_scratch_base_lo, v0
 ; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1250-NEXT:    v_cndmask_b32_e32 v6, -1, v4, vcc_lo
 ; GFX1250-NEXT:    scratch_load_b64 v[4:5], v6, off
@@ -1021,13 +1032,11 @@ define i64 @flat_system_atomic_min_i64(ptr %ptr, i64 %val) {
 ; GFX1250:       ; %bb.0:
 ; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    s_mov_b32 s0, src_flat_scratch_base_hi
-; GFX1250-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-NEXT:    v_xor_b32_e32 v4, s0, v1
+; GFX1250-NEXT:    v_xor_b32_e32 v4, src_flat_scratch_base_hi, v1
+; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
 ; GFX1250-NEXT:    v_cmp_lt_u32_e32 vcc_lo, 0x3ffffff, v4
 ; GFX1250-NEXT:    ; implicit-def: $vgpr4_vgpr5
 ; GFX1250-NEXT:    s_and_saveexec_b32 s0, vcc_lo
-; GFX1250-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX1250-NEXT:    s_xor_b32 s0, exec_lo, s0
 ; GFX1250-NEXT:    s_cbranch_execz .LBB53_2
 ; GFX1250-NEXT:  ; %bb.1: ; %atomicrmw.global
@@ -1039,10 +1048,9 @@ define i64 @flat_system_atomic_min_i64(ptr %ptr, i64 %val) {
 ; GFX1250-NEXT:    s_and_not1_saveexec_b32 s0, s0
 ; GFX1250-NEXT:    s_cbranch_execz .LBB53_4
 ; GFX1250-NEXT:  ; %bb.3: ; %atomicrmw.private
-; GFX1250-NEXT:    s_mov_b32 s1, src_flat_scratch_base_lo
 ; GFX1250-NEXT:    v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1]
 ; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT:    v_subrev_nc_u32_e32 v4, s1, v0
+; GFX1250-NEXT:    v_subrev_nc_u32_e32 v4, src_flat_scratch_base_lo, v0
 ; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1250-NEXT:    v_cndmask_b32_e32 v6, -1, v4, vcc_lo
 ; GFX1250-NEXT:    scratch_load_b64 v[4:5], v6, off
@@ -1064,13 +1072,11 @@ define i64 @flat_one_as_atomic_max_i64(ptr %ptr, i64 %val) {
 ; GFX1250:       ; %bb.0:
 ; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    s_mov_b32 s0, src_flat_scratch_base_hi
-; GFX1250-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-NEXT:    v_xor_b32_e32 v4, s0, v1
+; GFX1250-NEXT:    v_xor_b32_e32 v4, src_flat_scratch_base_hi, v1
+; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
 ; GFX1250-NEXT:    v_cmp_lt_u32_e32 vcc_lo, 0x3ffffff, v4
 ; GFX1250-NEXT:    ; implicit-def: $vgpr4_vgpr5
 ; GFX1250-NEXT:    s_and_saveexec_b32 s0, vcc_lo
-; GFX1250-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX1250-NEXT:    s_xor_b32 s0, exec_lo, s0
 ; GFX1250-NEXT:    s_cbranch_execz .LBB54_2
 ; GFX1250-NEXT:  ; %bb.1: ; %atomicrmw.global
@@ -1082,10 +1088,9 @@ define i64 @flat_one_as_atomic_max_i64(ptr %ptr, i64 %val) {
 ; GFX1250-NEXT:    s_and_not1_saveexec_b32 s0, s0
 ; GFX1250-NEXT:    s_cbranch_execz .LBB54_4
 ; GFX1250-NEXT:  ; %bb.3: ; %atomicrmw.private
-; GFX1250-NEXT:    s_mov_b32 s1, src_flat_scratch_base_lo
 ; GFX1250-NEXT:    v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1]
 ; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT:    v_subrev_nc_u32_e32 v4, s1, v0
+; GFX1250-NEXT:    v_subrev_nc_u32_e32 v4, src_flat_scratch_base_lo, v0
 ; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1250-NEXT:    v_cndmask_b32_e32 v6, -1, v4, vcc_lo
 ; GFX1250-NEXT:    scratch_load_b64 v[4:5], v6, off
@@ -1107,13 +1112,11 @@ define i64 @flat_system_atomic_max_i64(ptr %ptr, i64 %val) {
 ; GFX1250:       ; %bb.0:
 ; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    s_mov_b32 s0, src_flat_scratch_base_hi
-; GFX1250-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-NEXT:    v_xor_b32_e32 v4, s0, v1
+; GFX1250-NEXT:    v_xor_b32_e32 v4, src_flat_scratch_base_hi, v1
+; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
 ; GFX1250-NEXT:    v_cmp_lt_u32_e32 vcc_lo, 0x3ffffff, v4
 ; GFX1250-NEXT:    ; implicit-def: $vgpr4_vgpr5
 ; GFX1250-NEXT:    s_and_saveexec_b32 s0, vcc_lo
-; GFX1250-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX1250-NEXT:    s_xor_b32 s0, exec_lo, s0
 ; GFX1250-NEXT:    s_cbranch_execz .LBB55_2
 ; GFX1250-NEXT:  ; %bb.1: ; %atomicrmw.global
@@ -1125,10 +1128,9 @@ define i64 @flat_system_atomic_max_i64(ptr %ptr, i64 %val) {
 ; GFX1250-NEXT:    s_and_not1_saveexec_b32 s0, s0
 ; GFX1250-NEXT:    s_cbranch_execz .LBB55_4
 ; GFX1250-NEXT:  ; %bb.3: ; %atomicrmw.private
-; GFX1250-NEXT:    s_mov_b32 s1, src_flat_scratch_base_lo
 ; GFX1250-NEXT:    v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1]
 ; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT:    v_subrev_nc_u32_e32 v4, s1, v0
+; GFX1250-NEXT:    v_subrev_nc_u32_e32 v4, src_flat_scratch_base_lo, v0
 ; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1250-NEXT:    v_cndmask_b32_e32 v6, -1, v4, vcc_lo
 ; GFX1250-NEXT:    scratch_load_b64 v[4:5], v6, off
@@ -1150,13 +1152,11 @@ define i64 @flat_one_as_atomic_umin_i64(ptr %ptr, i64 %val) {
 ; GFX1250:       ; %bb.0:
 ; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    s_mov_b32 s0, src_flat_scratch_base_hi
-; GFX1250-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-NEXT:    v_xor_b32_e32 v4, s0, v1
+; GFX1250-NEXT:    v_xor_b32_e32 v4, src_flat_scratch_base_hi, v1
+; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
 ; GFX1250-NEXT:    v_cmp_lt_u32_e32 vcc_lo, 0x3ffffff, v4
 ; GFX1250-NEXT:    ; implicit-def: $vgpr4_vgpr5
 ; GFX1250-NEXT:    s_and_saveexec_b32 s0, vcc_lo
-; GFX1250-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX1250-NEXT:    s_xor_b32 s0, exec_lo, s0
 ; GFX1250-NEXT:    s_cbranch_execz .LBB56_2
 ; GFX1250-NEXT:  ; %bb.1: ; %atomicrmw.global
@@ -1168,10 +1168,9 @@ define i64 @flat_one_as_atomic_umin_i64(ptr %ptr, i64 %val) {
 ; GFX1250-NEXT:    s_and_not1_saveexec_b32 s0, s0
 ; GFX1250-NEXT:    s_cbranch_execz .LBB56_4
 ; GFX1250-NEXT:  ; %bb.3: ; %atomicrmw.private
-; GFX1250-NEXT:    s_mov_b32 s1, src_flat_scratch_base_lo
 ; GFX1250-NEXT:    v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1]
 ; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT:    v_subrev_nc_u32_e32 v4, s1, v0
+; GFX1250-NEXT:    v_subrev_nc_u32_e32 v4, src_flat_scratch_base_lo, v0
 ; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1250-NEXT:    v_cndmask_b32_e32 v6, -1, v4, vcc_lo
 ; GFX1250-NEXT:    scratch_load_b64 v[4:5], v6, off
@@ -1193,13 +1192,11 @@ define i64 @flat_system_atomic_umin_i64(ptr %ptr, i64 %val) {
 ; GFX1250:       ; %bb.0:
 ; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    s_mov_b32 s0, src_flat_scratch_base_hi
-; GFX1250-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-NEXT:    v_xor_b32_e32 v4, s0, v1
+; GFX1250-NEXT:    v_xor_b32_e32 v4, src_flat_scratch_base_hi, v1
+; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
 ; GFX1250-NEXT:    v_cmp_lt_u32_e32 vcc_lo, 0x3ffffff, v4
 ; GFX1250-NEXT:    ; implicit-def: $vgpr4_vgpr5
 ; GFX1250-NEXT:    s_and_saveexec_b32 s0, vcc_lo
-; GFX1250-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX1250-NEXT:    s_xor_b32 s0, exec_lo, s0
 ; GFX1250-NEXT:    s_cbranch_execz .LBB57_2
 ; GFX1250-NEXT:  ; %bb.1: ; %atomicrmw.global
@@ -1211,10 +1208,9 @@ define i64 @flat_system_atomic_umin_i64(ptr %ptr, i64 %val) {
 ; GFX1250-NEXT:    s_and_not1_saveexec_b32 s0, s0
 ; GFX1250-NEXT:    s_cbranch_execz .LBB57_4
 ; GFX1250-NEXT:  ; %bb.3: ; %atomicrmw.private
-; GFX1250-NEXT:    s_mov_b32 s1, src_flat_scratch_base_lo
 ; GFX1250-NEXT:    v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1]
 ; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT:    v_subrev_nc_u32_e32 v4, s1, v0
+; GFX1250-NEXT:    v_subrev_nc_u32_e32 v4, src_flat_scratch_base_lo, v0
 ; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1250-NEXT:    v_cndmask_b32_e32 v6, -1, v4, vcc_lo
 ; GFX1250-NEXT:    scratch_load_b64 v[4:5], v6, off
@@ -1236,13 +1232,11 @@ define i64 @flat_one_as_atomic_umax_i64(ptr %ptr, i64 %val) {
 ; GFX1250:       ; %bb.0:
 ; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    s_mov_b32 s0, src_flat_scratch_base_hi
-; GFX1250-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-NEXT:    v_xor_b32_e32 v4, s0, v1
+; GFX1250-NEXT:    v_xor_b32_e32 v4, src_flat_scratch_base_hi, v1
+; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
 ; GFX1250-NEXT:    v_cmp_lt_u32_e32 vcc_lo, 0x3ffffff, v4
 ; GFX1250-NEXT:    ; implicit-def: $vgpr4_vgpr5
 ; GFX1250-NEXT:    s_and_saveexec_b32 s0, vcc_lo
-; GFX1250-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX1250-NEXT:    s_xor_b32 s0, exec_lo, s0
 ; GFX1250-NEXT:    s_cbranch_execz .LBB58_2
 ; GFX1250-NEXT:  ; %bb.1: ; %atomicrmw.global
@@ -1254,10 +1248,9 @@ define i64 @flat_one_as_atomic_umax_i64(ptr %ptr, i64 %val) {
 ; GFX1250-NEXT:    s_and_not1_saveexec_b32 s0, s0
 ; GFX1250-NEXT:    s_cbranch_execz .LBB58_4
 ; GFX1250-NEXT:  ; %bb.3: ; %atomicrmw.private
-; GFX1250-NEXT:    s_mov_b32 s1, src_flat_scratch_base_lo
 ; GFX1250-NEXT:    v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1]
 ; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT:    v_subrev_nc_u32_e32 v4, s1, v0
+; GFX1250-NEXT:    v_subrev_nc_u32_e32 v4, src_flat_scratch_base_lo, v0
 ; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1250-NEXT:    v_cndmask_b32_e32 v6, -1, v4, vcc_lo
 ; GFX1250-NEXT:    scratch_load_b64 v[4:5], v6, off
@@ -1279,13 +1272,11 @@ define i64 @flat_system_atomic_umax_i64(ptr %ptr, i64 %val) {
 ; GFX1250:       ; %bb.0:
 ; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    s_mov_b32 s0, src_flat_scratch_base_hi
-; GFX1250-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-NEXT:    v_xor_b32_e32 v4, s0, v1
+; GFX1250-NEXT:    v_xor_b32_e32 v4, src_flat_scratch_base_hi, v1
+; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
 ; GFX1250-NEXT:    v_cmp_lt_u32_e32 vcc_lo, 0x3ffffff, v4
 ; GFX1250-NEXT:    ; implicit-def: $vgpr4_vgpr5
 ; GFX1250-NEXT:    s_and_saveexec_b32 s0, vcc_lo
-; GFX1250-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX1250-NEXT:    s_xor_b32 s0, exec_lo, s0
 ; GFX1250-NEXT:    s_cbranch_execz .LBB59_2
 ; GFX1250-NEXT:  ; %bb.1: ; %atomicrmw.global
@@ -1297,10 +1288,9 @@ define i64 @flat_system_atomic_umax_i64(ptr %ptr, i64 %val) {
 ; GFX1250-NEXT:    s_and_not1_saveexec_b32 s0, s0
 ; GFX1250-NEXT:    s_cbranch_execz .LBB59_4
 ; GFX1250-NEXT:  ; %bb.3: ; %atomicrmw.private
-; GFX1250-NEXT:    s_mov_b32 s1, src_flat_scratch_base_lo
 ; GFX1250-NEXT:    v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1]
 ; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT:    v_subrev_nc_u32_e32 v4, s1, v0
+; GFX1250-NEXT:    v_subrev_nc_u32_e32 v4, src_flat_scratch_base_lo, v0
 ; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1250-NEXT:    v_cndmask_b32_e32 v6, -1, v4, vcc_lo
 ; GFX1250-NEXT:    scratch_load_b64 v[4:5], v6, off

diff  --git a/llvm/test/CodeGen/AMDGPU/flat-saddr-atomics.ll b/llvm/test/CodeGen/AMDGPU/flat-saddr-atomics.ll
index 004d3c0c1cf53..265848b441f69 100644
--- a/llvm/test/CodeGen/AMDGPU/flat-saddr-atomics.ll
+++ b/llvm/test/CodeGen/AMDGPU/flat-saddr-atomics.ll
@@ -252,11 +252,10 @@ define amdgpu_ps <2 x float> @flat_xchg_saddr_i64_rtn(ptr inreg %sbase, i32 %vof
 ; GFX1250-SDAG-LABEL: flat_xchg_saddr_i64_rtn:
 ; GFX1250-SDAG:       ; %bb.0:
 ; GFX1250-SDAG-NEXT:    v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1
-; GFX1250-SDAG-NEXT:    s_mov_b32 s0, src_flat_scratch_base_hi
 ; GFX1250-SDAG-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX1250-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1250-SDAG-NEXT:    v_add_nc_u64_e32 v[4:5], s[2:3], v[0:1]
-; GFX1250-SDAG-NEXT:    v_xor_b32_e32 v0, s0, v5
+; GFX1250-SDAG-NEXT:    v_xor_b32_e32 v0, src_flat_scratch_base_hi, v5
 ; GFX1250-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
 ; GFX1250-SDAG-NEXT:    v_cmp_lt_u32_e32 vcc_lo, 0x3ffffff, v0
 ; GFX1250-SDAG-NEXT:    ; implicit-def: $vgpr0_vgpr1
@@ -279,9 +278,8 @@ define amdgpu_ps <2 x float> @flat_xchg_saddr_i64_rtn(ptr inreg %sbase, i32 %vof
 ; GFX1250-SDAG-NEXT:    s_and_not1_saveexec_b32 s0, s0
 ; GFX1250-SDAG-NEXT:    s_cbranch_execz .LBB10_2
 ; GFX1250-SDAG-NEXT:  .LBB10_4: ; %atomicrmw.private
-; GFX1250-SDAG-NEXT:    s_mov_b32 s1, src_flat_scratch_base_lo
 ; GFX1250-SDAG-NEXT:    v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5]
-; GFX1250-SDAG-NEXT:    v_subrev_nc_u32_e32 v0, s1, v4
+; GFX1250-SDAG-NEXT:    v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v4
 ; GFX1250-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1250-SDAG-NEXT:    v_cndmask_b32_e32 v4, -1, v0, vcc_lo
 ; GFX1250-SDAG-NEXT:    s_clause 0x1
@@ -297,12 +295,11 @@ define amdgpu_ps <2 x float> @flat_xchg_saddr_i64_rtn(ptr inreg %sbase, i32 %vof
 ; GFX1250-GISEL:       ; %bb.0:
 ; GFX1250-GISEL-NEXT:    v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v4, v1
 ; GFX1250-GISEL-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX1250-GISEL-NEXT:    s_mov_b32 s0, src_flat_scratch_base_hi
 ; GFX1250-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1250-GISEL-NEXT:    v_add_co_u32 v6, vcc_lo, v0, v3
 ; GFX1250-GISEL-NEXT:    v_add_co_ci_u32_e64 v7, null, 0, v1, vcc_lo
 ; GFX1250-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-GISEL-NEXT:    v_dual_mov_b32 v5, v2 :: v_dual_bitop2_b32 v0, s0, v7 bitop3:0x14
+; GFX1250-GISEL-NEXT:    v_dual_mov_b32 v5, v2 :: v_dual_bitop2_b32 v0, src_flat_scratch_base_hi, v7 bitop3:0x14
 ; GFX1250-GISEL-NEXT:    v_cmp_le_u32_e32 vcc_lo, 0x4000000, v0
 ; GFX1250-GISEL-NEXT:    ; implicit-def: $vgpr0_vgpr1
 ; GFX1250-GISEL-NEXT:    s_and_saveexec_b32 s0, vcc_lo
@@ -325,9 +322,8 @@ define amdgpu_ps <2 x float> @flat_xchg_saddr_i64_rtn(ptr inreg %sbase, i32 %vof
 ; GFX1250-GISEL-NEXT:    s_and_not1_saveexec_b32 s0, s0
 ; GFX1250-GISEL-NEXT:    s_cbranch_execz .LBB10_2
 ; GFX1250-GISEL-NEXT:  .LBB10_4: ; %atomicrmw.private
-; GFX1250-GISEL-NEXT:    s_mov_b32 s1, src_flat_scratch_base_lo
 ; GFX1250-GISEL-NEXT:    v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7]
-; GFX1250-GISEL-NEXT:    v_subrev_nc_u32_e32 v0, s1, v6
+; GFX1250-GISEL-NEXT:    v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v6
 ; GFX1250-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1250-GISEL-NEXT:    v_cndmask_b32_e32 v2, -1, v0, vcc_lo
 ; GFX1250-GISEL-NEXT:    s_clause 0x1
@@ -354,13 +350,12 @@ define amdgpu_ps <2 x float> @flat_xchg_saddr_i64_rtn_neg128(ptr inreg %sbase, i
 ; GFX1250-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1250-SDAG-NEXT:    v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1]
 ; GFX1250-SDAG-NEXT:    v_add_nc_u64_e32 v[4:5], s[0:1], v[0:1]
-; GFX1250-SDAG-NEXT:    s_mov_b32 s0, src_flat_scratch_base_hi
-; GFX1250-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
-; GFX1250-SDAG-NEXT:    v_xor_b32_e32 v0, s0, v5
-; GFX1250-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1250-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-SDAG-NEXT:    v_xor_b32_e32 v0, src_flat_scratch_base_hi, v5
 ; GFX1250-SDAG-NEXT:    v_cmp_lt_u32_e32 vcc_lo, 0x3ffffff, v0
 ; GFX1250-SDAG-NEXT:    ; implicit-def: $vgpr0_vgpr1
 ; GFX1250-SDAG-NEXT:    s_and_saveexec_b32 s0, vcc_lo
+; GFX1250-SDAG-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX1250-SDAG-NEXT:    s_xor_b32 s0, exec_lo, s0
 ; GFX1250-SDAG-NEXT:    s_cbranch_execnz .LBB11_3
 ; GFX1250-SDAG-NEXT:  ; %bb.1: ; %Flow
@@ -379,9 +374,8 @@ define amdgpu_ps <2 x float> @flat_xchg_saddr_i64_rtn_neg128(ptr inreg %sbase, i
 ; GFX1250-SDAG-NEXT:    s_and_not1_saveexec_b32 s0, s0
 ; GFX1250-SDAG-NEXT:    s_cbranch_execz .LBB11_2
 ; GFX1250-SDAG-NEXT:  .LBB11_4: ; %atomicrmw.private
-; GFX1250-SDAG-NEXT:    s_mov_b32 s1, src_flat_scratch_base_lo
 ; GFX1250-SDAG-NEXT:    v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5]
-; GFX1250-SDAG-NEXT:    v_subrev_nc_u32_e32 v0, s1, v4
+; GFX1250-SDAG-NEXT:    v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v4
 ; GFX1250-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1250-SDAG-NEXT:    v_cndmask_b32_e32 v4, -1, v0, vcc_lo
 ; GFX1250-SDAG-NEXT:    s_clause 0x1
@@ -397,7 +391,6 @@ define amdgpu_ps <2 x float> @flat_xchg_saddr_i64_rtn_neg128(ptr inreg %sbase, i
 ; GFX1250-GISEL:       ; %bb.0:
 ; GFX1250-GISEL-NEXT:    v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v4, v1
 ; GFX1250-GISEL-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX1250-GISEL-NEXT:    s_mov_b32 s0, src_flat_scratch_base_hi
 ; GFX1250-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1250-GISEL-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v3
 ; GFX1250-GISEL-NEXT:    v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
@@ -405,7 +398,7 @@ define amdgpu_ps <2 x float> @flat_xchg_saddr_i64_rtn_neg128(ptr inreg %sbase, i
 ; GFX1250-GISEL-NEXT:    v_add_co_u32 v6, vcc_lo, 0xffffff80, v0
 ; GFX1250-GISEL-NEXT:    v_add_co_ci_u32_e64 v7, null, -1, v1, vcc_lo
 ; GFX1250-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-GISEL-NEXT:    v_dual_mov_b32 v5, v2 :: v_dual_bitop2_b32 v0, s0, v7 bitop3:0x14
+; GFX1250-GISEL-NEXT:    v_dual_mov_b32 v5, v2 :: v_dual_bitop2_b32 v0, src_flat_scratch_base_hi, v7 bitop3:0x14
 ; GFX1250-GISEL-NEXT:    v_cmp_le_u32_e32 vcc_lo, 0x4000000, v0
 ; GFX1250-GISEL-NEXT:    ; implicit-def: $vgpr0_vgpr1
 ; GFX1250-GISEL-NEXT:    s_and_saveexec_b32 s0, vcc_lo
@@ -428,9 +421,8 @@ define amdgpu_ps <2 x float> @flat_xchg_saddr_i64_rtn_neg128(ptr inreg %sbase, i
 ; GFX1250-GISEL-NEXT:    s_and_not1_saveexec_b32 s0, s0
 ; GFX1250-GISEL-NEXT:    s_cbranch_execz .LBB11_2
 ; GFX1250-GISEL-NEXT:  .LBB11_4: ; %atomicrmw.private
-; GFX1250-GISEL-NEXT:    s_mov_b32 s1, src_flat_scratch_base_lo
 ; GFX1250-GISEL-NEXT:    v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7]
-; GFX1250-GISEL-NEXT:    v_subrev_nc_u32_e32 v0, s1, v6
+; GFX1250-GISEL-NEXT:    v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v6
 ; GFX1250-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1250-GISEL-NEXT:    v_cndmask_b32_e32 v2, -1, v0, vcc_lo
 ; GFX1250-GISEL-NEXT:    s_clause 0x1
@@ -454,11 +446,10 @@ define amdgpu_ps void @flat_xchg_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset,
 ; GFX1250-SDAG:       ; %bb.0:
 ; GFX1250-SDAG-NEXT:    v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1
 ; GFX1250-SDAG-NEXT:    v_mov_b32_e32 v1, 0
-; GFX1250-SDAG-NEXT:    s_mov_b32 s0, src_flat_scratch_base_hi
+; GFX1250-SDAG-NEXT:    s_mov_b32 s0, exec_lo
 ; GFX1250-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1250-SDAG-NEXT:    v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1]
-; GFX1250-SDAG-NEXT:    v_xor_b32_e32 v4, s0, v1
-; GFX1250-SDAG-NEXT:    s_mov_b32 s0, exec_lo
+; GFX1250-SDAG-NEXT:    v_xor_b32_e32 v4, src_flat_scratch_base_hi, v1
 ; GFX1250-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1250-SDAG-NEXT:    v_cmpx_lt_u32_e32 0x3ffffff, v4
 ; GFX1250-SDAG-NEXT:    s_xor_b32 s0, exec_lo, s0
@@ -478,9 +469,8 @@ define amdgpu_ps void @flat_xchg_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset,
 ; GFX1250-SDAG-NEXT:    s_and_not1_saveexec_b32 s0, s0
 ; GFX1250-SDAG-NEXT:    s_cbranch_execz .LBB12_2
 ; GFX1250-SDAG-NEXT:  .LBB12_4: ; %atomicrmw.private
-; GFX1250-SDAG-NEXT:    s_mov_b32 s0, src_flat_scratch_base_lo
 ; GFX1250-SDAG-NEXT:    v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1]
-; GFX1250-SDAG-NEXT:    v_subrev_nc_u32_e32 v4, s0, v0
+; GFX1250-SDAG-NEXT:    v_subrev_nc_u32_e32 v4, src_flat_scratch_base_lo, v0
 ; GFX1250-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1250-SDAG-NEXT:    v_cndmask_b32_e32 v0, -1, v4, vcc_lo
 ; GFX1250-SDAG-NEXT:    scratch_store_b64 v0, v[2:3], off scope:SCOPE_SE
@@ -490,13 +480,12 @@ define amdgpu_ps void @flat_xchg_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset,
 ; GFX1250-GISEL:       ; %bb.0:
 ; GFX1250-GISEL-NEXT:    v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2
 ; GFX1250-GISEL-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
-; GFX1250-GISEL-NEXT:    s_mov_b32 s0, src_flat_scratch_base_hi
+; GFX1250-GISEL-NEXT:    s_mov_b32 s0, exec_lo
 ; GFX1250-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1250-GISEL-NEXT:    v_add_co_u32 v2, vcc_lo, v2, v0
 ; GFX1250-GISEL-NEXT:    v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo
-; GFX1250-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX1250-GISEL-NEXT:    v_xor_b32_e32 v1, s0, v3
-; GFX1250-GISEL-NEXT:    s_mov_b32 s0, exec_lo
+; GFX1250-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-GISEL-NEXT:    v_xor_b32_e32 v1, src_flat_scratch_base_hi, v3
 ; GFX1250-GISEL-NEXT:    v_cmpx_le_u32_e32 0x4000000, v1
 ; GFX1250-GISEL-NEXT:    s_xor_b32 s0, exec_lo, s0
 ; GFX1250-GISEL-NEXT:    s_cbranch_execnz .LBB12_3
@@ -515,9 +504,8 @@ define amdgpu_ps void @flat_xchg_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset,
 ; GFX1250-GISEL-NEXT:    s_and_not1_saveexec_b32 s0, s0
 ; GFX1250-GISEL-NEXT:    s_cbranch_execz .LBB12_2
 ; GFX1250-GISEL-NEXT:  .LBB12_4: ; %atomicrmw.private
-; GFX1250-GISEL-NEXT:    s_mov_b32 s0, src_flat_scratch_base_lo
 ; GFX1250-GISEL-NEXT:    v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3]
-; GFX1250-GISEL-NEXT:    v_subrev_nc_u32_e32 v0, s0, v2
+; GFX1250-GISEL-NEXT:    v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v2
 ; GFX1250-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1250-GISEL-NEXT:    v_cndmask_b32_e32 v0, -1, v0, vcc_lo
 ; GFX1250-GISEL-NEXT:    scratch_store_b64 v0, v[4:5], off scope:SCOPE_SE
@@ -537,11 +525,9 @@ define amdgpu_ps void @flat_xchg_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %v
 ; GFX1250-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1250-SDAG-NEXT:    v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1]
 ; GFX1250-SDAG-NEXT:    v_add_nc_u64_e32 v[0:1], s[0:1], v[0:1]
-; GFX1250-SDAG-NEXT:    s_mov_b32 s0, src_flat_scratch_base_hi
-; GFX1250-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
-; GFX1250-SDAG-NEXT:    v_xor_b32_e32 v4, s0, v1
 ; GFX1250-SDAG-NEXT:    s_mov_b32 s0, exec_lo
-; GFX1250-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-SDAG-NEXT:    v_xor_b32_e32 v4, src_flat_scratch_base_hi, v1
 ; GFX1250-SDAG-NEXT:    v_cmpx_lt_u32_e32 0x3ffffff, v4
 ; GFX1250-SDAG-NEXT:    s_xor_b32 s0, exec_lo, s0
 ; GFX1250-SDAG-NEXT:    s_cbranch_execnz .LBB13_3
@@ -560,9 +546,8 @@ define amdgpu_ps void @flat_xchg_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %v
 ; GFX1250-SDAG-NEXT:    s_and_not1_saveexec_b32 s0, s0
 ; GFX1250-SDAG-NEXT:    s_cbranch_execz .LBB13_2
 ; GFX1250-SDAG-NEXT:  .LBB13_4: ; %atomicrmw.private
-; GFX1250-SDAG-NEXT:    s_mov_b32 s0, src_flat_scratch_base_lo
 ; GFX1250-SDAG-NEXT:    v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1]
-; GFX1250-SDAG-NEXT:    v_subrev_nc_u32_e32 v4, s0, v0
+; GFX1250-SDAG-NEXT:    v_subrev_nc_u32_e32 v4, src_flat_scratch_base_lo, v0
 ; GFX1250-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1250-SDAG-NEXT:    v_cndmask_b32_e32 v0, -1, v4, vcc_lo
 ; GFX1250-SDAG-NEXT:    scratch_store_b64 v0, v[2:3], off scope:SCOPE_SE
@@ -572,16 +557,15 @@ define amdgpu_ps void @flat_xchg_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %v
 ; GFX1250-GISEL:       ; %bb.0:
 ; GFX1250-GISEL-NEXT:    v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2
 ; GFX1250-GISEL-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
-; GFX1250-GISEL-NEXT:    s_mov_b32 s0, src_flat_scratch_base_hi
+; GFX1250-GISEL-NEXT:    s_mov_b32 s0, exec_lo
 ; GFX1250-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1250-GISEL-NEXT:    v_add_co_u32 v1, vcc_lo, v2, v0
 ; GFX1250-GISEL-NEXT:    v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo
 ; GFX1250-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1250-GISEL-NEXT:    v_add_co_u32 v2, vcc_lo, 0xffffff80, v1
 ; GFX1250-GISEL-NEXT:    v_add_co_ci_u32_e64 v3, null, -1, v3, vcc_lo
-; GFX1250-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX1250-GISEL-NEXT:    v_xor_b32_e32 v1, s0, v3
-; GFX1250-GISEL-NEXT:    s_mov_b32 s0, exec_lo
+; GFX1250-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-GISEL-NEXT:    v_xor_b32_e32 v1, src_flat_scratch_base_hi, v3
 ; GFX1250-GISEL-NEXT:    v_cmpx_le_u32_e32 0x4000000, v1
 ; GFX1250-GISEL-NEXT:    s_xor_b32 s0, exec_lo, s0
 ; GFX1250-GISEL-NEXT:    s_cbranch_execnz .LBB13_3
@@ -600,9 +584,8 @@ define amdgpu_ps void @flat_xchg_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %v
 ; GFX1250-GISEL-NEXT:    s_and_not1_saveexec_b32 s0, s0
 ; GFX1250-GISEL-NEXT:    s_cbranch_execz .LBB13_2
 ; GFX1250-GISEL-NEXT:  .LBB13_4: ; %atomicrmw.private
-; GFX1250-GISEL-NEXT:    s_mov_b32 s0, src_flat_scratch_base_lo
 ; GFX1250-GISEL-NEXT:    v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3]
-; GFX1250-GISEL-NEXT:    v_subrev_nc_u32_e32 v0, s0, v2
+; GFX1250-GISEL-NEXT:    v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v2
 ; GFX1250-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1250-GISEL-NEXT:    v_cndmask_b32_e32 v0, -1, v0, vcc_lo
 ; GFX1250-GISEL-NEXT:    scratch_store_b64 v0, v[4:5], off scope:SCOPE_SE
@@ -680,11 +663,10 @@ define amdgpu_ps <2 x float> @flat_add_saddr_i64_rtn(ptr inreg %sbase, i32 %voff
 ; GFX1250-SDAG-LABEL: flat_add_saddr_i64_rtn:
 ; GFX1250-SDAG:       ; %bb.0:
 ; GFX1250-SDAG-NEXT:    v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1
-; GFX1250-SDAG-NEXT:    s_mov_b32 s0, src_flat_scratch_base_hi
 ; GFX1250-SDAG-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX1250-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1250-SDAG-NEXT:    v_add_nc_u64_e32 v[4:5], s[2:3], v[0:1]
-; GFX1250-SDAG-NEXT:    v_xor_b32_e32 v0, s0, v5
+; GFX1250-SDAG-NEXT:    v_xor_b32_e32 v0, src_flat_scratch_base_hi, v5
 ; GFX1250-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
 ; GFX1250-SDAG-NEXT:    v_cmp_lt_u32_e32 vcc_lo, 0x3ffffff, v0
 ; GFX1250-SDAG-NEXT:    ; implicit-def: $vgpr0_vgpr1
@@ -707,9 +689,8 @@ define amdgpu_ps <2 x float> @flat_add_saddr_i64_rtn(ptr inreg %sbase, i32 %voff
 ; GFX1250-SDAG-NEXT:    s_and_not1_saveexec_b32 s0, s0
 ; GFX1250-SDAG-NEXT:    s_cbranch_execz .LBB18_2
 ; GFX1250-SDAG-NEXT:  .LBB18_4: ; %atomicrmw.private
-; GFX1250-SDAG-NEXT:    s_mov_b32 s1, src_flat_scratch_base_lo
 ; GFX1250-SDAG-NEXT:    v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5]
-; GFX1250-SDAG-NEXT:    v_subrev_nc_u32_e32 v0, s1, v4
+; GFX1250-SDAG-NEXT:    v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v4
 ; GFX1250-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1250-SDAG-NEXT:    v_cndmask_b32_e32 v4, -1, v0, vcc_lo
 ; GFX1250-SDAG-NEXT:    scratch_load_b64 v[0:1], v4, off
@@ -725,12 +706,11 @@ define amdgpu_ps <2 x float> @flat_add_saddr_i64_rtn(ptr inreg %sbase, i32 %voff
 ; GFX1250-GISEL:       ; %bb.0:
 ; GFX1250-GISEL-NEXT:    v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v4, v1
 ; GFX1250-GISEL-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX1250-GISEL-NEXT:    s_mov_b32 s0, src_flat_scratch_base_hi
 ; GFX1250-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1250-GISEL-NEXT:    v_add_co_u32 v6, vcc_lo, v0, v3
 ; GFX1250-GISEL-NEXT:    v_add_co_ci_u32_e64 v7, null, 0, v1, vcc_lo
 ; GFX1250-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-GISEL-NEXT:    v_dual_mov_b32 v5, v2 :: v_dual_bitop2_b32 v0, s0, v7 bitop3:0x14
+; GFX1250-GISEL-NEXT:    v_dual_mov_b32 v5, v2 :: v_dual_bitop2_b32 v0, src_flat_scratch_base_hi, v7 bitop3:0x14
 ; GFX1250-GISEL-NEXT:    v_cmp_le_u32_e32 vcc_lo, 0x4000000, v0
 ; GFX1250-GISEL-NEXT:    ; implicit-def: $vgpr0_vgpr1
 ; GFX1250-GISEL-NEXT:    s_and_saveexec_b32 s0, vcc_lo
@@ -753,9 +733,8 @@ define amdgpu_ps <2 x float> @flat_add_saddr_i64_rtn(ptr inreg %sbase, i32 %voff
 ; GFX1250-GISEL-NEXT:    s_and_not1_saveexec_b32 s0, s0
 ; GFX1250-GISEL-NEXT:    s_cbranch_execz .LBB18_2
 ; GFX1250-GISEL-NEXT:  .LBB18_4: ; %atomicrmw.private
-; GFX1250-GISEL-NEXT:    s_mov_b32 s1, src_flat_scratch_base_lo
 ; GFX1250-GISEL-NEXT:    v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7]
-; GFX1250-GISEL-NEXT:    v_subrev_nc_u32_e32 v0, s1, v6
+; GFX1250-GISEL-NEXT:    v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v6
 ; GFX1250-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1250-GISEL-NEXT:    v_cndmask_b32_e32 v6, -1, v0, vcc_lo
 ; GFX1250-GISEL-NEXT:    scratch_load_b64 v[0:1], v6, off
@@ -782,13 +761,12 @@ define amdgpu_ps <2 x float> @flat_add_saddr_i64_rtn_neg128(ptr inreg %sbase, i3
 ; GFX1250-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1250-SDAG-NEXT:    v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1]
 ; GFX1250-SDAG-NEXT:    v_add_nc_u64_e32 v[4:5], s[0:1], v[0:1]
-; GFX1250-SDAG-NEXT:    s_mov_b32 s0, src_flat_scratch_base_hi
-; GFX1250-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
-; GFX1250-SDAG-NEXT:    v_xor_b32_e32 v0, s0, v5
-; GFX1250-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1250-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-SDAG-NEXT:    v_xor_b32_e32 v0, src_flat_scratch_base_hi, v5
 ; GFX1250-SDAG-NEXT:    v_cmp_lt_u32_e32 vcc_lo, 0x3ffffff, v0
 ; GFX1250-SDAG-NEXT:    ; implicit-def: $vgpr0_vgpr1
 ; GFX1250-SDAG-NEXT:    s_and_saveexec_b32 s0, vcc_lo
+; GFX1250-SDAG-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX1250-SDAG-NEXT:    s_xor_b32 s0, exec_lo, s0
 ; GFX1250-SDAG-NEXT:    s_cbranch_execnz .LBB19_3
 ; GFX1250-SDAG-NEXT:  ; %bb.1: ; %Flow
@@ -807,9 +785,8 @@ define amdgpu_ps <2 x float> @flat_add_saddr_i64_rtn_neg128(ptr inreg %sbase, i3
 ; GFX1250-SDAG-NEXT:    s_and_not1_saveexec_b32 s0, s0
 ; GFX1250-SDAG-NEXT:    s_cbranch_execz .LBB19_2
 ; GFX1250-SDAG-NEXT:  .LBB19_4: ; %atomicrmw.private
-; GFX1250-SDAG-NEXT:    s_mov_b32 s1, src_flat_scratch_base_lo
 ; GFX1250-SDAG-NEXT:    v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5]
-; GFX1250-SDAG-NEXT:    v_subrev_nc_u32_e32 v0, s1, v4
+; GFX1250-SDAG-NEXT:    v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v4
 ; GFX1250-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1250-SDAG-NEXT:    v_cndmask_b32_e32 v4, -1, v0, vcc_lo
 ; GFX1250-SDAG-NEXT:    scratch_load_b64 v[0:1], v4, off
@@ -825,7 +802,6 @@ define amdgpu_ps <2 x float> @flat_add_saddr_i64_rtn_neg128(ptr inreg %sbase, i3
 ; GFX1250-GISEL:       ; %bb.0:
 ; GFX1250-GISEL-NEXT:    v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v4, v1
 ; GFX1250-GISEL-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX1250-GISEL-NEXT:    s_mov_b32 s0, src_flat_scratch_base_hi
 ; GFX1250-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1250-GISEL-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v3
 ; GFX1250-GISEL-NEXT:    v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
@@ -833,7 +809,7 @@ define amdgpu_ps <2 x float> @flat_add_saddr_i64_rtn_neg128(ptr inreg %sbase, i3
 ; GFX1250-GISEL-NEXT:    v_add_co_u32 v6, vcc_lo, 0xffffff80, v0
 ; GFX1250-GISEL-NEXT:    v_add_co_ci_u32_e64 v7, null, -1, v1, vcc_lo
 ; GFX1250-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-GISEL-NEXT:    v_dual_mov_b32 v5, v2 :: v_dual_bitop2_b32 v0, s0, v7 bitop3:0x14
+; GFX1250-GISEL-NEXT:    v_dual_mov_b32 v5, v2 :: v_dual_bitop2_b32 v0, src_flat_scratch_base_hi, v7 bitop3:0x14
 ; GFX1250-GISEL-NEXT:    v_cmp_le_u32_e32 vcc_lo, 0x4000000, v0
 ; GFX1250-GISEL-NEXT:    ; implicit-def: $vgpr0_vgpr1
 ; GFX1250-GISEL-NEXT:    s_and_saveexec_b32 s0, vcc_lo
@@ -856,9 +832,8 @@ define amdgpu_ps <2 x float> @flat_add_saddr_i64_rtn_neg128(ptr inreg %sbase, i3
 ; GFX1250-GISEL-NEXT:    s_and_not1_saveexec_b32 s0, s0
 ; GFX1250-GISEL-NEXT:    s_cbranch_execz .LBB19_2
 ; GFX1250-GISEL-NEXT:  .LBB19_4: ; %atomicrmw.private
-; GFX1250-GISEL-NEXT:    s_mov_b32 s1, src_flat_scratch_base_lo
 ; GFX1250-GISEL-NEXT:    v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7]
-; GFX1250-GISEL-NEXT:    v_subrev_nc_u32_e32 v0, s1, v6
+; GFX1250-GISEL-NEXT:    v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v6
 ; GFX1250-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1250-GISEL-NEXT:    v_cndmask_b32_e32 v6, -1, v0, vcc_lo
 ; GFX1250-GISEL-NEXT:    scratch_load_b64 v[0:1], v6, off
@@ -882,11 +857,10 @@ define amdgpu_ps void @flat_add_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset,
 ; GFX1250-SDAG:       ; %bb.0:
 ; GFX1250-SDAG-NEXT:    v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1
 ; GFX1250-SDAG-NEXT:    v_mov_b32_e32 v1, 0
-; GFX1250-SDAG-NEXT:    s_mov_b32 s0, src_flat_scratch_base_hi
+; GFX1250-SDAG-NEXT:    s_mov_b32 s0, exec_lo
 ; GFX1250-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1250-SDAG-NEXT:    v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1]
-; GFX1250-SDAG-NEXT:    v_xor_b32_e32 v4, s0, v1
-; GFX1250-SDAG-NEXT:    s_mov_b32 s0, exec_lo
+; GFX1250-SDAG-NEXT:    v_xor_b32_e32 v4, src_flat_scratch_base_hi, v1
 ; GFX1250-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1250-SDAG-NEXT:    v_cmpx_lt_u32_e32 0x3ffffff, v4
 ; GFX1250-SDAG-NEXT:    s_xor_b32 s0, exec_lo, s0
@@ -906,9 +880,8 @@ define amdgpu_ps void @flat_add_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset,
 ; GFX1250-SDAG-NEXT:    s_and_not1_saveexec_b32 s0, s0
 ; GFX1250-SDAG-NEXT:    s_cbranch_execz .LBB20_2
 ; GFX1250-SDAG-NEXT:  .LBB20_4: ; %atomicrmw.private
-; GFX1250-SDAG-NEXT:    s_mov_b32 s0, src_flat_scratch_base_lo
 ; GFX1250-SDAG-NEXT:    v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1]
-; GFX1250-SDAG-NEXT:    v_subrev_nc_u32_e32 v4, s0, v0
+; GFX1250-SDAG-NEXT:    v_subrev_nc_u32_e32 v4, src_flat_scratch_base_lo, v0
 ; GFX1250-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1250-SDAG-NEXT:    v_cndmask_b32_e32 v4, -1, v4, vcc_lo
 ; GFX1250-SDAG-NEXT:    scratch_load_b64 v[0:1], v4, off
@@ -921,13 +894,12 @@ define amdgpu_ps void @flat_add_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset,
 ; GFX1250-GISEL:       ; %bb.0:
 ; GFX1250-GISEL-NEXT:    v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2
 ; GFX1250-GISEL-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
-; GFX1250-GISEL-NEXT:    s_mov_b32 s0, src_flat_scratch_base_hi
+; GFX1250-GISEL-NEXT:    s_mov_b32 s0, exec_lo
 ; GFX1250-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1250-GISEL-NEXT:    v_add_co_u32 v2, vcc_lo, v2, v0
 ; GFX1250-GISEL-NEXT:    v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo
-; GFX1250-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX1250-GISEL-NEXT:    v_xor_b32_e32 v1, s0, v3
-; GFX1250-GISEL-NEXT:    s_mov_b32 s0, exec_lo
+; GFX1250-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-GISEL-NEXT:    v_xor_b32_e32 v1, src_flat_scratch_base_hi, v3
 ; GFX1250-GISEL-NEXT:    v_cmpx_le_u32_e32 0x4000000, v1
 ; GFX1250-GISEL-NEXT:    s_xor_b32 s0, exec_lo, s0
 ; GFX1250-GISEL-NEXT:    s_cbranch_execnz .LBB20_3
@@ -946,9 +918,8 @@ define amdgpu_ps void @flat_add_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset,
 ; GFX1250-GISEL-NEXT:    s_and_not1_saveexec_b32 s0, s0
 ; GFX1250-GISEL-NEXT:    s_cbranch_execz .LBB20_2
 ; GFX1250-GISEL-NEXT:  .LBB20_4: ; %atomicrmw.private
-; GFX1250-GISEL-NEXT:    s_mov_b32 s0, src_flat_scratch_base_lo
 ; GFX1250-GISEL-NEXT:    v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3]
-; GFX1250-GISEL-NEXT:    v_subrev_nc_u32_e32 v0, s0, v2
+; GFX1250-GISEL-NEXT:    v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v2
 ; GFX1250-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1250-GISEL-NEXT:    v_cndmask_b32_e32 v2, -1, v0, vcc_lo
 ; GFX1250-GISEL-NEXT:    scratch_load_b64 v[0:1], v2, off
@@ -971,11 +942,9 @@ define amdgpu_ps void @flat_add_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo
 ; GFX1250-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1250-SDAG-NEXT:    v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1]
 ; GFX1250-SDAG-NEXT:    v_add_nc_u64_e32 v[0:1], s[0:1], v[0:1]
-; GFX1250-SDAG-NEXT:    s_mov_b32 s0, src_flat_scratch_base_hi
-; GFX1250-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
-; GFX1250-SDAG-NEXT:    v_xor_b32_e32 v4, s0, v1
 ; GFX1250-SDAG-NEXT:    s_mov_b32 s0, exec_lo
-; GFX1250-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-SDAG-NEXT:    v_xor_b32_e32 v4, src_flat_scratch_base_hi, v1
 ; GFX1250-SDAG-NEXT:    v_cmpx_lt_u32_e32 0x3ffffff, v4
 ; GFX1250-SDAG-NEXT:    s_xor_b32 s0, exec_lo, s0
 ; GFX1250-SDAG-NEXT:    s_cbranch_execnz .LBB21_3
@@ -994,9 +963,8 @@ define amdgpu_ps void @flat_add_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo
 ; GFX1250-SDAG-NEXT:    s_and_not1_saveexec_b32 s0, s0
 ; GFX1250-SDAG-NEXT:    s_cbranch_execz .LBB21_2
 ; GFX1250-SDAG-NEXT:  .LBB21_4: ; %atomicrmw.private
-; GFX1250-SDAG-NEXT:    s_mov_b32 s0, src_flat_scratch_base_lo
 ; GFX1250-SDAG-NEXT:    v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1]
-; GFX1250-SDAG-NEXT:    v_subrev_nc_u32_e32 v4, s0, v0
+; GFX1250-SDAG-NEXT:    v_subrev_nc_u32_e32 v4, src_flat_scratch_base_lo, v0
 ; GFX1250-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1250-SDAG-NEXT:    v_cndmask_b32_e32 v4, -1, v4, vcc_lo
 ; GFX1250-SDAG-NEXT:    scratch_load_b64 v[0:1], v4, off
@@ -1009,16 +977,15 @@ define amdgpu_ps void @flat_add_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo
 ; GFX1250-GISEL:       ; %bb.0:
 ; GFX1250-GISEL-NEXT:    v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2
 ; GFX1250-GISEL-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
-; GFX1250-GISEL-NEXT:    s_mov_b32 s0, src_flat_scratch_base_hi
+; GFX1250-GISEL-NEXT:    s_mov_b32 s0, exec_lo
 ; GFX1250-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1250-GISEL-NEXT:    v_add_co_u32 v1, vcc_lo, v2, v0
 ; GFX1250-GISEL-NEXT:    v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo
 ; GFX1250-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1250-GISEL-NEXT:    v_add_co_u32 v2, vcc_lo, 0xffffff80, v1
 ; GFX1250-GISEL-NEXT:    v_add_co_ci_u32_e64 v3, null, -1, v3, vcc_lo
-; GFX1250-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX1250-GISEL-NEXT:    v_xor_b32_e32 v1, s0, v3
-; GFX1250-GISEL-NEXT:    s_mov_b32 s0, exec_lo
+; GFX1250-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-GISEL-NEXT:    v_xor_b32_e32 v1, src_flat_scratch_base_hi, v3
 ; GFX1250-GISEL-NEXT:    v_cmpx_le_u32_e32 0x4000000, v1
 ; GFX1250-GISEL-NEXT:    s_xor_b32 s0, exec_lo, s0
 ; GFX1250-GISEL-NEXT:    s_cbranch_execnz .LBB21_3
@@ -1037,9 +1004,8 @@ define amdgpu_ps void @flat_add_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo
 ; GFX1250-GISEL-NEXT:    s_and_not1_saveexec_b32 s0, s0
 ; GFX1250-GISEL-NEXT:    s_cbranch_execz .LBB21_2
 ; GFX1250-GISEL-NEXT:  .LBB21_4: ; %atomicrmw.private
-; GFX1250-GISEL-NEXT:    s_mov_b32 s0, src_flat_scratch_base_lo
 ; GFX1250-GISEL-NEXT:    v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3]
-; GFX1250-GISEL-NEXT:    v_subrev_nc_u32_e32 v0, s0, v2
+; GFX1250-GISEL-NEXT:    v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v2
 ; GFX1250-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1250-GISEL-NEXT:    v_cndmask_b32_e32 v2, -1, v0, vcc_lo
 ; GFX1250-GISEL-NEXT:    scratch_load_b64 v[0:1], v2, off
@@ -1120,11 +1086,10 @@ define amdgpu_ps <2 x float> @flat_sub_saddr_i64_rtn(ptr inreg %sbase, i32 %voff
 ; GFX1250-SDAG-LABEL: flat_sub_saddr_i64_rtn:
 ; GFX1250-SDAG:       ; %bb.0:
 ; GFX1250-SDAG-NEXT:    v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1
-; GFX1250-SDAG-NEXT:    s_mov_b32 s0, src_flat_scratch_base_hi
 ; GFX1250-SDAG-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX1250-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1250-SDAG-NEXT:    v_add_nc_u64_e32 v[4:5], s[2:3], v[0:1]
-; GFX1250-SDAG-NEXT:    v_xor_b32_e32 v0, s0, v5
+; GFX1250-SDAG-NEXT:    v_xor_b32_e32 v0, src_flat_scratch_base_hi, v5
 ; GFX1250-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
 ; GFX1250-SDAG-NEXT:    v_cmp_lt_u32_e32 vcc_lo, 0x3ffffff, v0
 ; GFX1250-SDAG-NEXT:    ; implicit-def: $vgpr0_vgpr1
@@ -1147,9 +1112,8 @@ define amdgpu_ps <2 x float> @flat_sub_saddr_i64_rtn(ptr inreg %sbase, i32 %voff
 ; GFX1250-SDAG-NEXT:    s_and_not1_saveexec_b32 s0, s0
 ; GFX1250-SDAG-NEXT:    s_cbranch_execz .LBB26_2
 ; GFX1250-SDAG-NEXT:  .LBB26_4: ; %atomicrmw.private
-; GFX1250-SDAG-NEXT:    s_mov_b32 s1, src_flat_scratch_base_lo
 ; GFX1250-SDAG-NEXT:    v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5]
-; GFX1250-SDAG-NEXT:    v_subrev_nc_u32_e32 v0, s1, v4
+; GFX1250-SDAG-NEXT:    v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v4
 ; GFX1250-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1250-SDAG-NEXT:    v_cndmask_b32_e32 v4, -1, v0, vcc_lo
 ; GFX1250-SDAG-NEXT:    scratch_load_b64 v[0:1], v4, off
@@ -1165,12 +1129,11 @@ define amdgpu_ps <2 x float> @flat_sub_saddr_i64_rtn(ptr inreg %sbase, i32 %voff
 ; GFX1250-GISEL:       ; %bb.0:
 ; GFX1250-GISEL-NEXT:    v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v4, v1
 ; GFX1250-GISEL-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX1250-GISEL-NEXT:    s_mov_b32 s0, src_flat_scratch_base_hi
 ; GFX1250-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1250-GISEL-NEXT:    v_add_co_u32 v6, vcc_lo, v0, v3
 ; GFX1250-GISEL-NEXT:    v_add_co_ci_u32_e64 v7, null, 0, v1, vcc_lo
 ; GFX1250-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-GISEL-NEXT:    v_dual_mov_b32 v5, v2 :: v_dual_bitop2_b32 v0, s0, v7 bitop3:0x14
+; GFX1250-GISEL-NEXT:    v_dual_mov_b32 v5, v2 :: v_dual_bitop2_b32 v0, src_flat_scratch_base_hi, v7 bitop3:0x14
 ; GFX1250-GISEL-NEXT:    v_cmp_le_u32_e32 vcc_lo, 0x4000000, v0
 ; GFX1250-GISEL-NEXT:    ; implicit-def: $vgpr0_vgpr1
 ; GFX1250-GISEL-NEXT:    s_and_saveexec_b32 s0, vcc_lo
@@ -1193,9 +1156,8 @@ define amdgpu_ps <2 x float> @flat_sub_saddr_i64_rtn(ptr inreg %sbase, i32 %voff
 ; GFX1250-GISEL-NEXT:    s_and_not1_saveexec_b32 s0, s0
 ; GFX1250-GISEL-NEXT:    s_cbranch_execz .LBB26_2
 ; GFX1250-GISEL-NEXT:  .LBB26_4: ; %atomicrmw.private
-; GFX1250-GISEL-NEXT:    s_mov_b32 s1, src_flat_scratch_base_lo
 ; GFX1250-GISEL-NEXT:    v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7]
-; GFX1250-GISEL-NEXT:    v_subrev_nc_u32_e32 v0, s1, v6
+; GFX1250-GISEL-NEXT:    v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v6
 ; GFX1250-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1250-GISEL-NEXT:    v_cndmask_b32_e32 v6, -1, v0, vcc_lo
 ; GFX1250-GISEL-NEXT:    scratch_load_b64 v[0:1], v6, off
@@ -1222,13 +1184,12 @@ define amdgpu_ps <2 x float> @flat_sub_saddr_i64_rtn_neg128(ptr inreg %sbase, i3
 ; GFX1250-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1250-SDAG-NEXT:    v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1]
 ; GFX1250-SDAG-NEXT:    v_add_nc_u64_e32 v[4:5], s[0:1], v[0:1]
-; GFX1250-SDAG-NEXT:    s_mov_b32 s0, src_flat_scratch_base_hi
-; GFX1250-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
-; GFX1250-SDAG-NEXT:    v_xor_b32_e32 v0, s0, v5
-; GFX1250-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1250-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-SDAG-NEXT:    v_xor_b32_e32 v0, src_flat_scratch_base_hi, v5
 ; GFX1250-SDAG-NEXT:    v_cmp_lt_u32_e32 vcc_lo, 0x3ffffff, v0
 ; GFX1250-SDAG-NEXT:    ; implicit-def: $vgpr0_vgpr1
 ; GFX1250-SDAG-NEXT:    s_and_saveexec_b32 s0, vcc_lo
+; GFX1250-SDAG-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX1250-SDAG-NEXT:    s_xor_b32 s0, exec_lo, s0
 ; GFX1250-SDAG-NEXT:    s_cbranch_execnz .LBB27_3
 ; GFX1250-SDAG-NEXT:  ; %bb.1: ; %Flow
@@ -1247,9 +1208,8 @@ define amdgpu_ps <2 x float> @flat_sub_saddr_i64_rtn_neg128(ptr inreg %sbase, i3
 ; GFX1250-SDAG-NEXT:    s_and_not1_saveexec_b32 s0, s0
 ; GFX1250-SDAG-NEXT:    s_cbranch_execz .LBB27_2
 ; GFX1250-SDAG-NEXT:  .LBB27_4: ; %atomicrmw.private
-; GFX1250-SDAG-NEXT:    s_mov_b32 s1, src_flat_scratch_base_lo
 ; GFX1250-SDAG-NEXT:    v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5]
-; GFX1250-SDAG-NEXT:    v_subrev_nc_u32_e32 v0, s1, v4
+; GFX1250-SDAG-NEXT:    v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v4
 ; GFX1250-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1250-SDAG-NEXT:    v_cndmask_b32_e32 v4, -1, v0, vcc_lo
 ; GFX1250-SDAG-NEXT:    scratch_load_b64 v[0:1], v4, off
@@ -1265,7 +1225,6 @@ define amdgpu_ps <2 x float> @flat_sub_saddr_i64_rtn_neg128(ptr inreg %sbase, i3
 ; GFX1250-GISEL:       ; %bb.0:
 ; GFX1250-GISEL-NEXT:    v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v4, v1
 ; GFX1250-GISEL-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX1250-GISEL-NEXT:    s_mov_b32 s0, src_flat_scratch_base_hi
 ; GFX1250-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1250-GISEL-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v3
 ; GFX1250-GISEL-NEXT:    v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
@@ -1273,7 +1232,7 @@ define amdgpu_ps <2 x float> @flat_sub_saddr_i64_rtn_neg128(ptr inreg %sbase, i3
 ; GFX1250-GISEL-NEXT:    v_add_co_u32 v6, vcc_lo, 0xffffff80, v0
 ; GFX1250-GISEL-NEXT:    v_add_co_ci_u32_e64 v7, null, -1, v1, vcc_lo
 ; GFX1250-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-GISEL-NEXT:    v_dual_mov_b32 v5, v2 :: v_dual_bitop2_b32 v0, s0, v7 bitop3:0x14
+; GFX1250-GISEL-NEXT:    v_dual_mov_b32 v5, v2 :: v_dual_bitop2_b32 v0, src_flat_scratch_base_hi, v7 bitop3:0x14
 ; GFX1250-GISEL-NEXT:    v_cmp_le_u32_e32 vcc_lo, 0x4000000, v0
 ; GFX1250-GISEL-NEXT:    ; implicit-def: $vgpr0_vgpr1
 ; GFX1250-GISEL-NEXT:    s_and_saveexec_b32 s0, vcc_lo
@@ -1296,9 +1255,8 @@ define amdgpu_ps <2 x float> @flat_sub_saddr_i64_rtn_neg128(ptr inreg %sbase, i3
 ; GFX1250-GISEL-NEXT:    s_and_not1_saveexec_b32 s0, s0
 ; GFX1250-GISEL-NEXT:    s_cbranch_execz .LBB27_2
 ; GFX1250-GISEL-NEXT:  .LBB27_4: ; %atomicrmw.private
-; GFX1250-GISEL-NEXT:    s_mov_b32 s1, src_flat_scratch_base_lo
 ; GFX1250-GISEL-NEXT:    v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7]
-; GFX1250-GISEL-NEXT:    v_subrev_nc_u32_e32 v0, s1, v6
+; GFX1250-GISEL-NEXT:    v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v6
 ; GFX1250-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1250-GISEL-NEXT:    v_cndmask_b32_e32 v6, -1, v0, vcc_lo
 ; GFX1250-GISEL-NEXT:    scratch_load_b64 v[0:1], v6, off
@@ -1322,11 +1280,10 @@ define amdgpu_ps void @flat_sub_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset,
 ; GFX1250-SDAG:       ; %bb.0:
 ; GFX1250-SDAG-NEXT:    v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1
 ; GFX1250-SDAG-NEXT:    v_mov_b32_e32 v1, 0
-; GFX1250-SDAG-NEXT:    s_mov_b32 s0, src_flat_scratch_base_hi
+; GFX1250-SDAG-NEXT:    s_mov_b32 s0, exec_lo
 ; GFX1250-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1250-SDAG-NEXT:    v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1]
-; GFX1250-SDAG-NEXT:    v_xor_b32_e32 v4, s0, v1
-; GFX1250-SDAG-NEXT:    s_mov_b32 s0, exec_lo
+; GFX1250-SDAG-NEXT:    v_xor_b32_e32 v4, src_flat_scratch_base_hi, v1
 ; GFX1250-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1250-SDAG-NEXT:    v_cmpx_lt_u32_e32 0x3ffffff, v4
 ; GFX1250-SDAG-NEXT:    s_xor_b32 s0, exec_lo, s0
@@ -1346,9 +1303,8 @@ define amdgpu_ps void @flat_sub_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset,
 ; GFX1250-SDAG-NEXT:    s_and_not1_saveexec_b32 s0, s0
 ; GFX1250-SDAG-NEXT:    s_cbranch_execz .LBB28_2
 ; GFX1250-SDAG-NEXT:  .LBB28_4: ; %atomicrmw.private
-; GFX1250-SDAG-NEXT:    s_mov_b32 s0, src_flat_scratch_base_lo
 ; GFX1250-SDAG-NEXT:    v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1]
-; GFX1250-SDAG-NEXT:    v_subrev_nc_u32_e32 v4, s0, v0
+; GFX1250-SDAG-NEXT:    v_subrev_nc_u32_e32 v4, src_flat_scratch_base_lo, v0
 ; GFX1250-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1250-SDAG-NEXT:    v_cndmask_b32_e32 v4, -1, v4, vcc_lo
 ; GFX1250-SDAG-NEXT:    scratch_load_b64 v[0:1], v4, off
@@ -1361,13 +1317,12 @@ define amdgpu_ps void @flat_sub_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset,
 ; GFX1250-GISEL:       ; %bb.0:
 ; GFX1250-GISEL-NEXT:    v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2
 ; GFX1250-GISEL-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
-; GFX1250-GISEL-NEXT:    s_mov_b32 s0, src_flat_scratch_base_hi
+; GFX1250-GISEL-NEXT:    s_mov_b32 s0, exec_lo
 ; GFX1250-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1250-GISEL-NEXT:    v_add_co_u32 v2, vcc_lo, v2, v0
 ; GFX1250-GISEL-NEXT:    v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo
-; GFX1250-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX1250-GISEL-NEXT:    v_xor_b32_e32 v1, s0, v3
-; GFX1250-GISEL-NEXT:    s_mov_b32 s0, exec_lo
+; GFX1250-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-GISEL-NEXT:    v_xor_b32_e32 v1, src_flat_scratch_base_hi, v3
 ; GFX1250-GISEL-NEXT:    v_cmpx_le_u32_e32 0x4000000, v1
 ; GFX1250-GISEL-NEXT:    s_xor_b32 s0, exec_lo, s0
 ; GFX1250-GISEL-NEXT:    s_cbranch_execnz .LBB28_3
@@ -1386,9 +1341,8 @@ define amdgpu_ps void @flat_sub_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset,
 ; GFX1250-GISEL-NEXT:    s_and_not1_saveexec_b32 s0, s0
 ; GFX1250-GISEL-NEXT:    s_cbranch_execz .LBB28_2
 ; GFX1250-GISEL-NEXT:  .LBB28_4: ; %atomicrmw.private
-; GFX1250-GISEL-NEXT:    s_mov_b32 s0, src_flat_scratch_base_lo
 ; GFX1250-GISEL-NEXT:    v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3]
-; GFX1250-GISEL-NEXT:    v_subrev_nc_u32_e32 v0, s0, v2
+; GFX1250-GISEL-NEXT:    v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v2
 ; GFX1250-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1250-GISEL-NEXT:    v_cndmask_b32_e32 v2, -1, v0, vcc_lo
 ; GFX1250-GISEL-NEXT:    scratch_load_b64 v[0:1], v2, off
@@ -1411,11 +1365,9 @@ define amdgpu_ps void @flat_sub_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo
 ; GFX1250-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1250-SDAG-NEXT:    v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1]
 ; GFX1250-SDAG-NEXT:    v_add_nc_u64_e32 v[0:1], s[0:1], v[0:1]
-; GFX1250-SDAG-NEXT:    s_mov_b32 s0, src_flat_scratch_base_hi
-; GFX1250-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
-; GFX1250-SDAG-NEXT:    v_xor_b32_e32 v4, s0, v1
 ; GFX1250-SDAG-NEXT:    s_mov_b32 s0, exec_lo
-; GFX1250-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-SDAG-NEXT:    v_xor_b32_e32 v4, src_flat_scratch_base_hi, v1
 ; GFX1250-SDAG-NEXT:    v_cmpx_lt_u32_e32 0x3ffffff, v4
 ; GFX1250-SDAG-NEXT:    s_xor_b32 s0, exec_lo, s0
 ; GFX1250-SDAG-NEXT:    s_cbranch_execnz .LBB29_3
@@ -1434,9 +1386,8 @@ define amdgpu_ps void @flat_sub_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo
 ; GFX1250-SDAG-NEXT:    s_and_not1_saveexec_b32 s0, s0
 ; GFX1250-SDAG-NEXT:    s_cbranch_execz .LBB29_2
 ; GFX1250-SDAG-NEXT:  .LBB29_4: ; %atomicrmw.private
-; GFX1250-SDAG-NEXT:    s_mov_b32 s0, src_flat_scratch_base_lo
 ; GFX1250-SDAG-NEXT:    v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1]
-; GFX1250-SDAG-NEXT:    v_subrev_nc_u32_e32 v4, s0, v0
+; GFX1250-SDAG-NEXT:    v_subrev_nc_u32_e32 v4, src_flat_scratch_base_lo, v0
 ; GFX1250-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1250-SDAG-NEXT:    v_cndmask_b32_e32 v4, -1, v4, vcc_lo
 ; GFX1250-SDAG-NEXT:    scratch_load_b64 v[0:1], v4, off
@@ -1449,16 +1400,15 @@ define amdgpu_ps void @flat_sub_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo
 ; GFX1250-GISEL:       ; %bb.0:
 ; GFX1250-GISEL-NEXT:    v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2
 ; GFX1250-GISEL-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
-; GFX1250-GISEL-NEXT:    s_mov_b32 s0, src_flat_scratch_base_hi
+; GFX1250-GISEL-NEXT:    s_mov_b32 s0, exec_lo
 ; GFX1250-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1250-GISEL-NEXT:    v_add_co_u32 v1, vcc_lo, v2, v0
 ; GFX1250-GISEL-NEXT:    v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo
 ; GFX1250-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1250-GISEL-NEXT:    v_add_co_u32 v2, vcc_lo, 0xffffff80, v1
 ; GFX1250-GISEL-NEXT:    v_add_co_ci_u32_e64 v3, null, -1, v3, vcc_lo
-; GFX1250-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX1250-GISEL-NEXT:    v_xor_b32_e32 v1, s0, v3
-; GFX1250-GISEL-NEXT:    s_mov_b32 s0, exec_lo
+; GFX1250-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-GISEL-NEXT:    v_xor_b32_e32 v1, src_flat_scratch_base_hi, v3
 ; GFX1250-GISEL-NEXT:    v_cmpx_le_u32_e32 0x4000000, v1
 ; GFX1250-GISEL-NEXT:    s_xor_b32 s0, exec_lo, s0
 ; GFX1250-GISEL-NEXT:    s_cbranch_execnz .LBB29_3
@@ -1477,9 +1427,8 @@ define amdgpu_ps void @flat_sub_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo
 ; GFX1250-GISEL-NEXT:    s_and_not1_saveexec_b32 s0, s0
 ; GFX1250-GISEL-NEXT:    s_cbranch_execz .LBB29_2
 ; GFX1250-GISEL-NEXT:  .LBB29_4: ; %atomicrmw.private
-; GFX1250-GISEL-NEXT:    s_mov_b32 s0, src_flat_scratch_base_lo
 ; GFX1250-GISEL-NEXT:    v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3]
-; GFX1250-GISEL-NEXT:    v_subrev_nc_u32_e32 v0, s0, v2
+; GFX1250-GISEL-NEXT:    v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v2
 ; GFX1250-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1250-GISEL-NEXT:    v_cndmask_b32_e32 v2, -1, v0, vcc_lo
 ; GFX1250-GISEL-NEXT:    scratch_load_b64 v[0:1], v2, off
@@ -1560,11 +1509,10 @@ define amdgpu_ps <2 x float> @flat_and_saddr_i64_rtn(ptr inreg %sbase, i32 %voff
 ; GFX1250-SDAG-LABEL: flat_and_saddr_i64_rtn:
 ; GFX1250-SDAG:       ; %bb.0:
 ; GFX1250-SDAG-NEXT:    v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1
-; GFX1250-SDAG-NEXT:    s_mov_b32 s0, src_flat_scratch_base_hi
 ; GFX1250-SDAG-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX1250-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1250-SDAG-NEXT:    v_add_nc_u64_e32 v[4:5], s[2:3], v[0:1]
-; GFX1250-SDAG-NEXT:    v_xor_b32_e32 v0, s0, v5
+; GFX1250-SDAG-NEXT:    v_xor_b32_e32 v0, src_flat_scratch_base_hi, v5
 ; GFX1250-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
 ; GFX1250-SDAG-NEXT:    v_cmp_lt_u32_e32 vcc_lo, 0x3ffffff, v0
 ; GFX1250-SDAG-NEXT:    ; implicit-def: $vgpr0_vgpr1
@@ -1587,9 +1535,8 @@ define amdgpu_ps <2 x float> @flat_and_saddr_i64_rtn(ptr inreg %sbase, i32 %voff
 ; GFX1250-SDAG-NEXT:    s_and_not1_saveexec_b32 s0, s0
 ; GFX1250-SDAG-NEXT:    s_cbranch_execz .LBB34_2
 ; GFX1250-SDAG-NEXT:  .LBB34_4: ; %atomicrmw.private
-; GFX1250-SDAG-NEXT:    s_mov_b32 s1, src_flat_scratch_base_lo
 ; GFX1250-SDAG-NEXT:    v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5]
-; GFX1250-SDAG-NEXT:    v_subrev_nc_u32_e32 v0, s1, v4
+; GFX1250-SDAG-NEXT:    v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v4
 ; GFX1250-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1250-SDAG-NEXT:    v_cndmask_b32_e32 v4, -1, v0, vcc_lo
 ; GFX1250-SDAG-NEXT:    scratch_load_b64 v[0:1], v4, off
@@ -1606,12 +1553,11 @@ define amdgpu_ps <2 x float> @flat_and_saddr_i64_rtn(ptr inreg %sbase, i32 %voff
 ; GFX1250-GISEL:       ; %bb.0:
 ; GFX1250-GISEL-NEXT:    v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v4, v1
 ; GFX1250-GISEL-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX1250-GISEL-NEXT:    s_mov_b32 s0, src_flat_scratch_base_hi
 ; GFX1250-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1250-GISEL-NEXT:    v_add_co_u32 v6, vcc_lo, v0, v3
 ; GFX1250-GISEL-NEXT:    v_add_co_ci_u32_e64 v7, null, 0, v1, vcc_lo
 ; GFX1250-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-GISEL-NEXT:    v_dual_mov_b32 v5, v2 :: v_dual_bitop2_b32 v0, s0, v7 bitop3:0x14
+; GFX1250-GISEL-NEXT:    v_dual_mov_b32 v5, v2 :: v_dual_bitop2_b32 v0, src_flat_scratch_base_hi, v7 bitop3:0x14
 ; GFX1250-GISEL-NEXT:    v_cmp_le_u32_e32 vcc_lo, 0x4000000, v0
 ; GFX1250-GISEL-NEXT:    ; implicit-def: $vgpr0_vgpr1
 ; GFX1250-GISEL-NEXT:    s_and_saveexec_b32 s0, vcc_lo
@@ -1634,9 +1580,8 @@ define amdgpu_ps <2 x float> @flat_and_saddr_i64_rtn(ptr inreg %sbase, i32 %voff
 ; GFX1250-GISEL-NEXT:    s_and_not1_saveexec_b32 s0, s0
 ; GFX1250-GISEL-NEXT:    s_cbranch_execz .LBB34_2
 ; GFX1250-GISEL-NEXT:  .LBB34_4: ; %atomicrmw.private
-; GFX1250-GISEL-NEXT:    s_mov_b32 s1, src_flat_scratch_base_lo
 ; GFX1250-GISEL-NEXT:    v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7]
-; GFX1250-GISEL-NEXT:    v_subrev_nc_u32_e32 v0, s1, v6
+; GFX1250-GISEL-NEXT:    v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v6
 ; GFX1250-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1250-GISEL-NEXT:    v_cndmask_b32_e32 v6, -1, v0, vcc_lo
 ; GFX1250-GISEL-NEXT:    scratch_load_b64 v[0:1], v6, off
@@ -1664,13 +1609,12 @@ define amdgpu_ps <2 x float> @flat_and_saddr_i64_rtn_neg128(ptr inreg %sbase, i3
 ; GFX1250-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1250-SDAG-NEXT:    v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1]
 ; GFX1250-SDAG-NEXT:    v_add_nc_u64_e32 v[4:5], s[0:1], v[0:1]
-; GFX1250-SDAG-NEXT:    s_mov_b32 s0, src_flat_scratch_base_hi
-; GFX1250-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
-; GFX1250-SDAG-NEXT:    v_xor_b32_e32 v0, s0, v5
-; GFX1250-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1250-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-SDAG-NEXT:    v_xor_b32_e32 v0, src_flat_scratch_base_hi, v5
 ; GFX1250-SDAG-NEXT:    v_cmp_lt_u32_e32 vcc_lo, 0x3ffffff, v0
 ; GFX1250-SDAG-NEXT:    ; implicit-def: $vgpr0_vgpr1
 ; GFX1250-SDAG-NEXT:    s_and_saveexec_b32 s0, vcc_lo
+; GFX1250-SDAG-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX1250-SDAG-NEXT:    s_xor_b32 s0, exec_lo, s0
 ; GFX1250-SDAG-NEXT:    s_cbranch_execnz .LBB35_3
 ; GFX1250-SDAG-NEXT:  ; %bb.1: ; %Flow
@@ -1689,9 +1633,8 @@ define amdgpu_ps <2 x float> @flat_and_saddr_i64_rtn_neg128(ptr inreg %sbase, i3
 ; GFX1250-SDAG-NEXT:    s_and_not1_saveexec_b32 s0, s0
 ; GFX1250-SDAG-NEXT:    s_cbranch_execz .LBB35_2
 ; GFX1250-SDAG-NEXT:  .LBB35_4: ; %atomicrmw.private
-; GFX1250-SDAG-NEXT:    s_mov_b32 s1, src_flat_scratch_base_lo
 ; GFX1250-SDAG-NEXT:    v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5]
-; GFX1250-SDAG-NEXT:    v_subrev_nc_u32_e32 v0, s1, v4
+; GFX1250-SDAG-NEXT:    v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v4
 ; GFX1250-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1250-SDAG-NEXT:    v_cndmask_b32_e32 v4, -1, v0, vcc_lo
 ; GFX1250-SDAG-NEXT:    scratch_load_b64 v[0:1], v4, off
@@ -1708,7 +1651,6 @@ define amdgpu_ps <2 x float> @flat_and_saddr_i64_rtn_neg128(ptr inreg %sbase, i3
 ; GFX1250-GISEL:       ; %bb.0:
 ; GFX1250-GISEL-NEXT:    v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v4, v1
 ; GFX1250-GISEL-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX1250-GISEL-NEXT:    s_mov_b32 s0, src_flat_scratch_base_hi
 ; GFX1250-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1250-GISEL-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v3
 ; GFX1250-GISEL-NEXT:    v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
@@ -1716,7 +1658,7 @@ define amdgpu_ps <2 x float> @flat_and_saddr_i64_rtn_neg128(ptr inreg %sbase, i3
 ; GFX1250-GISEL-NEXT:    v_add_co_u32 v6, vcc_lo, 0xffffff80, v0
 ; GFX1250-GISEL-NEXT:    v_add_co_ci_u32_e64 v7, null, -1, v1, vcc_lo
 ; GFX1250-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-GISEL-NEXT:    v_dual_mov_b32 v5, v2 :: v_dual_bitop2_b32 v0, s0, v7 bitop3:0x14
+; GFX1250-GISEL-NEXT:    v_dual_mov_b32 v5, v2 :: v_dual_bitop2_b32 v0, src_flat_scratch_base_hi, v7 bitop3:0x14
 ; GFX1250-GISEL-NEXT:    v_cmp_le_u32_e32 vcc_lo, 0x4000000, v0
 ; GFX1250-GISEL-NEXT:    ; implicit-def: $vgpr0_vgpr1
 ; GFX1250-GISEL-NEXT:    s_and_saveexec_b32 s0, vcc_lo
@@ -1739,9 +1681,8 @@ define amdgpu_ps <2 x float> @flat_and_saddr_i64_rtn_neg128(ptr inreg %sbase, i3
 ; GFX1250-GISEL-NEXT:    s_and_not1_saveexec_b32 s0, s0
 ; GFX1250-GISEL-NEXT:    s_cbranch_execz .LBB35_2
 ; GFX1250-GISEL-NEXT:  .LBB35_4: ; %atomicrmw.private
-; GFX1250-GISEL-NEXT:    s_mov_b32 s1, src_flat_scratch_base_lo
 ; GFX1250-GISEL-NEXT:    v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7]
-; GFX1250-GISEL-NEXT:    v_subrev_nc_u32_e32 v0, s1, v6
+; GFX1250-GISEL-NEXT:    v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v6
 ; GFX1250-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1250-GISEL-NEXT:    v_cndmask_b32_e32 v6, -1, v0, vcc_lo
 ; GFX1250-GISEL-NEXT:    scratch_load_b64 v[0:1], v6, off
@@ -1766,11 +1707,10 @@ define amdgpu_ps void @flat_and_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset,
 ; GFX1250-SDAG:       ; %bb.0:
 ; GFX1250-SDAG-NEXT:    v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1
 ; GFX1250-SDAG-NEXT:    v_mov_b32_e32 v1, 0
-; GFX1250-SDAG-NEXT:    s_mov_b32 s0, src_flat_scratch_base_hi
+; GFX1250-SDAG-NEXT:    s_mov_b32 s0, exec_lo
 ; GFX1250-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1250-SDAG-NEXT:    v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1]
-; GFX1250-SDAG-NEXT:    v_xor_b32_e32 v4, s0, v1
-; GFX1250-SDAG-NEXT:    s_mov_b32 s0, exec_lo
+; GFX1250-SDAG-NEXT:    v_xor_b32_e32 v4, src_flat_scratch_base_hi, v1
 ; GFX1250-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1250-SDAG-NEXT:    v_cmpx_lt_u32_e32 0x3ffffff, v4
 ; GFX1250-SDAG-NEXT:    s_xor_b32 s0, exec_lo, s0
@@ -1790,9 +1730,8 @@ define amdgpu_ps void @flat_and_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset,
 ; GFX1250-SDAG-NEXT:    s_and_not1_saveexec_b32 s0, s0
 ; GFX1250-SDAG-NEXT:    s_cbranch_execz .LBB36_2
 ; GFX1250-SDAG-NEXT:  .LBB36_4: ; %atomicrmw.private
-; GFX1250-SDAG-NEXT:    s_mov_b32 s0, src_flat_scratch_base_lo
 ; GFX1250-SDAG-NEXT:    v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1]
-; GFX1250-SDAG-NEXT:    v_subrev_nc_u32_e32 v4, s0, v0
+; GFX1250-SDAG-NEXT:    v_subrev_nc_u32_e32 v4, src_flat_scratch_base_lo, v0
 ; GFX1250-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1250-SDAG-NEXT:    v_cndmask_b32_e32 v4, -1, v4, vcc_lo
 ; GFX1250-SDAG-NEXT:    scratch_load_b64 v[0:1], v4, off
@@ -1806,13 +1745,12 @@ define amdgpu_ps void @flat_and_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset,
 ; GFX1250-GISEL:       ; %bb.0:
 ; GFX1250-GISEL-NEXT:    v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2
 ; GFX1250-GISEL-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
-; GFX1250-GISEL-NEXT:    s_mov_b32 s0, src_flat_scratch_base_hi
+; GFX1250-GISEL-NEXT:    s_mov_b32 s0, exec_lo
 ; GFX1250-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1250-GISEL-NEXT:    v_add_co_u32 v2, vcc_lo, v2, v0
 ; GFX1250-GISEL-NEXT:    v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo
-; GFX1250-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX1250-GISEL-NEXT:    v_xor_b32_e32 v1, s0, v3
-; GFX1250-GISEL-NEXT:    s_mov_b32 s0, exec_lo
+; GFX1250-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-GISEL-NEXT:    v_xor_b32_e32 v1, src_flat_scratch_base_hi, v3
 ; GFX1250-GISEL-NEXT:    v_cmpx_le_u32_e32 0x4000000, v1
 ; GFX1250-GISEL-NEXT:    s_xor_b32 s0, exec_lo, s0
 ; GFX1250-GISEL-NEXT:    s_cbranch_execnz .LBB36_3
@@ -1831,9 +1769,8 @@ define amdgpu_ps void @flat_and_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset,
 ; GFX1250-GISEL-NEXT:    s_and_not1_saveexec_b32 s0, s0
 ; GFX1250-GISEL-NEXT:    s_cbranch_execz .LBB36_2
 ; GFX1250-GISEL-NEXT:  .LBB36_4: ; %atomicrmw.private
-; GFX1250-GISEL-NEXT:    s_mov_b32 s0, src_flat_scratch_base_lo
 ; GFX1250-GISEL-NEXT:    v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3]
-; GFX1250-GISEL-NEXT:    v_subrev_nc_u32_e32 v0, s0, v2
+; GFX1250-GISEL-NEXT:    v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v2
 ; GFX1250-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1250-GISEL-NEXT:    v_cndmask_b32_e32 v2, -1, v0, vcc_lo
 ; GFX1250-GISEL-NEXT:    scratch_load_b64 v[0:1], v2, off
@@ -1857,11 +1794,9 @@ define amdgpu_ps void @flat_and_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo
 ; GFX1250-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1250-SDAG-NEXT:    v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1]
 ; GFX1250-SDAG-NEXT:    v_add_nc_u64_e32 v[0:1], s[0:1], v[0:1]
-; GFX1250-SDAG-NEXT:    s_mov_b32 s0, src_flat_scratch_base_hi
-; GFX1250-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
-; GFX1250-SDAG-NEXT:    v_xor_b32_e32 v4, s0, v1
 ; GFX1250-SDAG-NEXT:    s_mov_b32 s0, exec_lo
-; GFX1250-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-SDAG-NEXT:    v_xor_b32_e32 v4, src_flat_scratch_base_hi, v1
 ; GFX1250-SDAG-NEXT:    v_cmpx_lt_u32_e32 0x3ffffff, v4
 ; GFX1250-SDAG-NEXT:    s_xor_b32 s0, exec_lo, s0
 ; GFX1250-SDAG-NEXT:    s_cbranch_execnz .LBB37_3
@@ -1880,9 +1815,8 @@ define amdgpu_ps void @flat_and_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo
 ; GFX1250-SDAG-NEXT:    s_and_not1_saveexec_b32 s0, s0
 ; GFX1250-SDAG-NEXT:    s_cbranch_execz .LBB37_2
 ; GFX1250-SDAG-NEXT:  .LBB37_4: ; %atomicrmw.private
-; GFX1250-SDAG-NEXT:    s_mov_b32 s0, src_flat_scratch_base_lo
 ; GFX1250-SDAG-NEXT:    v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1]
-; GFX1250-SDAG-NEXT:    v_subrev_nc_u32_e32 v4, s0, v0
+; GFX1250-SDAG-NEXT:    v_subrev_nc_u32_e32 v4, src_flat_scratch_base_lo, v0
 ; GFX1250-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1250-SDAG-NEXT:    v_cndmask_b32_e32 v4, -1, v4, vcc_lo
 ; GFX1250-SDAG-NEXT:    scratch_load_b64 v[0:1], v4, off
@@ -1896,16 +1830,15 @@ define amdgpu_ps void @flat_and_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo
 ; GFX1250-GISEL:       ; %bb.0:
 ; GFX1250-GISEL-NEXT:    v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2
 ; GFX1250-GISEL-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
-; GFX1250-GISEL-NEXT:    s_mov_b32 s0, src_flat_scratch_base_hi
+; GFX1250-GISEL-NEXT:    s_mov_b32 s0, exec_lo
 ; GFX1250-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1250-GISEL-NEXT:    v_add_co_u32 v1, vcc_lo, v2, v0
 ; GFX1250-GISEL-NEXT:    v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo
 ; GFX1250-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1250-GISEL-NEXT:    v_add_co_u32 v2, vcc_lo, 0xffffff80, v1
 ; GFX1250-GISEL-NEXT:    v_add_co_ci_u32_e64 v3, null, -1, v3, vcc_lo
-; GFX1250-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX1250-GISEL-NEXT:    v_xor_b32_e32 v1, s0, v3
-; GFX1250-GISEL-NEXT:    s_mov_b32 s0, exec_lo
+; GFX1250-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-GISEL-NEXT:    v_xor_b32_e32 v1, src_flat_scratch_base_hi, v3
 ; GFX1250-GISEL-NEXT:    v_cmpx_le_u32_e32 0x4000000, v1
 ; GFX1250-GISEL-NEXT:    s_xor_b32 s0, exec_lo, s0
 ; GFX1250-GISEL-NEXT:    s_cbranch_execnz .LBB37_3
@@ -1924,9 +1857,8 @@ define amdgpu_ps void @flat_and_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo
 ; GFX1250-GISEL-NEXT:    s_and_not1_saveexec_b32 s0, s0
 ; GFX1250-GISEL-NEXT:    s_cbranch_execz .LBB37_2
 ; GFX1250-GISEL-NEXT:  .LBB37_4: ; %atomicrmw.private
-; GFX1250-GISEL-NEXT:    s_mov_b32 s0, src_flat_scratch_base_lo
 ; GFX1250-GISEL-NEXT:    v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3]
-; GFX1250-GISEL-NEXT:    v_subrev_nc_u32_e32 v0, s0, v2
+; GFX1250-GISEL-NEXT:    v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v2
 ; GFX1250-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1250-GISEL-NEXT:    v_cndmask_b32_e32 v2, -1, v0, vcc_lo
 ; GFX1250-GISEL-NEXT:    scratch_load_b64 v[0:1], v2, off
@@ -2008,11 +1940,10 @@ define amdgpu_ps <2 x float> @flat_or_saddr_i64_rtn(ptr inreg %sbase, i32 %voffs
 ; GFX1250-SDAG-LABEL: flat_or_saddr_i64_rtn:
 ; GFX1250-SDAG:       ; %bb.0:
 ; GFX1250-SDAG-NEXT:    v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1
-; GFX1250-SDAG-NEXT:    s_mov_b32 s0, src_flat_scratch_base_hi
 ; GFX1250-SDAG-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX1250-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1250-SDAG-NEXT:    v_add_nc_u64_e32 v[4:5], s[2:3], v[0:1]
-; GFX1250-SDAG-NEXT:    v_xor_b32_e32 v0, s0, v5
+; GFX1250-SDAG-NEXT:    v_xor_b32_e32 v0, src_flat_scratch_base_hi, v5
 ; GFX1250-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
 ; GFX1250-SDAG-NEXT:    v_cmp_lt_u32_e32 vcc_lo, 0x3ffffff, v0
 ; GFX1250-SDAG-NEXT:    ; implicit-def: $vgpr0_vgpr1
@@ -2035,9 +1966,8 @@ define amdgpu_ps <2 x float> @flat_or_saddr_i64_rtn(ptr inreg %sbase, i32 %voffs
 ; GFX1250-SDAG-NEXT:    s_and_not1_saveexec_b32 s0, s0
 ; GFX1250-SDAG-NEXT:    s_cbranch_execz .LBB42_2
 ; GFX1250-SDAG-NEXT:  .LBB42_4: ; %atomicrmw.private
-; GFX1250-SDAG-NEXT:    s_mov_b32 s1, src_flat_scratch_base_lo
 ; GFX1250-SDAG-NEXT:    v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5]
-; GFX1250-SDAG-NEXT:    v_subrev_nc_u32_e32 v0, s1, v4
+; GFX1250-SDAG-NEXT:    v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v4
 ; GFX1250-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1250-SDAG-NEXT:    v_cndmask_b32_e32 v4, -1, v0, vcc_lo
 ; GFX1250-SDAG-NEXT:    scratch_load_b64 v[0:1], v4, off
@@ -2054,12 +1984,11 @@ define amdgpu_ps <2 x float> @flat_or_saddr_i64_rtn(ptr inreg %sbase, i32 %voffs
 ; GFX1250-GISEL:       ; %bb.0:
 ; GFX1250-GISEL-NEXT:    v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v4, v1
 ; GFX1250-GISEL-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX1250-GISEL-NEXT:    s_mov_b32 s0, src_flat_scratch_base_hi
 ; GFX1250-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1250-GISEL-NEXT:    v_add_co_u32 v6, vcc_lo, v0, v3
 ; GFX1250-GISEL-NEXT:    v_add_co_ci_u32_e64 v7, null, 0, v1, vcc_lo
 ; GFX1250-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-GISEL-NEXT:    v_dual_mov_b32 v5, v2 :: v_dual_bitop2_b32 v0, s0, v7 bitop3:0x14
+; GFX1250-GISEL-NEXT:    v_dual_mov_b32 v5, v2 :: v_dual_bitop2_b32 v0, src_flat_scratch_base_hi, v7 bitop3:0x14
 ; GFX1250-GISEL-NEXT:    v_cmp_le_u32_e32 vcc_lo, 0x4000000, v0
 ; GFX1250-GISEL-NEXT:    ; implicit-def: $vgpr0_vgpr1
 ; GFX1250-GISEL-NEXT:    s_and_saveexec_b32 s0, vcc_lo
@@ -2082,9 +2011,8 @@ define amdgpu_ps <2 x float> @flat_or_saddr_i64_rtn(ptr inreg %sbase, i32 %voffs
 ; GFX1250-GISEL-NEXT:    s_and_not1_saveexec_b32 s0, s0
 ; GFX1250-GISEL-NEXT:    s_cbranch_execz .LBB42_2
 ; GFX1250-GISEL-NEXT:  .LBB42_4: ; %atomicrmw.private
-; GFX1250-GISEL-NEXT:    s_mov_b32 s1, src_flat_scratch_base_lo
 ; GFX1250-GISEL-NEXT:    v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7]
-; GFX1250-GISEL-NEXT:    v_subrev_nc_u32_e32 v0, s1, v6
+; GFX1250-GISEL-NEXT:    v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v6
 ; GFX1250-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1250-GISEL-NEXT:    v_cndmask_b32_e32 v6, -1, v0, vcc_lo
 ; GFX1250-GISEL-NEXT:    scratch_load_b64 v[0:1], v6, off
@@ -2112,13 +2040,12 @@ define amdgpu_ps <2 x float> @flat_or_saddr_i64_rtn_neg128(ptr inreg %sbase, i32
 ; GFX1250-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1250-SDAG-NEXT:    v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1]
 ; GFX1250-SDAG-NEXT:    v_add_nc_u64_e32 v[4:5], s[0:1], v[0:1]
-; GFX1250-SDAG-NEXT:    s_mov_b32 s0, src_flat_scratch_base_hi
-; GFX1250-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
-; GFX1250-SDAG-NEXT:    v_xor_b32_e32 v0, s0, v5
-; GFX1250-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1250-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-SDAG-NEXT:    v_xor_b32_e32 v0, src_flat_scratch_base_hi, v5
 ; GFX1250-SDAG-NEXT:    v_cmp_lt_u32_e32 vcc_lo, 0x3ffffff, v0
 ; GFX1250-SDAG-NEXT:    ; implicit-def: $vgpr0_vgpr1
 ; GFX1250-SDAG-NEXT:    s_and_saveexec_b32 s0, vcc_lo
+; GFX1250-SDAG-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX1250-SDAG-NEXT:    s_xor_b32 s0, exec_lo, s0
 ; GFX1250-SDAG-NEXT:    s_cbranch_execnz .LBB43_3
 ; GFX1250-SDAG-NEXT:  ; %bb.1: ; %Flow
@@ -2137,9 +2064,8 @@ define amdgpu_ps <2 x float> @flat_or_saddr_i64_rtn_neg128(ptr inreg %sbase, i32
 ; GFX1250-SDAG-NEXT:    s_and_not1_saveexec_b32 s0, s0
 ; GFX1250-SDAG-NEXT:    s_cbranch_execz .LBB43_2
 ; GFX1250-SDAG-NEXT:  .LBB43_4: ; %atomicrmw.private
-; GFX1250-SDAG-NEXT:    s_mov_b32 s1, src_flat_scratch_base_lo
 ; GFX1250-SDAG-NEXT:    v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5]
-; GFX1250-SDAG-NEXT:    v_subrev_nc_u32_e32 v0, s1, v4
+; GFX1250-SDAG-NEXT:    v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v4
 ; GFX1250-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1250-SDAG-NEXT:    v_cndmask_b32_e32 v4, -1, v0, vcc_lo
 ; GFX1250-SDAG-NEXT:    scratch_load_b64 v[0:1], v4, off
@@ -2156,7 +2082,6 @@ define amdgpu_ps <2 x float> @flat_or_saddr_i64_rtn_neg128(ptr inreg %sbase, i32
 ; GFX1250-GISEL:       ; %bb.0:
 ; GFX1250-GISEL-NEXT:    v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v4, v1
 ; GFX1250-GISEL-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX1250-GISEL-NEXT:    s_mov_b32 s0, src_flat_scratch_base_hi
 ; GFX1250-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1250-GISEL-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v3
 ; GFX1250-GISEL-NEXT:    v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
@@ -2164,7 +2089,7 @@ define amdgpu_ps <2 x float> @flat_or_saddr_i64_rtn_neg128(ptr inreg %sbase, i32
 ; GFX1250-GISEL-NEXT:    v_add_co_u32 v6, vcc_lo, 0xffffff80, v0
 ; GFX1250-GISEL-NEXT:    v_add_co_ci_u32_e64 v7, null, -1, v1, vcc_lo
 ; GFX1250-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-GISEL-NEXT:    v_dual_mov_b32 v5, v2 :: v_dual_bitop2_b32 v0, s0, v7 bitop3:0x14
+; GFX1250-GISEL-NEXT:    v_dual_mov_b32 v5, v2 :: v_dual_bitop2_b32 v0, src_flat_scratch_base_hi, v7 bitop3:0x14
 ; GFX1250-GISEL-NEXT:    v_cmp_le_u32_e32 vcc_lo, 0x4000000, v0
 ; GFX1250-GISEL-NEXT:    ; implicit-def: $vgpr0_vgpr1
 ; GFX1250-GISEL-NEXT:    s_and_saveexec_b32 s0, vcc_lo
@@ -2187,9 +2112,8 @@ define amdgpu_ps <2 x float> @flat_or_saddr_i64_rtn_neg128(ptr inreg %sbase, i32
 ; GFX1250-GISEL-NEXT:    s_and_not1_saveexec_b32 s0, s0
 ; GFX1250-GISEL-NEXT:    s_cbranch_execz .LBB43_2
 ; GFX1250-GISEL-NEXT:  .LBB43_4: ; %atomicrmw.private
-; GFX1250-GISEL-NEXT:    s_mov_b32 s1, src_flat_scratch_base_lo
 ; GFX1250-GISEL-NEXT:    v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7]
-; GFX1250-GISEL-NEXT:    v_subrev_nc_u32_e32 v0, s1, v6
+; GFX1250-GISEL-NEXT:    v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v6
 ; GFX1250-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1250-GISEL-NEXT:    v_cndmask_b32_e32 v6, -1, v0, vcc_lo
 ; GFX1250-GISEL-NEXT:    scratch_load_b64 v[0:1], v6, off
@@ -2214,11 +2138,10 @@ define amdgpu_ps void @flat_or_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, i
 ; GFX1250-SDAG:       ; %bb.0:
 ; GFX1250-SDAG-NEXT:    v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1
 ; GFX1250-SDAG-NEXT:    v_mov_b32_e32 v1, 0
-; GFX1250-SDAG-NEXT:    s_mov_b32 s0, src_flat_scratch_base_hi
+; GFX1250-SDAG-NEXT:    s_mov_b32 s0, exec_lo
 ; GFX1250-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1250-SDAG-NEXT:    v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1]
-; GFX1250-SDAG-NEXT:    v_xor_b32_e32 v4, s0, v1
-; GFX1250-SDAG-NEXT:    s_mov_b32 s0, exec_lo
+; GFX1250-SDAG-NEXT:    v_xor_b32_e32 v4, src_flat_scratch_base_hi, v1
 ; GFX1250-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1250-SDAG-NEXT:    v_cmpx_lt_u32_e32 0x3ffffff, v4
 ; GFX1250-SDAG-NEXT:    s_xor_b32 s0, exec_lo, s0
@@ -2238,9 +2161,8 @@ define amdgpu_ps void @flat_or_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, i
 ; GFX1250-SDAG-NEXT:    s_and_not1_saveexec_b32 s0, s0
 ; GFX1250-SDAG-NEXT:    s_cbranch_execz .LBB44_2
 ; GFX1250-SDAG-NEXT:  .LBB44_4: ; %atomicrmw.private
-; GFX1250-SDAG-NEXT:    s_mov_b32 s0, src_flat_scratch_base_lo
 ; GFX1250-SDAG-NEXT:    v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1]
-; GFX1250-SDAG-NEXT:    v_subrev_nc_u32_e32 v4, s0, v0
+; GFX1250-SDAG-NEXT:    v_subrev_nc_u32_e32 v4, src_flat_scratch_base_lo, v0
 ; GFX1250-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1250-SDAG-NEXT:    v_cndmask_b32_e32 v4, -1, v4, vcc_lo
 ; GFX1250-SDAG-NEXT:    scratch_load_b64 v[0:1], v4, off
@@ -2254,13 +2176,12 @@ define amdgpu_ps void @flat_or_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, i
 ; GFX1250-GISEL:       ; %bb.0:
 ; GFX1250-GISEL-NEXT:    v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2
 ; GFX1250-GISEL-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
-; GFX1250-GISEL-NEXT:    s_mov_b32 s0, src_flat_scratch_base_hi
+; GFX1250-GISEL-NEXT:    s_mov_b32 s0, exec_lo
 ; GFX1250-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1250-GISEL-NEXT:    v_add_co_u32 v2, vcc_lo, v2, v0
 ; GFX1250-GISEL-NEXT:    v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo
-; GFX1250-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX1250-GISEL-NEXT:    v_xor_b32_e32 v1, s0, v3
-; GFX1250-GISEL-NEXT:    s_mov_b32 s0, exec_lo
+; GFX1250-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-GISEL-NEXT:    v_xor_b32_e32 v1, src_flat_scratch_base_hi, v3
 ; GFX1250-GISEL-NEXT:    v_cmpx_le_u32_e32 0x4000000, v1
 ; GFX1250-GISEL-NEXT:    s_xor_b32 s0, exec_lo, s0
 ; GFX1250-GISEL-NEXT:    s_cbranch_execnz .LBB44_3
@@ -2279,9 +2200,8 @@ define amdgpu_ps void @flat_or_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, i
 ; GFX1250-GISEL-NEXT:    s_and_not1_saveexec_b32 s0, s0
 ; GFX1250-GISEL-NEXT:    s_cbranch_execz .LBB44_2
 ; GFX1250-GISEL-NEXT:  .LBB44_4: ; %atomicrmw.private
-; GFX1250-GISEL-NEXT:    s_mov_b32 s0, src_flat_scratch_base_lo
 ; GFX1250-GISEL-NEXT:    v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3]
-; GFX1250-GISEL-NEXT:    v_subrev_nc_u32_e32 v0, s0, v2
+; GFX1250-GISEL-NEXT:    v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v2
 ; GFX1250-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1250-GISEL-NEXT:    v_cndmask_b32_e32 v2, -1, v0, vcc_lo
 ; GFX1250-GISEL-NEXT:    scratch_load_b64 v[0:1], v2, off
@@ -2305,11 +2225,9 @@ define amdgpu_ps void @flat_or_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vof
 ; GFX1250-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1250-SDAG-NEXT:    v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1]
 ; GFX1250-SDAG-NEXT:    v_add_nc_u64_e32 v[0:1], s[0:1], v[0:1]
-; GFX1250-SDAG-NEXT:    s_mov_b32 s0, src_flat_scratch_base_hi
-; GFX1250-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
-; GFX1250-SDAG-NEXT:    v_xor_b32_e32 v4, s0, v1
 ; GFX1250-SDAG-NEXT:    s_mov_b32 s0, exec_lo
-; GFX1250-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-SDAG-NEXT:    v_xor_b32_e32 v4, src_flat_scratch_base_hi, v1
 ; GFX1250-SDAG-NEXT:    v_cmpx_lt_u32_e32 0x3ffffff, v4
 ; GFX1250-SDAG-NEXT:    s_xor_b32 s0, exec_lo, s0
 ; GFX1250-SDAG-NEXT:    s_cbranch_execnz .LBB45_3
@@ -2328,9 +2246,8 @@ define amdgpu_ps void @flat_or_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vof
 ; GFX1250-SDAG-NEXT:    s_and_not1_saveexec_b32 s0, s0
 ; GFX1250-SDAG-NEXT:    s_cbranch_execz .LBB45_2
 ; GFX1250-SDAG-NEXT:  .LBB45_4: ; %atomicrmw.private
-; GFX1250-SDAG-NEXT:    s_mov_b32 s0, src_flat_scratch_base_lo
 ; GFX1250-SDAG-NEXT:    v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1]
-; GFX1250-SDAG-NEXT:    v_subrev_nc_u32_e32 v4, s0, v0
+; GFX1250-SDAG-NEXT:    v_subrev_nc_u32_e32 v4, src_flat_scratch_base_lo, v0
 ; GFX1250-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1250-SDAG-NEXT:    v_cndmask_b32_e32 v4, -1, v4, vcc_lo
 ; GFX1250-SDAG-NEXT:    scratch_load_b64 v[0:1], v4, off
@@ -2344,16 +2261,15 @@ define amdgpu_ps void @flat_or_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vof
 ; GFX1250-GISEL:       ; %bb.0:
 ; GFX1250-GISEL-NEXT:    v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2
 ; GFX1250-GISEL-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
-; GFX1250-GISEL-NEXT:    s_mov_b32 s0, src_flat_scratch_base_hi
+; GFX1250-GISEL-NEXT:    s_mov_b32 s0, exec_lo
 ; GFX1250-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1250-GISEL-NEXT:    v_add_co_u32 v1, vcc_lo, v2, v0
 ; GFX1250-GISEL-NEXT:    v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo
 ; GFX1250-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1250-GISEL-NEXT:    v_add_co_u32 v2, vcc_lo, 0xffffff80, v1
 ; GFX1250-GISEL-NEXT:    v_add_co_ci_u32_e64 v3, null, -1, v3, vcc_lo
-; GFX1250-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX1250-GISEL-NEXT:    v_xor_b32_e32 v1, s0, v3
-; GFX1250-GISEL-NEXT:    s_mov_b32 s0, exec_lo
+; GFX1250-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-GISEL-NEXT:    v_xor_b32_e32 v1, src_flat_scratch_base_hi, v3
 ; GFX1250-GISEL-NEXT:    v_cmpx_le_u32_e32 0x4000000, v1
 ; GFX1250-GISEL-NEXT:    s_xor_b32 s0, exec_lo, s0
 ; GFX1250-GISEL-NEXT:    s_cbranch_execnz .LBB45_3
@@ -2372,9 +2288,8 @@ define amdgpu_ps void @flat_or_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vof
 ; GFX1250-GISEL-NEXT:    s_and_not1_saveexec_b32 s0, s0
 ; GFX1250-GISEL-NEXT:    s_cbranch_execz .LBB45_2
 ; GFX1250-GISEL-NEXT:  .LBB45_4: ; %atomicrmw.private
-; GFX1250-GISEL-NEXT:    s_mov_b32 s0, src_flat_scratch_base_lo
 ; GFX1250-GISEL-NEXT:    v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3]
-; GFX1250-GISEL-NEXT:    v_subrev_nc_u32_e32 v0, s0, v2
+; GFX1250-GISEL-NEXT:    v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v2
 ; GFX1250-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1250-GISEL-NEXT:    v_cndmask_b32_e32 v2, -1, v0, vcc_lo
 ; GFX1250-GISEL-NEXT:    scratch_load_b64 v[0:1], v2, off
@@ -2456,11 +2371,10 @@ define amdgpu_ps <2 x float> @flat_xor_saddr_i64_rtn(ptr inreg %sbase, i32 %voff
 ; GFX1250-SDAG-LABEL: flat_xor_saddr_i64_rtn:
 ; GFX1250-SDAG:       ; %bb.0:
 ; GFX1250-SDAG-NEXT:    v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1
-; GFX1250-SDAG-NEXT:    s_mov_b32 s0, src_flat_scratch_base_hi
 ; GFX1250-SDAG-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX1250-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1250-SDAG-NEXT:    v_add_nc_u64_e32 v[4:5], s[2:3], v[0:1]
-; GFX1250-SDAG-NEXT:    v_xor_b32_e32 v0, s0, v5
+; GFX1250-SDAG-NEXT:    v_xor_b32_e32 v0, src_flat_scratch_base_hi, v5
 ; GFX1250-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
 ; GFX1250-SDAG-NEXT:    v_cmp_lt_u32_e32 vcc_lo, 0x3ffffff, v0
 ; GFX1250-SDAG-NEXT:    ; implicit-def: $vgpr0_vgpr1
@@ -2483,9 +2397,8 @@ define amdgpu_ps <2 x float> @flat_xor_saddr_i64_rtn(ptr inreg %sbase, i32 %voff
 ; GFX1250-SDAG-NEXT:    s_and_not1_saveexec_b32 s0, s0
 ; GFX1250-SDAG-NEXT:    s_cbranch_execz .LBB50_2
 ; GFX1250-SDAG-NEXT:  .LBB50_4: ; %atomicrmw.private
-; GFX1250-SDAG-NEXT:    s_mov_b32 s1, src_flat_scratch_base_lo
 ; GFX1250-SDAG-NEXT:    v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5]
-; GFX1250-SDAG-NEXT:    v_subrev_nc_u32_e32 v0, s1, v4
+; GFX1250-SDAG-NEXT:    v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v4
 ; GFX1250-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1250-SDAG-NEXT:    v_cndmask_b32_e32 v4, -1, v0, vcc_lo
 ; GFX1250-SDAG-NEXT:    scratch_load_b64 v[0:1], v4, off
@@ -2502,12 +2415,11 @@ define amdgpu_ps <2 x float> @flat_xor_saddr_i64_rtn(ptr inreg %sbase, i32 %voff
 ; GFX1250-GISEL:       ; %bb.0:
 ; GFX1250-GISEL-NEXT:    v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v4, v1
 ; GFX1250-GISEL-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX1250-GISEL-NEXT:    s_mov_b32 s0, src_flat_scratch_base_hi
 ; GFX1250-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1250-GISEL-NEXT:    v_add_co_u32 v6, vcc_lo, v0, v3
 ; GFX1250-GISEL-NEXT:    v_add_co_ci_u32_e64 v7, null, 0, v1, vcc_lo
 ; GFX1250-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-GISEL-NEXT:    v_dual_mov_b32 v5, v2 :: v_dual_bitop2_b32 v0, s0, v7 bitop3:0x14
+; GFX1250-GISEL-NEXT:    v_dual_mov_b32 v5, v2 :: v_dual_bitop2_b32 v0, src_flat_scratch_base_hi, v7 bitop3:0x14
 ; GFX1250-GISEL-NEXT:    v_cmp_le_u32_e32 vcc_lo, 0x4000000, v0
 ; GFX1250-GISEL-NEXT:    ; implicit-def: $vgpr0_vgpr1
 ; GFX1250-GISEL-NEXT:    s_and_saveexec_b32 s0, vcc_lo
@@ -2530,9 +2442,8 @@ define amdgpu_ps <2 x float> @flat_xor_saddr_i64_rtn(ptr inreg %sbase, i32 %voff
 ; GFX1250-GISEL-NEXT:    s_and_not1_saveexec_b32 s0, s0
 ; GFX1250-GISEL-NEXT:    s_cbranch_execz .LBB50_2
 ; GFX1250-GISEL-NEXT:  .LBB50_4: ; %atomicrmw.private
-; GFX1250-GISEL-NEXT:    s_mov_b32 s1, src_flat_scratch_base_lo
 ; GFX1250-GISEL-NEXT:    v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7]
-; GFX1250-GISEL-NEXT:    v_subrev_nc_u32_e32 v0, s1, v6
+; GFX1250-GISEL-NEXT:    v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v6
 ; GFX1250-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1250-GISEL-NEXT:    v_cndmask_b32_e32 v6, -1, v0, vcc_lo
 ; GFX1250-GISEL-NEXT:    scratch_load_b64 v[0:1], v6, off
@@ -2560,13 +2471,12 @@ define amdgpu_ps <2 x float> @flat_xor_saddr_i64_rtn_neg128(ptr inreg %sbase, i3
 ; GFX1250-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1250-SDAG-NEXT:    v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1]
 ; GFX1250-SDAG-NEXT:    v_add_nc_u64_e32 v[4:5], s[0:1], v[0:1]
-; GFX1250-SDAG-NEXT:    s_mov_b32 s0, src_flat_scratch_base_hi
-; GFX1250-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
-; GFX1250-SDAG-NEXT:    v_xor_b32_e32 v0, s0, v5
-; GFX1250-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1250-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-SDAG-NEXT:    v_xor_b32_e32 v0, src_flat_scratch_base_hi, v5
 ; GFX1250-SDAG-NEXT:    v_cmp_lt_u32_e32 vcc_lo, 0x3ffffff, v0
 ; GFX1250-SDAG-NEXT:    ; implicit-def: $vgpr0_vgpr1
 ; GFX1250-SDAG-NEXT:    s_and_saveexec_b32 s0, vcc_lo
+; GFX1250-SDAG-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX1250-SDAG-NEXT:    s_xor_b32 s0, exec_lo, s0
 ; GFX1250-SDAG-NEXT:    s_cbranch_execnz .LBB51_3
 ; GFX1250-SDAG-NEXT:  ; %bb.1: ; %Flow
@@ -2585,9 +2495,8 @@ define amdgpu_ps <2 x float> @flat_xor_saddr_i64_rtn_neg128(ptr inreg %sbase, i3
 ; GFX1250-SDAG-NEXT:    s_and_not1_saveexec_b32 s0, s0
 ; GFX1250-SDAG-NEXT:    s_cbranch_execz .LBB51_2
 ; GFX1250-SDAG-NEXT:  .LBB51_4: ; %atomicrmw.private
-; GFX1250-SDAG-NEXT:    s_mov_b32 s1, src_flat_scratch_base_lo
 ; GFX1250-SDAG-NEXT:    v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5]
-; GFX1250-SDAG-NEXT:    v_subrev_nc_u32_e32 v0, s1, v4
+; GFX1250-SDAG-NEXT:    v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v4
 ; GFX1250-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1250-SDAG-NEXT:    v_cndmask_b32_e32 v4, -1, v0, vcc_lo
 ; GFX1250-SDAG-NEXT:    scratch_load_b64 v[0:1], v4, off
@@ -2604,7 +2513,6 @@ define amdgpu_ps <2 x float> @flat_xor_saddr_i64_rtn_neg128(ptr inreg %sbase, i3
 ; GFX1250-GISEL:       ; %bb.0:
 ; GFX1250-GISEL-NEXT:    v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v4, v1
 ; GFX1250-GISEL-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX1250-GISEL-NEXT:    s_mov_b32 s0, src_flat_scratch_base_hi
 ; GFX1250-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1250-GISEL-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v3
 ; GFX1250-GISEL-NEXT:    v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
@@ -2612,7 +2520,7 @@ define amdgpu_ps <2 x float> @flat_xor_saddr_i64_rtn_neg128(ptr inreg %sbase, i3
 ; GFX1250-GISEL-NEXT:    v_add_co_u32 v6, vcc_lo, 0xffffff80, v0
 ; GFX1250-GISEL-NEXT:    v_add_co_ci_u32_e64 v7, null, -1, v1, vcc_lo
 ; GFX1250-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-GISEL-NEXT:    v_dual_mov_b32 v5, v2 :: v_dual_bitop2_b32 v0, s0, v7 bitop3:0x14
+; GFX1250-GISEL-NEXT:    v_dual_mov_b32 v5, v2 :: v_dual_bitop2_b32 v0, src_flat_scratch_base_hi, v7 bitop3:0x14
 ; GFX1250-GISEL-NEXT:    v_cmp_le_u32_e32 vcc_lo, 0x4000000, v0
 ; GFX1250-GISEL-NEXT:    ; implicit-def: $vgpr0_vgpr1
 ; GFX1250-GISEL-NEXT:    s_and_saveexec_b32 s0, vcc_lo
@@ -2635,9 +2543,8 @@ define amdgpu_ps <2 x float> @flat_xor_saddr_i64_rtn_neg128(ptr inreg %sbase, i3
 ; GFX1250-GISEL-NEXT:    s_and_not1_saveexec_b32 s0, s0
 ; GFX1250-GISEL-NEXT:    s_cbranch_execz .LBB51_2
 ; GFX1250-GISEL-NEXT:  .LBB51_4: ; %atomicrmw.private
-; GFX1250-GISEL-NEXT:    s_mov_b32 s1, src_flat_scratch_base_lo
 ; GFX1250-GISEL-NEXT:    v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7]
-; GFX1250-GISEL-NEXT:    v_subrev_nc_u32_e32 v0, s1, v6
+; GFX1250-GISEL-NEXT:    v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v6
 ; GFX1250-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1250-GISEL-NEXT:    v_cndmask_b32_e32 v6, -1, v0, vcc_lo
 ; GFX1250-GISEL-NEXT:    scratch_load_b64 v[0:1], v6, off
@@ -2662,11 +2569,10 @@ define amdgpu_ps void @flat_xor_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset,
 ; GFX1250-SDAG:       ; %bb.0:
 ; GFX1250-SDAG-NEXT:    v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1
 ; GFX1250-SDAG-NEXT:    v_mov_b32_e32 v1, 0
-; GFX1250-SDAG-NEXT:    s_mov_b32 s0, src_flat_scratch_base_hi
+; GFX1250-SDAG-NEXT:    s_mov_b32 s0, exec_lo
 ; GFX1250-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1250-SDAG-NEXT:    v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1]
-; GFX1250-SDAG-NEXT:    v_xor_b32_e32 v4, s0, v1
-; GFX1250-SDAG-NEXT:    s_mov_b32 s0, exec_lo
+; GFX1250-SDAG-NEXT:    v_xor_b32_e32 v4, src_flat_scratch_base_hi, v1
 ; GFX1250-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1250-SDAG-NEXT:    v_cmpx_lt_u32_e32 0x3ffffff, v4
 ; GFX1250-SDAG-NEXT:    s_xor_b32 s0, exec_lo, s0
@@ -2686,9 +2592,8 @@ define amdgpu_ps void @flat_xor_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset,
 ; GFX1250-SDAG-NEXT:    s_and_not1_saveexec_b32 s0, s0
 ; GFX1250-SDAG-NEXT:    s_cbranch_execz .LBB52_2
 ; GFX1250-SDAG-NEXT:  .LBB52_4: ; %atomicrmw.private
-; GFX1250-SDAG-NEXT:    s_mov_b32 s0, src_flat_scratch_base_lo
 ; GFX1250-SDAG-NEXT:    v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1]
-; GFX1250-SDAG-NEXT:    v_subrev_nc_u32_e32 v4, s0, v0
+; GFX1250-SDAG-NEXT:    v_subrev_nc_u32_e32 v4, src_flat_scratch_base_lo, v0
 ; GFX1250-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1250-SDAG-NEXT:    v_cndmask_b32_e32 v4, -1, v4, vcc_lo
 ; GFX1250-SDAG-NEXT:    scratch_load_b64 v[0:1], v4, off
@@ -2702,13 +2607,12 @@ define amdgpu_ps void @flat_xor_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset,
 ; GFX1250-GISEL:       ; %bb.0:
 ; GFX1250-GISEL-NEXT:    v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2
 ; GFX1250-GISEL-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
-; GFX1250-GISEL-NEXT:    s_mov_b32 s0, src_flat_scratch_base_hi
+; GFX1250-GISEL-NEXT:    s_mov_b32 s0, exec_lo
 ; GFX1250-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1250-GISEL-NEXT:    v_add_co_u32 v2, vcc_lo, v2, v0
 ; GFX1250-GISEL-NEXT:    v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo
-; GFX1250-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX1250-GISEL-NEXT:    v_xor_b32_e32 v1, s0, v3
-; GFX1250-GISEL-NEXT:    s_mov_b32 s0, exec_lo
+; GFX1250-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-GISEL-NEXT:    v_xor_b32_e32 v1, src_flat_scratch_base_hi, v3
 ; GFX1250-GISEL-NEXT:    v_cmpx_le_u32_e32 0x4000000, v1
 ; GFX1250-GISEL-NEXT:    s_xor_b32 s0, exec_lo, s0
 ; GFX1250-GISEL-NEXT:    s_cbranch_execnz .LBB52_3
@@ -2727,9 +2631,8 @@ define amdgpu_ps void @flat_xor_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset,
 ; GFX1250-GISEL-NEXT:    s_and_not1_saveexec_b32 s0, s0
 ; GFX1250-GISEL-NEXT:    s_cbranch_execz .LBB52_2
 ; GFX1250-GISEL-NEXT:  .LBB52_4: ; %atomicrmw.private
-; GFX1250-GISEL-NEXT:    s_mov_b32 s0, src_flat_scratch_base_lo
 ; GFX1250-GISEL-NEXT:    v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3]
-; GFX1250-GISEL-NEXT:    v_subrev_nc_u32_e32 v0, s0, v2
+; GFX1250-GISEL-NEXT:    v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v2
 ; GFX1250-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1250-GISEL-NEXT:    v_cndmask_b32_e32 v2, -1, v0, vcc_lo
 ; GFX1250-GISEL-NEXT:    scratch_load_b64 v[0:1], v2, off
@@ -2753,11 +2656,9 @@ define amdgpu_ps void @flat_xor_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo
 ; GFX1250-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1250-SDAG-NEXT:    v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1]
 ; GFX1250-SDAG-NEXT:    v_add_nc_u64_e32 v[0:1], s[0:1], v[0:1]
-; GFX1250-SDAG-NEXT:    s_mov_b32 s0, src_flat_scratch_base_hi
-; GFX1250-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
-; GFX1250-SDAG-NEXT:    v_xor_b32_e32 v4, s0, v1
 ; GFX1250-SDAG-NEXT:    s_mov_b32 s0, exec_lo
-; GFX1250-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-SDAG-NEXT:    v_xor_b32_e32 v4, src_flat_scratch_base_hi, v1
 ; GFX1250-SDAG-NEXT:    v_cmpx_lt_u32_e32 0x3ffffff, v4
 ; GFX1250-SDAG-NEXT:    s_xor_b32 s0, exec_lo, s0
 ; GFX1250-SDAG-NEXT:    s_cbranch_execnz .LBB53_3
@@ -2776,9 +2677,8 @@ define amdgpu_ps void @flat_xor_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo
 ; GFX1250-SDAG-NEXT:    s_and_not1_saveexec_b32 s0, s0
 ; GFX1250-SDAG-NEXT:    s_cbranch_execz .LBB53_2
 ; GFX1250-SDAG-NEXT:  .LBB53_4: ; %atomicrmw.private
-; GFX1250-SDAG-NEXT:    s_mov_b32 s0, src_flat_scratch_base_lo
 ; GFX1250-SDAG-NEXT:    v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1]
-; GFX1250-SDAG-NEXT:    v_subrev_nc_u32_e32 v4, s0, v0
+; GFX1250-SDAG-NEXT:    v_subrev_nc_u32_e32 v4, src_flat_scratch_base_lo, v0
 ; GFX1250-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1250-SDAG-NEXT:    v_cndmask_b32_e32 v4, -1, v4, vcc_lo
 ; GFX1250-SDAG-NEXT:    scratch_load_b64 v[0:1], v4, off
@@ -2792,16 +2692,15 @@ define amdgpu_ps void @flat_xor_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo
 ; GFX1250-GISEL:       ; %bb.0:
 ; GFX1250-GISEL-NEXT:    v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2
 ; GFX1250-GISEL-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
-; GFX1250-GISEL-NEXT:    s_mov_b32 s0, src_flat_scratch_base_hi
+; GFX1250-GISEL-NEXT:    s_mov_b32 s0, exec_lo
 ; GFX1250-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1250-GISEL-NEXT:    v_add_co_u32 v1, vcc_lo, v2, v0
 ; GFX1250-GISEL-NEXT:    v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo
 ; GFX1250-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1250-GISEL-NEXT:    v_add_co_u32 v2, vcc_lo, 0xffffff80, v1
 ; GFX1250-GISEL-NEXT:    v_add_co_ci_u32_e64 v3, null, -1, v3, vcc_lo
-; GFX1250-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX1250-GISEL-NEXT:    v_xor_b32_e32 v1, s0, v3
-; GFX1250-GISEL-NEXT:    s_mov_b32 s0, exec_lo
+; GFX1250-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-GISEL-NEXT:    v_xor_b32_e32 v1, src_flat_scratch_base_hi, v3
 ; GFX1250-GISEL-NEXT:    v_cmpx_le_u32_e32 0x4000000, v1
 ; GFX1250-GISEL-NEXT:    s_xor_b32 s0, exec_lo, s0
 ; GFX1250-GISEL-NEXT:    s_cbranch_execnz .LBB53_3
@@ -2820,9 +2719,8 @@ define amdgpu_ps void @flat_xor_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo
 ; GFX1250-GISEL-NEXT:    s_and_not1_saveexec_b32 s0, s0
 ; GFX1250-GISEL-NEXT:    s_cbranch_execz .LBB53_2
 ; GFX1250-GISEL-NEXT:  .LBB53_4: ; %atomicrmw.private
-; GFX1250-GISEL-NEXT:    s_mov_b32 s0, src_flat_scratch_base_lo
 ; GFX1250-GISEL-NEXT:    v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3]
-; GFX1250-GISEL-NEXT:    v_subrev_nc_u32_e32 v0, s0, v2
+; GFX1250-GISEL-NEXT:    v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v2
 ; GFX1250-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1250-GISEL-NEXT:    v_cndmask_b32_e32 v2, -1, v0, vcc_lo
 ; GFX1250-GISEL-NEXT:    scratch_load_b64 v[0:1], v2, off
@@ -2898,11 +2796,10 @@ define amdgpu_ps <2 x float> @flat_max_saddr_i64_rtn(ptr inreg %sbase, i32 %voff
 ; GFX1250-SDAG-LABEL: flat_max_saddr_i64_rtn:
 ; GFX1250-SDAG:       ; %bb.0:
 ; GFX1250-SDAG-NEXT:    v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1
-; GFX1250-SDAG-NEXT:    s_mov_b32 s0, src_flat_scratch_base_hi
 ; GFX1250-SDAG-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX1250-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1250-SDAG-NEXT:    v_add_nc_u64_e32 v[4:5], s[2:3], v[0:1]
-; GFX1250-SDAG-NEXT:    v_xor_b32_e32 v0, s0, v5
+; GFX1250-SDAG-NEXT:    v_xor_b32_e32 v0, src_flat_scratch_base_hi, v5
 ; GFX1250-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
 ; GFX1250-SDAG-NEXT:    v_cmp_lt_u32_e32 vcc_lo, 0x3ffffff, v0
 ; GFX1250-SDAG-NEXT:    ; implicit-def: $vgpr0_vgpr1
@@ -2925,10 +2822,9 @@ define amdgpu_ps <2 x float> @flat_max_saddr_i64_rtn(ptr inreg %sbase, i32 %voff
 ; GFX1250-SDAG-NEXT:    s_and_not1_saveexec_b32 s0, s0
 ; GFX1250-SDAG-NEXT:    s_cbranch_execz .LBB58_2
 ; GFX1250-SDAG-NEXT:  .LBB58_4: ; %atomicrmw.private
-; GFX1250-SDAG-NEXT:    s_mov_b32 s1, src_flat_scratch_base_lo
 ; GFX1250-SDAG-NEXT:    v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5]
 ; GFX1250-SDAG-NEXT:    s_wait_loadcnt 0x0
-; GFX1250-SDAG-NEXT:    v_subrev_nc_u32_e32 v0, s1, v4
+; GFX1250-SDAG-NEXT:    v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v4
 ; GFX1250-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1250-SDAG-NEXT:    v_cndmask_b32_e32 v4, -1, v0, vcc_lo
 ; GFX1250-SDAG-NEXT:    scratch_load_b64 v[0:1], v4, off
@@ -2944,12 +2840,11 @@ define amdgpu_ps <2 x float> @flat_max_saddr_i64_rtn(ptr inreg %sbase, i32 %voff
 ; GFX1250-GISEL:       ; %bb.0:
 ; GFX1250-GISEL-NEXT:    v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v4, v1
 ; GFX1250-GISEL-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX1250-GISEL-NEXT:    s_mov_b32 s0, src_flat_scratch_base_hi
 ; GFX1250-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1250-GISEL-NEXT:    v_add_co_u32 v6, vcc_lo, v0, v3
 ; GFX1250-GISEL-NEXT:    v_add_co_ci_u32_e64 v7, null, 0, v1, vcc_lo
 ; GFX1250-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-GISEL-NEXT:    v_dual_mov_b32 v5, v2 :: v_dual_bitop2_b32 v0, s0, v7 bitop3:0x14
+; GFX1250-GISEL-NEXT:    v_dual_mov_b32 v5, v2 :: v_dual_bitop2_b32 v0, src_flat_scratch_base_hi, v7 bitop3:0x14
 ; GFX1250-GISEL-NEXT:    v_cmp_le_u32_e32 vcc_lo, 0x4000000, v0
 ; GFX1250-GISEL-NEXT:    ; implicit-def: $vgpr0_vgpr1
 ; GFX1250-GISEL-NEXT:    s_and_saveexec_b32 s0, vcc_lo
@@ -2972,10 +2867,9 @@ define amdgpu_ps <2 x float> @flat_max_saddr_i64_rtn(ptr inreg %sbase, i32 %voff
 ; GFX1250-GISEL-NEXT:    s_and_not1_saveexec_b32 s0, s0
 ; GFX1250-GISEL-NEXT:    s_cbranch_execz .LBB58_2
 ; GFX1250-GISEL-NEXT:  .LBB58_4: ; %atomicrmw.private
-; GFX1250-GISEL-NEXT:    s_mov_b32 s1, src_flat_scratch_base_lo
 ; GFX1250-GISEL-NEXT:    v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7]
 ; GFX1250-GISEL-NEXT:    s_wait_loadcnt 0x0
-; GFX1250-GISEL-NEXT:    v_subrev_nc_u32_e32 v0, s1, v6
+; GFX1250-GISEL-NEXT:    v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v6
 ; GFX1250-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1250-GISEL-NEXT:    v_cndmask_b32_e32 v6, -1, v0, vcc_lo
 ; GFX1250-GISEL-NEXT:    scratch_load_b64 v[0:1], v6, off
@@ -3002,13 +2896,12 @@ define amdgpu_ps <2 x float> @flat_max_saddr_i64_rtn_neg128(ptr inreg %sbase, i3
 ; GFX1250-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1250-SDAG-NEXT:    v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1]
 ; GFX1250-SDAG-NEXT:    v_add_nc_u64_e32 v[4:5], s[0:1], v[0:1]
-; GFX1250-SDAG-NEXT:    s_mov_b32 s0, src_flat_scratch_base_hi
-; GFX1250-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
-; GFX1250-SDAG-NEXT:    v_xor_b32_e32 v0, s0, v5
-; GFX1250-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1250-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-SDAG-NEXT:    v_xor_b32_e32 v0, src_flat_scratch_base_hi, v5
 ; GFX1250-SDAG-NEXT:    v_cmp_lt_u32_e32 vcc_lo, 0x3ffffff, v0
 ; GFX1250-SDAG-NEXT:    ; implicit-def: $vgpr0_vgpr1
 ; GFX1250-SDAG-NEXT:    s_and_saveexec_b32 s0, vcc_lo
+; GFX1250-SDAG-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX1250-SDAG-NEXT:    s_xor_b32 s0, exec_lo, s0
 ; GFX1250-SDAG-NEXT:    s_cbranch_execnz .LBB59_3
 ; GFX1250-SDAG-NEXT:  ; %bb.1: ; %Flow
@@ -3027,10 +2920,9 @@ define amdgpu_ps <2 x float> @flat_max_saddr_i64_rtn_neg128(ptr inreg %sbase, i3
 ; GFX1250-SDAG-NEXT:    s_and_not1_saveexec_b32 s0, s0
 ; GFX1250-SDAG-NEXT:    s_cbranch_execz .LBB59_2
 ; GFX1250-SDAG-NEXT:  .LBB59_4: ; %atomicrmw.private
-; GFX1250-SDAG-NEXT:    s_mov_b32 s1, src_flat_scratch_base_lo
 ; GFX1250-SDAG-NEXT:    v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5]
 ; GFX1250-SDAG-NEXT:    s_wait_loadcnt 0x0
-; GFX1250-SDAG-NEXT:    v_subrev_nc_u32_e32 v0, s1, v4
+; GFX1250-SDAG-NEXT:    v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v4
 ; GFX1250-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1250-SDAG-NEXT:    v_cndmask_b32_e32 v4, -1, v0, vcc_lo
 ; GFX1250-SDAG-NEXT:    scratch_load_b64 v[0:1], v4, off
@@ -3046,7 +2938,6 @@ define amdgpu_ps <2 x float> @flat_max_saddr_i64_rtn_neg128(ptr inreg %sbase, i3
 ; GFX1250-GISEL:       ; %bb.0:
 ; GFX1250-GISEL-NEXT:    v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v4, v1
 ; GFX1250-GISEL-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX1250-GISEL-NEXT:    s_mov_b32 s0, src_flat_scratch_base_hi
 ; GFX1250-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1250-GISEL-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v3
 ; GFX1250-GISEL-NEXT:    v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
@@ -3054,7 +2945,7 @@ define amdgpu_ps <2 x float> @flat_max_saddr_i64_rtn_neg128(ptr inreg %sbase, i3
 ; GFX1250-GISEL-NEXT:    v_add_co_u32 v6, vcc_lo, 0xffffff80, v0
 ; GFX1250-GISEL-NEXT:    v_add_co_ci_u32_e64 v7, null, -1, v1, vcc_lo
 ; GFX1250-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-GISEL-NEXT:    v_dual_mov_b32 v5, v2 :: v_dual_bitop2_b32 v0, s0, v7 bitop3:0x14
+; GFX1250-GISEL-NEXT:    v_dual_mov_b32 v5, v2 :: v_dual_bitop2_b32 v0, src_flat_scratch_base_hi, v7 bitop3:0x14
 ; GFX1250-GISEL-NEXT:    v_cmp_le_u32_e32 vcc_lo, 0x4000000, v0
 ; GFX1250-GISEL-NEXT:    ; implicit-def: $vgpr0_vgpr1
 ; GFX1250-GISEL-NEXT:    s_and_saveexec_b32 s0, vcc_lo
@@ -3077,10 +2968,9 @@ define amdgpu_ps <2 x float> @flat_max_saddr_i64_rtn_neg128(ptr inreg %sbase, i3
 ; GFX1250-GISEL-NEXT:    s_and_not1_saveexec_b32 s0, s0
 ; GFX1250-GISEL-NEXT:    s_cbranch_execz .LBB59_2
 ; GFX1250-GISEL-NEXT:  .LBB59_4: ; %atomicrmw.private
-; GFX1250-GISEL-NEXT:    s_mov_b32 s1, src_flat_scratch_base_lo
 ; GFX1250-GISEL-NEXT:    v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7]
 ; GFX1250-GISEL-NEXT:    s_wait_loadcnt 0x0
-; GFX1250-GISEL-NEXT:    v_subrev_nc_u32_e32 v0, s1, v6
+; GFX1250-GISEL-NEXT:    v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v6
 ; GFX1250-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1250-GISEL-NEXT:    v_cndmask_b32_e32 v6, -1, v0, vcc_lo
 ; GFX1250-GISEL-NEXT:    scratch_load_b64 v[0:1], v6, off
@@ -3104,11 +2994,10 @@ define amdgpu_ps void @flat_max_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset,
 ; GFX1250-SDAG:       ; %bb.0:
 ; GFX1250-SDAG-NEXT:    v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1
 ; GFX1250-SDAG-NEXT:    v_mov_b32_e32 v1, 0
-; GFX1250-SDAG-NEXT:    s_mov_b32 s0, src_flat_scratch_base_hi
+; GFX1250-SDAG-NEXT:    s_mov_b32 s0, exec_lo
 ; GFX1250-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1250-SDAG-NEXT:    v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1]
-; GFX1250-SDAG-NEXT:    v_xor_b32_e32 v4, s0, v1
-; GFX1250-SDAG-NEXT:    s_mov_b32 s0, exec_lo
+; GFX1250-SDAG-NEXT:    v_xor_b32_e32 v4, src_flat_scratch_base_hi, v1
 ; GFX1250-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1250-SDAG-NEXT:    v_cmpx_lt_u32_e32 0x3ffffff, v4
 ; GFX1250-SDAG-NEXT:    s_xor_b32 s0, exec_lo, s0
@@ -3127,9 +3016,8 @@ define amdgpu_ps void @flat_max_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset,
 ; GFX1250-SDAG-NEXT:    s_and_not1_saveexec_b32 s0, s0
 ; GFX1250-SDAG-NEXT:    s_cbranch_execz .LBB60_2
 ; GFX1250-SDAG-NEXT:  .LBB60_4: ; %atomicrmw.private
-; GFX1250-SDAG-NEXT:    s_mov_b32 s0, src_flat_scratch_base_lo
 ; GFX1250-SDAG-NEXT:    v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1]
-; GFX1250-SDAG-NEXT:    v_subrev_nc_u32_e32 v4, s0, v0
+; GFX1250-SDAG-NEXT:    v_subrev_nc_u32_e32 v4, src_flat_scratch_base_lo, v0
 ; GFX1250-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1250-SDAG-NEXT:    v_cndmask_b32_e32 v4, -1, v4, vcc_lo
 ; GFX1250-SDAG-NEXT:    scratch_load_b64 v[0:1], v4, off
@@ -3142,13 +3030,12 @@ define amdgpu_ps void @flat_max_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset,
 ; GFX1250-GISEL:       ; %bb.0:
 ; GFX1250-GISEL-NEXT:    v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2
 ; GFX1250-GISEL-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
-; GFX1250-GISEL-NEXT:    s_mov_b32 s0, src_flat_scratch_base_hi
+; GFX1250-GISEL-NEXT:    s_mov_b32 s0, exec_lo
 ; GFX1250-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1250-GISEL-NEXT:    v_add_co_u32 v2, vcc_lo, v2, v0
 ; GFX1250-GISEL-NEXT:    v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo
-; GFX1250-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX1250-GISEL-NEXT:    v_xor_b32_e32 v1, s0, v3
-; GFX1250-GISEL-NEXT:    s_mov_b32 s0, exec_lo
+; GFX1250-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-GISEL-NEXT:    v_xor_b32_e32 v1, src_flat_scratch_base_hi, v3
 ; GFX1250-GISEL-NEXT:    v_cmpx_le_u32_e32 0x4000000, v1
 ; GFX1250-GISEL-NEXT:    s_xor_b32 s0, exec_lo, s0
 ; GFX1250-GISEL-NEXT:    s_cbranch_execnz .LBB60_3
@@ -3166,9 +3053,8 @@ define amdgpu_ps void @flat_max_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset,
 ; GFX1250-GISEL-NEXT:    s_and_not1_saveexec_b32 s0, s0
 ; GFX1250-GISEL-NEXT:    s_cbranch_execz .LBB60_2
 ; GFX1250-GISEL-NEXT:  .LBB60_4: ; %atomicrmw.private
-; GFX1250-GISEL-NEXT:    s_mov_b32 s0, src_flat_scratch_base_lo
 ; GFX1250-GISEL-NEXT:    v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3]
-; GFX1250-GISEL-NEXT:    v_subrev_nc_u32_e32 v0, s0, v2
+; GFX1250-GISEL-NEXT:    v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v2
 ; GFX1250-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1250-GISEL-NEXT:    v_cndmask_b32_e32 v2, -1, v0, vcc_lo
 ; GFX1250-GISEL-NEXT:    scratch_load_b64 v[0:1], v2, off
@@ -3191,11 +3077,9 @@ define amdgpu_ps void @flat_max_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo
 ; GFX1250-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1250-SDAG-NEXT:    v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1]
 ; GFX1250-SDAG-NEXT:    v_add_nc_u64_e32 v[0:1], s[0:1], v[0:1]
-; GFX1250-SDAG-NEXT:    s_mov_b32 s0, src_flat_scratch_base_hi
-; GFX1250-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
-; GFX1250-SDAG-NEXT:    v_xor_b32_e32 v4, s0, v1
 ; GFX1250-SDAG-NEXT:    s_mov_b32 s0, exec_lo
-; GFX1250-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-SDAG-NEXT:    v_xor_b32_e32 v4, src_flat_scratch_base_hi, v1
 ; GFX1250-SDAG-NEXT:    v_cmpx_lt_u32_e32 0x3ffffff, v4
 ; GFX1250-SDAG-NEXT:    s_xor_b32 s0, exec_lo, s0
 ; GFX1250-SDAG-NEXT:    s_cbranch_execnz .LBB61_3
@@ -3213,9 +3097,8 @@ define amdgpu_ps void @flat_max_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo
 ; GFX1250-SDAG-NEXT:    s_and_not1_saveexec_b32 s0, s0
 ; GFX1250-SDAG-NEXT:    s_cbranch_execz .LBB61_2
 ; GFX1250-SDAG-NEXT:  .LBB61_4: ; %atomicrmw.private
-; GFX1250-SDAG-NEXT:    s_mov_b32 s0, src_flat_scratch_base_lo
 ; GFX1250-SDAG-NEXT:    v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1]
-; GFX1250-SDAG-NEXT:    v_subrev_nc_u32_e32 v4, s0, v0
+; GFX1250-SDAG-NEXT:    v_subrev_nc_u32_e32 v4, src_flat_scratch_base_lo, v0
 ; GFX1250-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1250-SDAG-NEXT:    v_cndmask_b32_e32 v4, -1, v4, vcc_lo
 ; GFX1250-SDAG-NEXT:    scratch_load_b64 v[0:1], v4, off
@@ -3228,16 +3111,15 @@ define amdgpu_ps void @flat_max_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo
 ; GFX1250-GISEL:       ; %bb.0:
 ; GFX1250-GISEL-NEXT:    v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2
 ; GFX1250-GISEL-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
-; GFX1250-GISEL-NEXT:    s_mov_b32 s0, src_flat_scratch_base_hi
+; GFX1250-GISEL-NEXT:    s_mov_b32 s0, exec_lo
 ; GFX1250-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1250-GISEL-NEXT:    v_add_co_u32 v1, vcc_lo, v2, v0
 ; GFX1250-GISEL-NEXT:    v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo
 ; GFX1250-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1250-GISEL-NEXT:    v_add_co_u32 v2, vcc_lo, 0xffffff80, v1
 ; GFX1250-GISEL-NEXT:    v_add_co_ci_u32_e64 v3, null, -1, v3, vcc_lo
-; GFX1250-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX1250-GISEL-NEXT:    v_xor_b32_e32 v1, s0, v3
-; GFX1250-GISEL-NEXT:    s_mov_b32 s0, exec_lo
+; GFX1250-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-GISEL-NEXT:    v_xor_b32_e32 v1, src_flat_scratch_base_hi, v3
 ; GFX1250-GISEL-NEXT:    v_cmpx_le_u32_e32 0x4000000, v1
 ; GFX1250-GISEL-NEXT:    s_xor_b32 s0, exec_lo, s0
 ; GFX1250-GISEL-NEXT:    s_cbranch_execnz .LBB61_3
@@ -3255,9 +3137,8 @@ define amdgpu_ps void @flat_max_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo
 ; GFX1250-GISEL-NEXT:    s_and_not1_saveexec_b32 s0, s0
 ; GFX1250-GISEL-NEXT:    s_cbranch_execz .LBB61_2
 ; GFX1250-GISEL-NEXT:  .LBB61_4: ; %atomicrmw.private
-; GFX1250-GISEL-NEXT:    s_mov_b32 s0, src_flat_scratch_base_lo
 ; GFX1250-GISEL-NEXT:    v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3]
-; GFX1250-GISEL-NEXT:    v_subrev_nc_u32_e32 v0, s0, v2
+; GFX1250-GISEL-NEXT:    v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v2
 ; GFX1250-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1250-GISEL-NEXT:    v_cndmask_b32_e32 v2, -1, v0, vcc_lo
 ; GFX1250-GISEL-NEXT:    scratch_load_b64 v[0:1], v2, off
@@ -3332,11 +3213,10 @@ define amdgpu_ps <2 x float> @flat_min_saddr_i64_rtn(ptr inreg %sbase, i32 %voff
 ; GFX1250-SDAG-LABEL: flat_min_saddr_i64_rtn:
 ; GFX1250-SDAG:       ; %bb.0:
 ; GFX1250-SDAG-NEXT:    v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1
-; GFX1250-SDAG-NEXT:    s_mov_b32 s0, src_flat_scratch_base_hi
 ; GFX1250-SDAG-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX1250-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1250-SDAG-NEXT:    v_add_nc_u64_e32 v[4:5], s[2:3], v[0:1]
-; GFX1250-SDAG-NEXT:    v_xor_b32_e32 v0, s0, v5
+; GFX1250-SDAG-NEXT:    v_xor_b32_e32 v0, src_flat_scratch_base_hi, v5
 ; GFX1250-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
 ; GFX1250-SDAG-NEXT:    v_cmp_lt_u32_e32 vcc_lo, 0x3ffffff, v0
 ; GFX1250-SDAG-NEXT:    ; implicit-def: $vgpr0_vgpr1
@@ -3359,10 +3239,9 @@ define amdgpu_ps <2 x float> @flat_min_saddr_i64_rtn(ptr inreg %sbase, i32 %voff
 ; GFX1250-SDAG-NEXT:    s_and_not1_saveexec_b32 s0, s0
 ; GFX1250-SDAG-NEXT:    s_cbranch_execz .LBB66_2
 ; GFX1250-SDAG-NEXT:  .LBB66_4: ; %atomicrmw.private
-; GFX1250-SDAG-NEXT:    s_mov_b32 s1, src_flat_scratch_base_lo
 ; GFX1250-SDAG-NEXT:    v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5]
 ; GFX1250-SDAG-NEXT:    s_wait_loadcnt 0x0
-; GFX1250-SDAG-NEXT:    v_subrev_nc_u32_e32 v0, s1, v4
+; GFX1250-SDAG-NEXT:    v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v4
 ; GFX1250-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1250-SDAG-NEXT:    v_cndmask_b32_e32 v4, -1, v0, vcc_lo
 ; GFX1250-SDAG-NEXT:    scratch_load_b64 v[0:1], v4, off
@@ -3378,12 +3257,11 @@ define amdgpu_ps <2 x float> @flat_min_saddr_i64_rtn(ptr inreg %sbase, i32 %voff
 ; GFX1250-GISEL:       ; %bb.0:
 ; GFX1250-GISEL-NEXT:    v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v4, v1
 ; GFX1250-GISEL-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX1250-GISEL-NEXT:    s_mov_b32 s0, src_flat_scratch_base_hi
 ; GFX1250-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1250-GISEL-NEXT:    v_add_co_u32 v6, vcc_lo, v0, v3
 ; GFX1250-GISEL-NEXT:    v_add_co_ci_u32_e64 v7, null, 0, v1, vcc_lo
 ; GFX1250-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-GISEL-NEXT:    v_dual_mov_b32 v5, v2 :: v_dual_bitop2_b32 v0, s0, v7 bitop3:0x14
+; GFX1250-GISEL-NEXT:    v_dual_mov_b32 v5, v2 :: v_dual_bitop2_b32 v0, src_flat_scratch_base_hi, v7 bitop3:0x14
 ; GFX1250-GISEL-NEXT:    v_cmp_le_u32_e32 vcc_lo, 0x4000000, v0
 ; GFX1250-GISEL-NEXT:    ; implicit-def: $vgpr0_vgpr1
 ; GFX1250-GISEL-NEXT:    s_and_saveexec_b32 s0, vcc_lo
@@ -3406,10 +3284,9 @@ define amdgpu_ps <2 x float> @flat_min_saddr_i64_rtn(ptr inreg %sbase, i32 %voff
 ; GFX1250-GISEL-NEXT:    s_and_not1_saveexec_b32 s0, s0
 ; GFX1250-GISEL-NEXT:    s_cbranch_execz .LBB66_2
 ; GFX1250-GISEL-NEXT:  .LBB66_4: ; %atomicrmw.private
-; GFX1250-GISEL-NEXT:    s_mov_b32 s1, src_flat_scratch_base_lo
 ; GFX1250-GISEL-NEXT:    v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7]
 ; GFX1250-GISEL-NEXT:    s_wait_loadcnt 0x0
-; GFX1250-GISEL-NEXT:    v_subrev_nc_u32_e32 v0, s1, v6
+; GFX1250-GISEL-NEXT:    v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v6
 ; GFX1250-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1250-GISEL-NEXT:    v_cndmask_b32_e32 v6, -1, v0, vcc_lo
 ; GFX1250-GISEL-NEXT:    scratch_load_b64 v[0:1], v6, off
@@ -3436,13 +3313,12 @@ define amdgpu_ps <2 x float> @flat_min_saddr_i64_rtn_neg128(ptr inreg %sbase, i3
 ; GFX1250-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1250-SDAG-NEXT:    v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1]
 ; GFX1250-SDAG-NEXT:    v_add_nc_u64_e32 v[4:5], s[0:1], v[0:1]
-; GFX1250-SDAG-NEXT:    s_mov_b32 s0, src_flat_scratch_base_hi
-; GFX1250-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
-; GFX1250-SDAG-NEXT:    v_xor_b32_e32 v0, s0, v5
-; GFX1250-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1250-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-SDAG-NEXT:    v_xor_b32_e32 v0, src_flat_scratch_base_hi, v5
 ; GFX1250-SDAG-NEXT:    v_cmp_lt_u32_e32 vcc_lo, 0x3ffffff, v0
 ; GFX1250-SDAG-NEXT:    ; implicit-def: $vgpr0_vgpr1
 ; GFX1250-SDAG-NEXT:    s_and_saveexec_b32 s0, vcc_lo
+; GFX1250-SDAG-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX1250-SDAG-NEXT:    s_xor_b32 s0, exec_lo, s0
 ; GFX1250-SDAG-NEXT:    s_cbranch_execnz .LBB67_3
 ; GFX1250-SDAG-NEXT:  ; %bb.1: ; %Flow
@@ -3461,10 +3337,9 @@ define amdgpu_ps <2 x float> @flat_min_saddr_i64_rtn_neg128(ptr inreg %sbase, i3
 ; GFX1250-SDAG-NEXT:    s_and_not1_saveexec_b32 s0, s0
 ; GFX1250-SDAG-NEXT:    s_cbranch_execz .LBB67_2
 ; GFX1250-SDAG-NEXT:  .LBB67_4: ; %atomicrmw.private
-; GFX1250-SDAG-NEXT:    s_mov_b32 s1, src_flat_scratch_base_lo
 ; GFX1250-SDAG-NEXT:    v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5]
 ; GFX1250-SDAG-NEXT:    s_wait_loadcnt 0x0
-; GFX1250-SDAG-NEXT:    v_subrev_nc_u32_e32 v0, s1, v4
+; GFX1250-SDAG-NEXT:    v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v4
 ; GFX1250-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1250-SDAG-NEXT:    v_cndmask_b32_e32 v4, -1, v0, vcc_lo
 ; GFX1250-SDAG-NEXT:    scratch_load_b64 v[0:1], v4, off
@@ -3480,7 +3355,6 @@ define amdgpu_ps <2 x float> @flat_min_saddr_i64_rtn_neg128(ptr inreg %sbase, i3
 ; GFX1250-GISEL:       ; %bb.0:
 ; GFX1250-GISEL-NEXT:    v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v4, v1
 ; GFX1250-GISEL-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX1250-GISEL-NEXT:    s_mov_b32 s0, src_flat_scratch_base_hi
 ; GFX1250-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1250-GISEL-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v3
 ; GFX1250-GISEL-NEXT:    v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
@@ -3488,7 +3362,7 @@ define amdgpu_ps <2 x float> @flat_min_saddr_i64_rtn_neg128(ptr inreg %sbase, i3
 ; GFX1250-GISEL-NEXT:    v_add_co_u32 v6, vcc_lo, 0xffffff80, v0
 ; GFX1250-GISEL-NEXT:    v_add_co_ci_u32_e64 v7, null, -1, v1, vcc_lo
 ; GFX1250-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-GISEL-NEXT:    v_dual_mov_b32 v5, v2 :: v_dual_bitop2_b32 v0, s0, v7 bitop3:0x14
+; GFX1250-GISEL-NEXT:    v_dual_mov_b32 v5, v2 :: v_dual_bitop2_b32 v0, src_flat_scratch_base_hi, v7 bitop3:0x14
 ; GFX1250-GISEL-NEXT:    v_cmp_le_u32_e32 vcc_lo, 0x4000000, v0
 ; GFX1250-GISEL-NEXT:    ; implicit-def: $vgpr0_vgpr1
 ; GFX1250-GISEL-NEXT:    s_and_saveexec_b32 s0, vcc_lo
@@ -3511,10 +3385,9 @@ define amdgpu_ps <2 x float> @flat_min_saddr_i64_rtn_neg128(ptr inreg %sbase, i3
 ; GFX1250-GISEL-NEXT:    s_and_not1_saveexec_b32 s0, s0
 ; GFX1250-GISEL-NEXT:    s_cbranch_execz .LBB67_2
 ; GFX1250-GISEL-NEXT:  .LBB67_4: ; %atomicrmw.private
-; GFX1250-GISEL-NEXT:    s_mov_b32 s1, src_flat_scratch_base_lo
 ; GFX1250-GISEL-NEXT:    v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7]
 ; GFX1250-GISEL-NEXT:    s_wait_loadcnt 0x0
-; GFX1250-GISEL-NEXT:    v_subrev_nc_u32_e32 v0, s1, v6
+; GFX1250-GISEL-NEXT:    v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v6
 ; GFX1250-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1250-GISEL-NEXT:    v_cndmask_b32_e32 v6, -1, v0, vcc_lo
 ; GFX1250-GISEL-NEXT:    scratch_load_b64 v[0:1], v6, off
@@ -3538,11 +3411,10 @@ define amdgpu_ps void @flat_min_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset,
 ; GFX1250-SDAG:       ; %bb.0:
 ; GFX1250-SDAG-NEXT:    v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1
 ; GFX1250-SDAG-NEXT:    v_mov_b32_e32 v1, 0
-; GFX1250-SDAG-NEXT:    s_mov_b32 s0, src_flat_scratch_base_hi
+; GFX1250-SDAG-NEXT:    s_mov_b32 s0, exec_lo
 ; GFX1250-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1250-SDAG-NEXT:    v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1]
-; GFX1250-SDAG-NEXT:    v_xor_b32_e32 v4, s0, v1
-; GFX1250-SDAG-NEXT:    s_mov_b32 s0, exec_lo
+; GFX1250-SDAG-NEXT:    v_xor_b32_e32 v4, src_flat_scratch_base_hi, v1
 ; GFX1250-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1250-SDAG-NEXT:    v_cmpx_lt_u32_e32 0x3ffffff, v4
 ; GFX1250-SDAG-NEXT:    s_xor_b32 s0, exec_lo, s0
@@ -3561,9 +3433,8 @@ define amdgpu_ps void @flat_min_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset,
 ; GFX1250-SDAG-NEXT:    s_and_not1_saveexec_b32 s0, s0
 ; GFX1250-SDAG-NEXT:    s_cbranch_execz .LBB68_2
 ; GFX1250-SDAG-NEXT:  .LBB68_4: ; %atomicrmw.private
-; GFX1250-SDAG-NEXT:    s_mov_b32 s0, src_flat_scratch_base_lo
 ; GFX1250-SDAG-NEXT:    v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1]
-; GFX1250-SDAG-NEXT:    v_subrev_nc_u32_e32 v4, s0, v0
+; GFX1250-SDAG-NEXT:    v_subrev_nc_u32_e32 v4, src_flat_scratch_base_lo, v0
 ; GFX1250-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1250-SDAG-NEXT:    v_cndmask_b32_e32 v4, -1, v4, vcc_lo
 ; GFX1250-SDAG-NEXT:    scratch_load_b64 v[0:1], v4, off
@@ -3576,13 +3447,12 @@ define amdgpu_ps void @flat_min_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset,
 ; GFX1250-GISEL:       ; %bb.0:
 ; GFX1250-GISEL-NEXT:    v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2
 ; GFX1250-GISEL-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
-; GFX1250-GISEL-NEXT:    s_mov_b32 s0, src_flat_scratch_base_hi
+; GFX1250-GISEL-NEXT:    s_mov_b32 s0, exec_lo
 ; GFX1250-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1250-GISEL-NEXT:    v_add_co_u32 v2, vcc_lo, v2, v0
 ; GFX1250-GISEL-NEXT:    v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo
-; GFX1250-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX1250-GISEL-NEXT:    v_xor_b32_e32 v1, s0, v3
-; GFX1250-GISEL-NEXT:    s_mov_b32 s0, exec_lo
+; GFX1250-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-GISEL-NEXT:    v_xor_b32_e32 v1, src_flat_scratch_base_hi, v3
 ; GFX1250-GISEL-NEXT:    v_cmpx_le_u32_e32 0x4000000, v1
 ; GFX1250-GISEL-NEXT:    s_xor_b32 s0, exec_lo, s0
 ; GFX1250-GISEL-NEXT:    s_cbranch_execnz .LBB68_3
@@ -3600,9 +3470,8 @@ define amdgpu_ps void @flat_min_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset,
 ; GFX1250-GISEL-NEXT:    s_and_not1_saveexec_b32 s0, s0
 ; GFX1250-GISEL-NEXT:    s_cbranch_execz .LBB68_2
 ; GFX1250-GISEL-NEXT:  .LBB68_4: ; %atomicrmw.private
-; GFX1250-GISEL-NEXT:    s_mov_b32 s0, src_flat_scratch_base_lo
 ; GFX1250-GISEL-NEXT:    v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3]
-; GFX1250-GISEL-NEXT:    v_subrev_nc_u32_e32 v0, s0, v2
+; GFX1250-GISEL-NEXT:    v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v2
 ; GFX1250-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1250-GISEL-NEXT:    v_cndmask_b32_e32 v2, -1, v0, vcc_lo
 ; GFX1250-GISEL-NEXT:    scratch_load_b64 v[0:1], v2, off
@@ -3625,11 +3494,9 @@ define amdgpu_ps void @flat_min_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo
 ; GFX1250-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1250-SDAG-NEXT:    v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1]
 ; GFX1250-SDAG-NEXT:    v_add_nc_u64_e32 v[0:1], s[0:1], v[0:1]
-; GFX1250-SDAG-NEXT:    s_mov_b32 s0, src_flat_scratch_base_hi
-; GFX1250-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
-; GFX1250-SDAG-NEXT:    v_xor_b32_e32 v4, s0, v1
 ; GFX1250-SDAG-NEXT:    s_mov_b32 s0, exec_lo
-; GFX1250-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-SDAG-NEXT:    v_xor_b32_e32 v4, src_flat_scratch_base_hi, v1
 ; GFX1250-SDAG-NEXT:    v_cmpx_lt_u32_e32 0x3ffffff, v4
 ; GFX1250-SDAG-NEXT:    s_xor_b32 s0, exec_lo, s0
 ; GFX1250-SDAG-NEXT:    s_cbranch_execnz .LBB69_3
@@ -3647,9 +3514,8 @@ define amdgpu_ps void @flat_min_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo
 ; GFX1250-SDAG-NEXT:    s_and_not1_saveexec_b32 s0, s0
 ; GFX1250-SDAG-NEXT:    s_cbranch_execz .LBB69_2
 ; GFX1250-SDAG-NEXT:  .LBB69_4: ; %atomicrmw.private
-; GFX1250-SDAG-NEXT:    s_mov_b32 s0, src_flat_scratch_base_lo
 ; GFX1250-SDAG-NEXT:    v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1]
-; GFX1250-SDAG-NEXT:    v_subrev_nc_u32_e32 v4, s0, v0
+; GFX1250-SDAG-NEXT:    v_subrev_nc_u32_e32 v4, src_flat_scratch_base_lo, v0
 ; GFX1250-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1250-SDAG-NEXT:    v_cndmask_b32_e32 v4, -1, v4, vcc_lo
 ; GFX1250-SDAG-NEXT:    scratch_load_b64 v[0:1], v4, off
@@ -3662,16 +3528,15 @@ define amdgpu_ps void @flat_min_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo
 ; GFX1250-GISEL:       ; %bb.0:
 ; GFX1250-GISEL-NEXT:    v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2
 ; GFX1250-GISEL-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
-; GFX1250-GISEL-NEXT:    s_mov_b32 s0, src_flat_scratch_base_hi
+; GFX1250-GISEL-NEXT:    s_mov_b32 s0, exec_lo
 ; GFX1250-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1250-GISEL-NEXT:    v_add_co_u32 v1, vcc_lo, v2, v0
 ; GFX1250-GISEL-NEXT:    v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo
 ; GFX1250-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1250-GISEL-NEXT:    v_add_co_u32 v2, vcc_lo, 0xffffff80, v1
 ; GFX1250-GISEL-NEXT:    v_add_co_ci_u32_e64 v3, null, -1, v3, vcc_lo
-; GFX1250-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX1250-GISEL-NEXT:    v_xor_b32_e32 v1, s0, v3
-; GFX1250-GISEL-NEXT:    s_mov_b32 s0, exec_lo
+; GFX1250-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-GISEL-NEXT:    v_xor_b32_e32 v1, src_flat_scratch_base_hi, v3
 ; GFX1250-GISEL-NEXT:    v_cmpx_le_u32_e32 0x4000000, v1
 ; GFX1250-GISEL-NEXT:    s_xor_b32 s0, exec_lo, s0
 ; GFX1250-GISEL-NEXT:    s_cbranch_execnz .LBB69_3
@@ -3689,9 +3554,8 @@ define amdgpu_ps void @flat_min_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo
 ; GFX1250-GISEL-NEXT:    s_and_not1_saveexec_b32 s0, s0
 ; GFX1250-GISEL-NEXT:    s_cbranch_execz .LBB69_2
 ; GFX1250-GISEL-NEXT:  .LBB69_4: ; %atomicrmw.private
-; GFX1250-GISEL-NEXT:    s_mov_b32 s0, src_flat_scratch_base_lo
 ; GFX1250-GISEL-NEXT:    v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3]
-; GFX1250-GISEL-NEXT:    v_subrev_nc_u32_e32 v0, s0, v2
+; GFX1250-GISEL-NEXT:    v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v2
 ; GFX1250-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1250-GISEL-NEXT:    v_cndmask_b32_e32 v2, -1, v0, vcc_lo
 ; GFX1250-GISEL-NEXT:    scratch_load_b64 v[0:1], v2, off
@@ -3766,11 +3630,10 @@ define amdgpu_ps <2 x float> @flat_umax_saddr_i64_rtn(ptr inreg %sbase, i32 %vof
 ; GFX1250-SDAG-LABEL: flat_umax_saddr_i64_rtn:
 ; GFX1250-SDAG:       ; %bb.0:
 ; GFX1250-SDAG-NEXT:    v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1
-; GFX1250-SDAG-NEXT:    s_mov_b32 s0, src_flat_scratch_base_hi
 ; GFX1250-SDAG-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX1250-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1250-SDAG-NEXT:    v_add_nc_u64_e32 v[4:5], s[2:3], v[0:1]
-; GFX1250-SDAG-NEXT:    v_xor_b32_e32 v0, s0, v5
+; GFX1250-SDAG-NEXT:    v_xor_b32_e32 v0, src_flat_scratch_base_hi, v5
 ; GFX1250-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
 ; GFX1250-SDAG-NEXT:    v_cmp_lt_u32_e32 vcc_lo, 0x3ffffff, v0
 ; GFX1250-SDAG-NEXT:    ; implicit-def: $vgpr0_vgpr1
@@ -3793,10 +3656,9 @@ define amdgpu_ps <2 x float> @flat_umax_saddr_i64_rtn(ptr inreg %sbase, i32 %vof
 ; GFX1250-SDAG-NEXT:    s_and_not1_saveexec_b32 s0, s0
 ; GFX1250-SDAG-NEXT:    s_cbranch_execz .LBB74_2
 ; GFX1250-SDAG-NEXT:  .LBB74_4: ; %atomicrmw.private
-; GFX1250-SDAG-NEXT:    s_mov_b32 s1, src_flat_scratch_base_lo
 ; GFX1250-SDAG-NEXT:    v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5]
 ; GFX1250-SDAG-NEXT:    s_wait_loadcnt 0x0
-; GFX1250-SDAG-NEXT:    v_subrev_nc_u32_e32 v0, s1, v4
+; GFX1250-SDAG-NEXT:    v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v4
 ; GFX1250-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1250-SDAG-NEXT:    v_cndmask_b32_e32 v4, -1, v0, vcc_lo
 ; GFX1250-SDAG-NEXT:    scratch_load_b64 v[0:1], v4, off
@@ -3812,12 +3674,11 @@ define amdgpu_ps <2 x float> @flat_umax_saddr_i64_rtn(ptr inreg %sbase, i32 %vof
 ; GFX1250-GISEL:       ; %bb.0:
 ; GFX1250-GISEL-NEXT:    v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v4, v1
 ; GFX1250-GISEL-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX1250-GISEL-NEXT:    s_mov_b32 s0, src_flat_scratch_base_hi
 ; GFX1250-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1250-GISEL-NEXT:    v_add_co_u32 v6, vcc_lo, v0, v3
 ; GFX1250-GISEL-NEXT:    v_add_co_ci_u32_e64 v7, null, 0, v1, vcc_lo
 ; GFX1250-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-GISEL-NEXT:    v_dual_mov_b32 v5, v2 :: v_dual_bitop2_b32 v0, s0, v7 bitop3:0x14
+; GFX1250-GISEL-NEXT:    v_dual_mov_b32 v5, v2 :: v_dual_bitop2_b32 v0, src_flat_scratch_base_hi, v7 bitop3:0x14
 ; GFX1250-GISEL-NEXT:    v_cmp_le_u32_e32 vcc_lo, 0x4000000, v0
 ; GFX1250-GISEL-NEXT:    ; implicit-def: $vgpr0_vgpr1
 ; GFX1250-GISEL-NEXT:    s_and_saveexec_b32 s0, vcc_lo
@@ -3840,10 +3701,9 @@ define amdgpu_ps <2 x float> @flat_umax_saddr_i64_rtn(ptr inreg %sbase, i32 %vof
 ; GFX1250-GISEL-NEXT:    s_and_not1_saveexec_b32 s0, s0
 ; GFX1250-GISEL-NEXT:    s_cbranch_execz .LBB74_2
 ; GFX1250-GISEL-NEXT:  .LBB74_4: ; %atomicrmw.private
-; GFX1250-GISEL-NEXT:    s_mov_b32 s1, src_flat_scratch_base_lo
 ; GFX1250-GISEL-NEXT:    v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7]
 ; GFX1250-GISEL-NEXT:    s_wait_loadcnt 0x0
-; GFX1250-GISEL-NEXT:    v_subrev_nc_u32_e32 v0, s1, v6
+; GFX1250-GISEL-NEXT:    v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v6
 ; GFX1250-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1250-GISEL-NEXT:    v_cndmask_b32_e32 v6, -1, v0, vcc_lo
 ; GFX1250-GISEL-NEXT:    scratch_load_b64 v[0:1], v6, off
@@ -3870,13 +3730,12 @@ define amdgpu_ps <2 x float> @flat_umax_saddr_i64_rtn_neg128(ptr inreg %sbase, i
 ; GFX1250-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1250-SDAG-NEXT:    v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1]
 ; GFX1250-SDAG-NEXT:    v_add_nc_u64_e32 v[4:5], s[0:1], v[0:1]
-; GFX1250-SDAG-NEXT:    s_mov_b32 s0, src_flat_scratch_base_hi
-; GFX1250-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
-; GFX1250-SDAG-NEXT:    v_xor_b32_e32 v0, s0, v5
-; GFX1250-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1250-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-SDAG-NEXT:    v_xor_b32_e32 v0, src_flat_scratch_base_hi, v5
 ; GFX1250-SDAG-NEXT:    v_cmp_lt_u32_e32 vcc_lo, 0x3ffffff, v0
 ; GFX1250-SDAG-NEXT:    ; implicit-def: $vgpr0_vgpr1
 ; GFX1250-SDAG-NEXT:    s_and_saveexec_b32 s0, vcc_lo
+; GFX1250-SDAG-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX1250-SDAG-NEXT:    s_xor_b32 s0, exec_lo, s0
 ; GFX1250-SDAG-NEXT:    s_cbranch_execnz .LBB75_3
 ; GFX1250-SDAG-NEXT:  ; %bb.1: ; %Flow
@@ -3895,10 +3754,9 @@ define amdgpu_ps <2 x float> @flat_umax_saddr_i64_rtn_neg128(ptr inreg %sbase, i
 ; GFX1250-SDAG-NEXT:    s_and_not1_saveexec_b32 s0, s0
 ; GFX1250-SDAG-NEXT:    s_cbranch_execz .LBB75_2
 ; GFX1250-SDAG-NEXT:  .LBB75_4: ; %atomicrmw.private
-; GFX1250-SDAG-NEXT:    s_mov_b32 s1, src_flat_scratch_base_lo
 ; GFX1250-SDAG-NEXT:    v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5]
 ; GFX1250-SDAG-NEXT:    s_wait_loadcnt 0x0
-; GFX1250-SDAG-NEXT:    v_subrev_nc_u32_e32 v0, s1, v4
+; GFX1250-SDAG-NEXT:    v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v4
 ; GFX1250-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1250-SDAG-NEXT:    v_cndmask_b32_e32 v4, -1, v0, vcc_lo
 ; GFX1250-SDAG-NEXT:    scratch_load_b64 v[0:1], v4, off
@@ -3914,7 +3772,6 @@ define amdgpu_ps <2 x float> @flat_umax_saddr_i64_rtn_neg128(ptr inreg %sbase, i
 ; GFX1250-GISEL:       ; %bb.0:
 ; GFX1250-GISEL-NEXT:    v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v4, v1
 ; GFX1250-GISEL-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX1250-GISEL-NEXT:    s_mov_b32 s0, src_flat_scratch_base_hi
 ; GFX1250-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1250-GISEL-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v3
 ; GFX1250-GISEL-NEXT:    v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
@@ -3922,7 +3779,7 @@ define amdgpu_ps <2 x float> @flat_umax_saddr_i64_rtn_neg128(ptr inreg %sbase, i
 ; GFX1250-GISEL-NEXT:    v_add_co_u32 v6, vcc_lo, 0xffffff80, v0
 ; GFX1250-GISEL-NEXT:    v_add_co_ci_u32_e64 v7, null, -1, v1, vcc_lo
 ; GFX1250-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-GISEL-NEXT:    v_dual_mov_b32 v5, v2 :: v_dual_bitop2_b32 v0, s0, v7 bitop3:0x14
+; GFX1250-GISEL-NEXT:    v_dual_mov_b32 v5, v2 :: v_dual_bitop2_b32 v0, src_flat_scratch_base_hi, v7 bitop3:0x14
 ; GFX1250-GISEL-NEXT:    v_cmp_le_u32_e32 vcc_lo, 0x4000000, v0
 ; GFX1250-GISEL-NEXT:    ; implicit-def: $vgpr0_vgpr1
 ; GFX1250-GISEL-NEXT:    s_and_saveexec_b32 s0, vcc_lo
@@ -3945,10 +3802,9 @@ define amdgpu_ps <2 x float> @flat_umax_saddr_i64_rtn_neg128(ptr inreg %sbase, i
 ; GFX1250-GISEL-NEXT:    s_and_not1_saveexec_b32 s0, s0
 ; GFX1250-GISEL-NEXT:    s_cbranch_execz .LBB75_2
 ; GFX1250-GISEL-NEXT:  .LBB75_4: ; %atomicrmw.private
-; GFX1250-GISEL-NEXT:    s_mov_b32 s1, src_flat_scratch_base_lo
 ; GFX1250-GISEL-NEXT:    v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7]
 ; GFX1250-GISEL-NEXT:    s_wait_loadcnt 0x0
-; GFX1250-GISEL-NEXT:    v_subrev_nc_u32_e32 v0, s1, v6
+; GFX1250-GISEL-NEXT:    v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v6
 ; GFX1250-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1250-GISEL-NEXT:    v_cndmask_b32_e32 v6, -1, v0, vcc_lo
 ; GFX1250-GISEL-NEXT:    scratch_load_b64 v[0:1], v6, off
@@ -3972,11 +3828,10 @@ define amdgpu_ps void @flat_umax_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset,
 ; GFX1250-SDAG:       ; %bb.0:
 ; GFX1250-SDAG-NEXT:    v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1
 ; GFX1250-SDAG-NEXT:    v_mov_b32_e32 v1, 0
-; GFX1250-SDAG-NEXT:    s_mov_b32 s0, src_flat_scratch_base_hi
+; GFX1250-SDAG-NEXT:    s_mov_b32 s0, exec_lo
 ; GFX1250-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1250-SDAG-NEXT:    v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1]
-; GFX1250-SDAG-NEXT:    v_xor_b32_e32 v4, s0, v1
-; GFX1250-SDAG-NEXT:    s_mov_b32 s0, exec_lo
+; GFX1250-SDAG-NEXT:    v_xor_b32_e32 v4, src_flat_scratch_base_hi, v1
 ; GFX1250-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1250-SDAG-NEXT:    v_cmpx_lt_u32_e32 0x3ffffff, v4
 ; GFX1250-SDAG-NEXT:    s_xor_b32 s0, exec_lo, s0
@@ -3995,9 +3850,8 @@ define amdgpu_ps void @flat_umax_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset,
 ; GFX1250-SDAG-NEXT:    s_and_not1_saveexec_b32 s0, s0
 ; GFX1250-SDAG-NEXT:    s_cbranch_execz .LBB76_2
 ; GFX1250-SDAG-NEXT:  .LBB76_4: ; %atomicrmw.private
-; GFX1250-SDAG-NEXT:    s_mov_b32 s0, src_flat_scratch_base_lo
 ; GFX1250-SDAG-NEXT:    v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1]
-; GFX1250-SDAG-NEXT:    v_subrev_nc_u32_e32 v4, s0, v0
+; GFX1250-SDAG-NEXT:    v_subrev_nc_u32_e32 v4, src_flat_scratch_base_lo, v0
 ; GFX1250-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1250-SDAG-NEXT:    v_cndmask_b32_e32 v4, -1, v4, vcc_lo
 ; GFX1250-SDAG-NEXT:    scratch_load_b64 v[0:1], v4, off
@@ -4010,13 +3864,12 @@ define amdgpu_ps void @flat_umax_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset,
 ; GFX1250-GISEL:       ; %bb.0:
 ; GFX1250-GISEL-NEXT:    v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2
 ; GFX1250-GISEL-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
-; GFX1250-GISEL-NEXT:    s_mov_b32 s0, src_flat_scratch_base_hi
+; GFX1250-GISEL-NEXT:    s_mov_b32 s0, exec_lo
 ; GFX1250-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1250-GISEL-NEXT:    v_add_co_u32 v2, vcc_lo, v2, v0
 ; GFX1250-GISEL-NEXT:    v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo
-; GFX1250-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX1250-GISEL-NEXT:    v_xor_b32_e32 v1, s0, v3
-; GFX1250-GISEL-NEXT:    s_mov_b32 s0, exec_lo
+; GFX1250-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-GISEL-NEXT:    v_xor_b32_e32 v1, src_flat_scratch_base_hi, v3
 ; GFX1250-GISEL-NEXT:    v_cmpx_le_u32_e32 0x4000000, v1
 ; GFX1250-GISEL-NEXT:    s_xor_b32 s0, exec_lo, s0
 ; GFX1250-GISEL-NEXT:    s_cbranch_execnz .LBB76_3
@@ -4034,9 +3887,8 @@ define amdgpu_ps void @flat_umax_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset,
 ; GFX1250-GISEL-NEXT:    s_and_not1_saveexec_b32 s0, s0
 ; GFX1250-GISEL-NEXT:    s_cbranch_execz .LBB76_2
 ; GFX1250-GISEL-NEXT:  .LBB76_4: ; %atomicrmw.private
-; GFX1250-GISEL-NEXT:    s_mov_b32 s0, src_flat_scratch_base_lo
 ; GFX1250-GISEL-NEXT:    v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3]
-; GFX1250-GISEL-NEXT:    v_subrev_nc_u32_e32 v0, s0, v2
+; GFX1250-GISEL-NEXT:    v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v2
 ; GFX1250-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1250-GISEL-NEXT:    v_cndmask_b32_e32 v2, -1, v0, vcc_lo
 ; GFX1250-GISEL-NEXT:    scratch_load_b64 v[0:1], v2, off
@@ -4059,11 +3911,9 @@ define amdgpu_ps void @flat_umax_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %v
 ; GFX1250-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1250-SDAG-NEXT:    v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1]
 ; GFX1250-SDAG-NEXT:    v_add_nc_u64_e32 v[0:1], s[0:1], v[0:1]
-; GFX1250-SDAG-NEXT:    s_mov_b32 s0, src_flat_scratch_base_hi
-; GFX1250-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
-; GFX1250-SDAG-NEXT:    v_xor_b32_e32 v4, s0, v1
 ; GFX1250-SDAG-NEXT:    s_mov_b32 s0, exec_lo
-; GFX1250-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-SDAG-NEXT:    v_xor_b32_e32 v4, src_flat_scratch_base_hi, v1
 ; GFX1250-SDAG-NEXT:    v_cmpx_lt_u32_e32 0x3ffffff, v4
 ; GFX1250-SDAG-NEXT:    s_xor_b32 s0, exec_lo, s0
 ; GFX1250-SDAG-NEXT:    s_cbranch_execnz .LBB77_3
@@ -4081,9 +3931,8 @@ define amdgpu_ps void @flat_umax_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %v
 ; GFX1250-SDAG-NEXT:    s_and_not1_saveexec_b32 s0, s0
 ; GFX1250-SDAG-NEXT:    s_cbranch_execz .LBB77_2
 ; GFX1250-SDAG-NEXT:  .LBB77_4: ; %atomicrmw.private
-; GFX1250-SDAG-NEXT:    s_mov_b32 s0, src_flat_scratch_base_lo
 ; GFX1250-SDAG-NEXT:    v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1]
-; GFX1250-SDAG-NEXT:    v_subrev_nc_u32_e32 v4, s0, v0
+; GFX1250-SDAG-NEXT:    v_subrev_nc_u32_e32 v4, src_flat_scratch_base_lo, v0
 ; GFX1250-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1250-SDAG-NEXT:    v_cndmask_b32_e32 v4, -1, v4, vcc_lo
 ; GFX1250-SDAG-NEXT:    scratch_load_b64 v[0:1], v4, off
@@ -4096,16 +3945,15 @@ define amdgpu_ps void @flat_umax_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %v
 ; GFX1250-GISEL:       ; %bb.0:
 ; GFX1250-GISEL-NEXT:    v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2
 ; GFX1250-GISEL-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
-; GFX1250-GISEL-NEXT:    s_mov_b32 s0, src_flat_scratch_base_hi
+; GFX1250-GISEL-NEXT:    s_mov_b32 s0, exec_lo
 ; GFX1250-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1250-GISEL-NEXT:    v_add_co_u32 v1, vcc_lo, v2, v0
 ; GFX1250-GISEL-NEXT:    v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo
 ; GFX1250-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1250-GISEL-NEXT:    v_add_co_u32 v2, vcc_lo, 0xffffff80, v1
 ; GFX1250-GISEL-NEXT:    v_add_co_ci_u32_e64 v3, null, -1, v3, vcc_lo
-; GFX1250-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX1250-GISEL-NEXT:    v_xor_b32_e32 v1, s0, v3
-; GFX1250-GISEL-NEXT:    s_mov_b32 s0, exec_lo
+; GFX1250-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-GISEL-NEXT:    v_xor_b32_e32 v1, src_flat_scratch_base_hi, v3
 ; GFX1250-GISEL-NEXT:    v_cmpx_le_u32_e32 0x4000000, v1
 ; GFX1250-GISEL-NEXT:    s_xor_b32 s0, exec_lo, s0
 ; GFX1250-GISEL-NEXT:    s_cbranch_execnz .LBB77_3
@@ -4123,9 +3971,8 @@ define amdgpu_ps void @flat_umax_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %v
 ; GFX1250-GISEL-NEXT:    s_and_not1_saveexec_b32 s0, s0
 ; GFX1250-GISEL-NEXT:    s_cbranch_execz .LBB77_2
 ; GFX1250-GISEL-NEXT:  .LBB77_4: ; %atomicrmw.private
-; GFX1250-GISEL-NEXT:    s_mov_b32 s0, src_flat_scratch_base_lo
 ; GFX1250-GISEL-NEXT:    v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3]
-; GFX1250-GISEL-NEXT:    v_subrev_nc_u32_e32 v0, s0, v2
+; GFX1250-GISEL-NEXT:    v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v2
 ; GFX1250-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1250-GISEL-NEXT:    v_cndmask_b32_e32 v2, -1, v0, vcc_lo
 ; GFX1250-GISEL-NEXT:    scratch_load_b64 v[0:1], v2, off
@@ -4200,11 +4047,10 @@ define amdgpu_ps <2 x float> @flat_umin_saddr_i64_rtn(ptr inreg %sbase, i32 %vof
 ; GFX1250-SDAG-LABEL: flat_umin_saddr_i64_rtn:
 ; GFX1250-SDAG:       ; %bb.0:
 ; GFX1250-SDAG-NEXT:    v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1
-; GFX1250-SDAG-NEXT:    s_mov_b32 s0, src_flat_scratch_base_hi
 ; GFX1250-SDAG-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX1250-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1250-SDAG-NEXT:    v_add_nc_u64_e32 v[4:5], s[2:3], v[0:1]
-; GFX1250-SDAG-NEXT:    v_xor_b32_e32 v0, s0, v5
+; GFX1250-SDAG-NEXT:    v_xor_b32_e32 v0, src_flat_scratch_base_hi, v5
 ; GFX1250-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
 ; GFX1250-SDAG-NEXT:    v_cmp_lt_u32_e32 vcc_lo, 0x3ffffff, v0
 ; GFX1250-SDAG-NEXT:    ; implicit-def: $vgpr0_vgpr1
@@ -4227,10 +4073,9 @@ define amdgpu_ps <2 x float> @flat_umin_saddr_i64_rtn(ptr inreg %sbase, i32 %vof
 ; GFX1250-SDAG-NEXT:    s_and_not1_saveexec_b32 s0, s0
 ; GFX1250-SDAG-NEXT:    s_cbranch_execz .LBB82_2
 ; GFX1250-SDAG-NEXT:  .LBB82_4: ; %atomicrmw.private
-; GFX1250-SDAG-NEXT:    s_mov_b32 s1, src_flat_scratch_base_lo
 ; GFX1250-SDAG-NEXT:    v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5]
 ; GFX1250-SDAG-NEXT:    s_wait_loadcnt 0x0
-; GFX1250-SDAG-NEXT:    v_subrev_nc_u32_e32 v0, s1, v4
+; GFX1250-SDAG-NEXT:    v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v4
 ; GFX1250-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1250-SDAG-NEXT:    v_cndmask_b32_e32 v4, -1, v0, vcc_lo
 ; GFX1250-SDAG-NEXT:    scratch_load_b64 v[0:1], v4, off
@@ -4246,12 +4091,11 @@ define amdgpu_ps <2 x float> @flat_umin_saddr_i64_rtn(ptr inreg %sbase, i32 %vof
 ; GFX1250-GISEL:       ; %bb.0:
 ; GFX1250-GISEL-NEXT:    v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v4, v1
 ; GFX1250-GISEL-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX1250-GISEL-NEXT:    s_mov_b32 s0, src_flat_scratch_base_hi
 ; GFX1250-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1250-GISEL-NEXT:    v_add_co_u32 v6, vcc_lo, v0, v3
 ; GFX1250-GISEL-NEXT:    v_add_co_ci_u32_e64 v7, null, 0, v1, vcc_lo
 ; GFX1250-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-GISEL-NEXT:    v_dual_mov_b32 v5, v2 :: v_dual_bitop2_b32 v0, s0, v7 bitop3:0x14
+; GFX1250-GISEL-NEXT:    v_dual_mov_b32 v5, v2 :: v_dual_bitop2_b32 v0, src_flat_scratch_base_hi, v7 bitop3:0x14
 ; GFX1250-GISEL-NEXT:    v_cmp_le_u32_e32 vcc_lo, 0x4000000, v0
 ; GFX1250-GISEL-NEXT:    ; implicit-def: $vgpr0_vgpr1
 ; GFX1250-GISEL-NEXT:    s_and_saveexec_b32 s0, vcc_lo
@@ -4274,10 +4118,9 @@ define amdgpu_ps <2 x float> @flat_umin_saddr_i64_rtn(ptr inreg %sbase, i32 %vof
 ; GFX1250-GISEL-NEXT:    s_and_not1_saveexec_b32 s0, s0
 ; GFX1250-GISEL-NEXT:    s_cbranch_execz .LBB82_2
 ; GFX1250-GISEL-NEXT:  .LBB82_4: ; %atomicrmw.private
-; GFX1250-GISEL-NEXT:    s_mov_b32 s1, src_flat_scratch_base_lo
 ; GFX1250-GISEL-NEXT:    v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7]
 ; GFX1250-GISEL-NEXT:    s_wait_loadcnt 0x0
-; GFX1250-GISEL-NEXT:    v_subrev_nc_u32_e32 v0, s1, v6
+; GFX1250-GISEL-NEXT:    v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v6
 ; GFX1250-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1250-GISEL-NEXT:    v_cndmask_b32_e32 v6, -1, v0, vcc_lo
 ; GFX1250-GISEL-NEXT:    scratch_load_b64 v[0:1], v6, off
@@ -4304,13 +4147,12 @@ define amdgpu_ps <2 x float> @flat_umin_saddr_i64_rtn_neg128(ptr inreg %sbase, i
 ; GFX1250-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1250-SDAG-NEXT:    v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1]
 ; GFX1250-SDAG-NEXT:    v_add_nc_u64_e32 v[4:5], s[0:1], v[0:1]
-; GFX1250-SDAG-NEXT:    s_mov_b32 s0, src_flat_scratch_base_hi
-; GFX1250-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
-; GFX1250-SDAG-NEXT:    v_xor_b32_e32 v0, s0, v5
-; GFX1250-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1250-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-SDAG-NEXT:    v_xor_b32_e32 v0, src_flat_scratch_base_hi, v5
 ; GFX1250-SDAG-NEXT:    v_cmp_lt_u32_e32 vcc_lo, 0x3ffffff, v0
 ; GFX1250-SDAG-NEXT:    ; implicit-def: $vgpr0_vgpr1
 ; GFX1250-SDAG-NEXT:    s_and_saveexec_b32 s0, vcc_lo
+; GFX1250-SDAG-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX1250-SDAG-NEXT:    s_xor_b32 s0, exec_lo, s0
 ; GFX1250-SDAG-NEXT:    s_cbranch_execnz .LBB83_3
 ; GFX1250-SDAG-NEXT:  ; %bb.1: ; %Flow
@@ -4329,10 +4171,9 @@ define amdgpu_ps <2 x float> @flat_umin_saddr_i64_rtn_neg128(ptr inreg %sbase, i
 ; GFX1250-SDAG-NEXT:    s_and_not1_saveexec_b32 s0, s0
 ; GFX1250-SDAG-NEXT:    s_cbranch_execz .LBB83_2
 ; GFX1250-SDAG-NEXT:  .LBB83_4: ; %atomicrmw.private
-; GFX1250-SDAG-NEXT:    s_mov_b32 s1, src_flat_scratch_base_lo
 ; GFX1250-SDAG-NEXT:    v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5]
 ; GFX1250-SDAG-NEXT:    s_wait_loadcnt 0x0
-; GFX1250-SDAG-NEXT:    v_subrev_nc_u32_e32 v0, s1, v4
+; GFX1250-SDAG-NEXT:    v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v4
 ; GFX1250-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1250-SDAG-NEXT:    v_cndmask_b32_e32 v4, -1, v0, vcc_lo
 ; GFX1250-SDAG-NEXT:    scratch_load_b64 v[0:1], v4, off
@@ -4348,7 +4189,6 @@ define amdgpu_ps <2 x float> @flat_umin_saddr_i64_rtn_neg128(ptr inreg %sbase, i
 ; GFX1250-GISEL:       ; %bb.0:
 ; GFX1250-GISEL-NEXT:    v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v4, v1
 ; GFX1250-GISEL-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX1250-GISEL-NEXT:    s_mov_b32 s0, src_flat_scratch_base_hi
 ; GFX1250-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1250-GISEL-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v3
 ; GFX1250-GISEL-NEXT:    v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
@@ -4356,7 +4196,7 @@ define amdgpu_ps <2 x float> @flat_umin_saddr_i64_rtn_neg128(ptr inreg %sbase, i
 ; GFX1250-GISEL-NEXT:    v_add_co_u32 v6, vcc_lo, 0xffffff80, v0
 ; GFX1250-GISEL-NEXT:    v_add_co_ci_u32_e64 v7, null, -1, v1, vcc_lo
 ; GFX1250-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-GISEL-NEXT:    v_dual_mov_b32 v5, v2 :: v_dual_bitop2_b32 v0, s0, v7 bitop3:0x14
+; GFX1250-GISEL-NEXT:    v_dual_mov_b32 v5, v2 :: v_dual_bitop2_b32 v0, src_flat_scratch_base_hi, v7 bitop3:0x14
 ; GFX1250-GISEL-NEXT:    v_cmp_le_u32_e32 vcc_lo, 0x4000000, v0
 ; GFX1250-GISEL-NEXT:    ; implicit-def: $vgpr0_vgpr1
 ; GFX1250-GISEL-NEXT:    s_and_saveexec_b32 s0, vcc_lo
@@ -4379,10 +4219,9 @@ define amdgpu_ps <2 x float> @flat_umin_saddr_i64_rtn_neg128(ptr inreg %sbase, i
 ; GFX1250-GISEL-NEXT:    s_and_not1_saveexec_b32 s0, s0
 ; GFX1250-GISEL-NEXT:    s_cbranch_execz .LBB83_2
 ; GFX1250-GISEL-NEXT:  .LBB83_4: ; %atomicrmw.private
-; GFX1250-GISEL-NEXT:    s_mov_b32 s1, src_flat_scratch_base_lo
 ; GFX1250-GISEL-NEXT:    v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7]
 ; GFX1250-GISEL-NEXT:    s_wait_loadcnt 0x0
-; GFX1250-GISEL-NEXT:    v_subrev_nc_u32_e32 v0, s1, v6
+; GFX1250-GISEL-NEXT:    v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v6
 ; GFX1250-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1250-GISEL-NEXT:    v_cndmask_b32_e32 v6, -1, v0, vcc_lo
 ; GFX1250-GISEL-NEXT:    scratch_load_b64 v[0:1], v6, off
@@ -4406,11 +4245,10 @@ define amdgpu_ps void @flat_umin_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset,
 ; GFX1250-SDAG:       ; %bb.0:
 ; GFX1250-SDAG-NEXT:    v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1
 ; GFX1250-SDAG-NEXT:    v_mov_b32_e32 v1, 0
-; GFX1250-SDAG-NEXT:    s_mov_b32 s0, src_flat_scratch_base_hi
+; GFX1250-SDAG-NEXT:    s_mov_b32 s0, exec_lo
 ; GFX1250-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1250-SDAG-NEXT:    v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1]
-; GFX1250-SDAG-NEXT:    v_xor_b32_e32 v4, s0, v1
-; GFX1250-SDAG-NEXT:    s_mov_b32 s0, exec_lo
+; GFX1250-SDAG-NEXT:    v_xor_b32_e32 v4, src_flat_scratch_base_hi, v1
 ; GFX1250-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1250-SDAG-NEXT:    v_cmpx_lt_u32_e32 0x3ffffff, v4
 ; GFX1250-SDAG-NEXT:    s_xor_b32 s0, exec_lo, s0
@@ -4429,9 +4267,8 @@ define amdgpu_ps void @flat_umin_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset,
 ; GFX1250-SDAG-NEXT:    s_and_not1_saveexec_b32 s0, s0
 ; GFX1250-SDAG-NEXT:    s_cbranch_execz .LBB84_2
 ; GFX1250-SDAG-NEXT:  .LBB84_4: ; %atomicrmw.private
-; GFX1250-SDAG-NEXT:    s_mov_b32 s0, src_flat_scratch_base_lo
 ; GFX1250-SDAG-NEXT:    v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1]
-; GFX1250-SDAG-NEXT:    v_subrev_nc_u32_e32 v4, s0, v0
+; GFX1250-SDAG-NEXT:    v_subrev_nc_u32_e32 v4, src_flat_scratch_base_lo, v0
 ; GFX1250-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1250-SDAG-NEXT:    v_cndmask_b32_e32 v4, -1, v4, vcc_lo
 ; GFX1250-SDAG-NEXT:    scratch_load_b64 v[0:1], v4, off
@@ -4444,13 +4281,12 @@ define amdgpu_ps void @flat_umin_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset,
 ; GFX1250-GISEL:       ; %bb.0:
 ; GFX1250-GISEL-NEXT:    v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2
 ; GFX1250-GISEL-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
-; GFX1250-GISEL-NEXT:    s_mov_b32 s0, src_flat_scratch_base_hi
+; GFX1250-GISEL-NEXT:    s_mov_b32 s0, exec_lo
 ; GFX1250-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1250-GISEL-NEXT:    v_add_co_u32 v2, vcc_lo, v2, v0
 ; GFX1250-GISEL-NEXT:    v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo
-; GFX1250-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX1250-GISEL-NEXT:    v_xor_b32_e32 v1, s0, v3
-; GFX1250-GISEL-NEXT:    s_mov_b32 s0, exec_lo
+; GFX1250-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-GISEL-NEXT:    v_xor_b32_e32 v1, src_flat_scratch_base_hi, v3
 ; GFX1250-GISEL-NEXT:    v_cmpx_le_u32_e32 0x4000000, v1
 ; GFX1250-GISEL-NEXT:    s_xor_b32 s0, exec_lo, s0
 ; GFX1250-GISEL-NEXT:    s_cbranch_execnz .LBB84_3
@@ -4468,9 +4304,8 @@ define amdgpu_ps void @flat_umin_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset,
 ; GFX1250-GISEL-NEXT:    s_and_not1_saveexec_b32 s0, s0
 ; GFX1250-GISEL-NEXT:    s_cbranch_execz .LBB84_2
 ; GFX1250-GISEL-NEXT:  .LBB84_4: ; %atomicrmw.private
-; GFX1250-GISEL-NEXT:    s_mov_b32 s0, src_flat_scratch_base_lo
 ; GFX1250-GISEL-NEXT:    v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3]
-; GFX1250-GISEL-NEXT:    v_subrev_nc_u32_e32 v0, s0, v2
+; GFX1250-GISEL-NEXT:    v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v2
 ; GFX1250-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1250-GISEL-NEXT:    v_cndmask_b32_e32 v2, -1, v0, vcc_lo
 ; GFX1250-GISEL-NEXT:    scratch_load_b64 v[0:1], v2, off
@@ -4493,11 +4328,9 @@ define amdgpu_ps void @flat_umin_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %v
 ; GFX1250-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1250-SDAG-NEXT:    v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1]
 ; GFX1250-SDAG-NEXT:    v_add_nc_u64_e32 v[0:1], s[0:1], v[0:1]
-; GFX1250-SDAG-NEXT:    s_mov_b32 s0, src_flat_scratch_base_hi
-; GFX1250-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
-; GFX1250-SDAG-NEXT:    v_xor_b32_e32 v4, s0, v1
 ; GFX1250-SDAG-NEXT:    s_mov_b32 s0, exec_lo
-; GFX1250-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-SDAG-NEXT:    v_xor_b32_e32 v4, src_flat_scratch_base_hi, v1
 ; GFX1250-SDAG-NEXT:    v_cmpx_lt_u32_e32 0x3ffffff, v4
 ; GFX1250-SDAG-NEXT:    s_xor_b32 s0, exec_lo, s0
 ; GFX1250-SDAG-NEXT:    s_cbranch_execnz .LBB85_3
@@ -4515,9 +4348,8 @@ define amdgpu_ps void @flat_umin_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %v
 ; GFX1250-SDAG-NEXT:    s_and_not1_saveexec_b32 s0, s0
 ; GFX1250-SDAG-NEXT:    s_cbranch_execz .LBB85_2
 ; GFX1250-SDAG-NEXT:  .LBB85_4: ; %atomicrmw.private
-; GFX1250-SDAG-NEXT:    s_mov_b32 s0, src_flat_scratch_base_lo
 ; GFX1250-SDAG-NEXT:    v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1]
-; GFX1250-SDAG-NEXT:    v_subrev_nc_u32_e32 v4, s0, v0
+; GFX1250-SDAG-NEXT:    v_subrev_nc_u32_e32 v4, src_flat_scratch_base_lo, v0
 ; GFX1250-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1250-SDAG-NEXT:    v_cndmask_b32_e32 v4, -1, v4, vcc_lo
 ; GFX1250-SDAG-NEXT:    scratch_load_b64 v[0:1], v4, off
@@ -4530,16 +4362,15 @@ define amdgpu_ps void @flat_umin_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %v
 ; GFX1250-GISEL:       ; %bb.0:
 ; GFX1250-GISEL-NEXT:    v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2
 ; GFX1250-GISEL-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
-; GFX1250-GISEL-NEXT:    s_mov_b32 s0, src_flat_scratch_base_hi
+; GFX1250-GISEL-NEXT:    s_mov_b32 s0, exec_lo
 ; GFX1250-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1250-GISEL-NEXT:    v_add_co_u32 v1, vcc_lo, v2, v0
 ; GFX1250-GISEL-NEXT:    v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo
 ; GFX1250-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1250-GISEL-NEXT:    v_add_co_u32 v2, vcc_lo, 0xffffff80, v1
 ; GFX1250-GISEL-NEXT:    v_add_co_ci_u32_e64 v3, null, -1, v3, vcc_lo
-; GFX1250-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX1250-GISEL-NEXT:    v_xor_b32_e32 v1, s0, v3
-; GFX1250-GISEL-NEXT:    s_mov_b32 s0, exec_lo
+; GFX1250-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-GISEL-NEXT:    v_xor_b32_e32 v1, src_flat_scratch_base_hi, v3
 ; GFX1250-GISEL-NEXT:    v_cmpx_le_u32_e32 0x4000000, v1
 ; GFX1250-GISEL-NEXT:    s_xor_b32 s0, exec_lo, s0
 ; GFX1250-GISEL-NEXT:    s_cbranch_execnz .LBB85_3
@@ -4557,9 +4388,8 @@ define amdgpu_ps void @flat_umin_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %v
 ; GFX1250-GISEL-NEXT:    s_and_not1_saveexec_b32 s0, s0
 ; GFX1250-GISEL-NEXT:    s_cbranch_execz .LBB85_2
 ; GFX1250-GISEL-NEXT:  .LBB85_4: ; %atomicrmw.private
-; GFX1250-GISEL-NEXT:    s_mov_b32 s0, src_flat_scratch_base_lo
 ; GFX1250-GISEL-NEXT:    v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3]
-; GFX1250-GISEL-NEXT:    v_subrev_nc_u32_e32 v0, s0, v2
+; GFX1250-GISEL-NEXT:    v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v2
 ; GFX1250-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1250-GISEL-NEXT:    v_cndmask_b32_e32 v2, -1, v0, vcc_lo
 ; GFX1250-GISEL-NEXT:    scratch_load_b64 v[0:1], v2, off
@@ -4654,12 +4484,11 @@ define amdgpu_ps <2 x float> @flat_cmpxchg_saddr_i64_rtn(ptr inreg %sbase, i32 %
 ; GFX1250-SDAG-LABEL: flat_cmpxchg_saddr_i64_rtn:
 ; GFX1250-SDAG:       ; %bb.0:
 ; GFX1250-SDAG-NEXT:    v_dual_mov_b32 v7, v2 :: v_dual_mov_b32 v6, v1
-; GFX1250-SDAG-NEXT:    s_mov_b32 s0, src_flat_scratch_base_hi
 ; GFX1250-SDAG-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v5, v4
 ; GFX1250-SDAG-NEXT:    v_mov_b32_e32 v4, v3
 ; GFX1250-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1250-SDAG-NEXT:    v_add_nc_u64_e32 v[2:3], s[2:3], v[0:1]
-; GFX1250-SDAG-NEXT:    v_xor_b32_e32 v0, s0, v3
+; GFX1250-SDAG-NEXT:    v_xor_b32_e32 v0, src_flat_scratch_base_hi, v3
 ; GFX1250-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
 ; GFX1250-SDAG-NEXT:    v_cmp_lt_u32_e32 vcc_lo, 0x3ffffff, v0
 ; GFX1250-SDAG-NEXT:    ; implicit-def: $vgpr0_vgpr1
@@ -4684,9 +4513,8 @@ define amdgpu_ps <2 x float> @flat_cmpxchg_saddr_i64_rtn(ptr inreg %sbase, i32 %
 ; GFX1250-SDAG-NEXT:    s_and_not1_saveexec_b32 s0, s0
 ; GFX1250-SDAG-NEXT:    s_cbranch_execz .LBB90_2
 ; GFX1250-SDAG-NEXT:  .LBB90_4: ; %atomicrmw.private
-; GFX1250-SDAG-NEXT:    s_mov_b32 s1, src_flat_scratch_base_lo
 ; GFX1250-SDAG-NEXT:    v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3]
-; GFX1250-SDAG-NEXT:    v_subrev_nc_u32_e32 v0, s1, v2
+; GFX1250-SDAG-NEXT:    v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v2
 ; GFX1250-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1250-SDAG-NEXT:    v_cndmask_b32_e32 v8, -1, v0, vcc_lo
 ; GFX1250-SDAG-NEXT:    scratch_load_b64 v[0:1], v8, off
@@ -4704,12 +4532,11 @@ define amdgpu_ps <2 x float> @flat_cmpxchg_saddr_i64_rtn(ptr inreg %sbase, i32 %
 ; GFX1250-GISEL-NEXT:    v_dual_mov_b32 v5, v0 :: v_dual_mov_b32 v8, v1
 ; GFX1250-GISEL-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
 ; GFX1250-GISEL-NEXT:    v_dual_mov_b32 v9, v2 :: v_dual_mov_b32 v6, v3
-; GFX1250-GISEL-NEXT:    s_mov_b32 s0, src_flat_scratch_base_hi
 ; GFX1250-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1250-GISEL-NEXT:    v_add_co_u32 v2, vcc_lo, v0, v5
 ; GFX1250-GISEL-NEXT:    v_add_co_ci_u32_e64 v3, null, 0, v1, vcc_lo
 ; GFX1250-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-GISEL-NEXT:    v_dual_mov_b32 v7, v4 :: v_dual_bitop2_b32 v0, s0, v3 bitop3:0x14
+; GFX1250-GISEL-NEXT:    v_dual_mov_b32 v7, v4 :: v_dual_bitop2_b32 v0, src_flat_scratch_base_hi, v3 bitop3:0x14
 ; GFX1250-GISEL-NEXT:    v_cmp_le_u32_e32 vcc_lo, 0x4000000, v0
 ; GFX1250-GISEL-NEXT:    ; implicit-def: $vgpr0_vgpr1
 ; GFX1250-GISEL-NEXT:    s_and_saveexec_b32 s0, vcc_lo
@@ -4734,9 +4561,8 @@ define amdgpu_ps <2 x float> @flat_cmpxchg_saddr_i64_rtn(ptr inreg %sbase, i32 %
 ; GFX1250-GISEL-NEXT:    s_and_not1_saveexec_b32 s0, s0
 ; GFX1250-GISEL-NEXT:    s_cbranch_execz .LBB90_2
 ; GFX1250-GISEL-NEXT:  .LBB90_4: ; %atomicrmw.private
-; GFX1250-GISEL-NEXT:    s_mov_b32 s1, src_flat_scratch_base_lo
 ; GFX1250-GISEL-NEXT:    v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3]
-; GFX1250-GISEL-NEXT:    v_subrev_nc_u32_e32 v0, s1, v2
+; GFX1250-GISEL-NEXT:    v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v2
 ; GFX1250-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1250-GISEL-NEXT:    v_cndmask_b32_e32 v4, -1, v0, vcc_lo
 ; GFX1250-GISEL-NEXT:    scratch_load_b64 v[0:1], v4, off
@@ -4766,13 +4592,12 @@ define amdgpu_ps <2 x float> @flat_cmpxchg_saddr_i64_rtn_neg128(ptr inreg %sbase
 ; GFX1250-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1250-SDAG-NEXT:    v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1]
 ; GFX1250-SDAG-NEXT:    v_add_nc_u64_e32 v[2:3], s[0:1], v[0:1]
-; GFX1250-SDAG-NEXT:    s_mov_b32 s0, src_flat_scratch_base_hi
-; GFX1250-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
-; GFX1250-SDAG-NEXT:    v_xor_b32_e32 v0, s0, v3
-; GFX1250-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1250-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-SDAG-NEXT:    v_xor_b32_e32 v0, src_flat_scratch_base_hi, v3
 ; GFX1250-SDAG-NEXT:    v_cmp_lt_u32_e32 vcc_lo, 0x3ffffff, v0
 ; GFX1250-SDAG-NEXT:    ; implicit-def: $vgpr0_vgpr1
 ; GFX1250-SDAG-NEXT:    s_and_saveexec_b32 s0, vcc_lo
+; GFX1250-SDAG-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX1250-SDAG-NEXT:    s_xor_b32 s0, exec_lo, s0
 ; GFX1250-SDAG-NEXT:    s_cbranch_execnz .LBB91_3
 ; GFX1250-SDAG-NEXT:  ; %bb.1: ; %Flow
@@ -4793,9 +4618,8 @@ define amdgpu_ps <2 x float> @flat_cmpxchg_saddr_i64_rtn_neg128(ptr inreg %sbase
 ; GFX1250-SDAG-NEXT:    s_and_not1_saveexec_b32 s0, s0
 ; GFX1250-SDAG-NEXT:    s_cbranch_execz .LBB91_2
 ; GFX1250-SDAG-NEXT:  .LBB91_4: ; %atomicrmw.private
-; GFX1250-SDAG-NEXT:    s_mov_b32 s1, src_flat_scratch_base_lo
 ; GFX1250-SDAG-NEXT:    v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3]
-; GFX1250-SDAG-NEXT:    v_subrev_nc_u32_e32 v0, s1, v2
+; GFX1250-SDAG-NEXT:    v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v2
 ; GFX1250-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1250-SDAG-NEXT:    v_cndmask_b32_e32 v8, -1, v0, vcc_lo
 ; GFX1250-SDAG-NEXT:    scratch_load_b64 v[0:1], v8, off
@@ -4813,7 +4637,6 @@ define amdgpu_ps <2 x float> @flat_cmpxchg_saddr_i64_rtn_neg128(ptr inreg %sbase
 ; GFX1250-GISEL-NEXT:    v_dual_mov_b32 v5, v0 :: v_dual_mov_b32 v8, v1
 ; GFX1250-GISEL-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
 ; GFX1250-GISEL-NEXT:    v_dual_mov_b32 v9, v2 :: v_dual_mov_b32 v6, v3
-; GFX1250-GISEL-NEXT:    s_mov_b32 s0, src_flat_scratch_base_hi
 ; GFX1250-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1250-GISEL-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v5
 ; GFX1250-GISEL-NEXT:    v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
@@ -4821,7 +4644,7 @@ define amdgpu_ps <2 x float> @flat_cmpxchg_saddr_i64_rtn_neg128(ptr inreg %sbase
 ; GFX1250-GISEL-NEXT:    v_add_co_u32 v2, vcc_lo, 0xffffff80, v0
 ; GFX1250-GISEL-NEXT:    v_add_co_ci_u32_e64 v3, null, -1, v1, vcc_lo
 ; GFX1250-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-GISEL-NEXT:    v_dual_mov_b32 v7, v4 :: v_dual_bitop2_b32 v0, s0, v3 bitop3:0x14
+; GFX1250-GISEL-NEXT:    v_dual_mov_b32 v7, v4 :: v_dual_bitop2_b32 v0, src_flat_scratch_base_hi, v3 bitop3:0x14
 ; GFX1250-GISEL-NEXT:    v_cmp_le_u32_e32 vcc_lo, 0x4000000, v0
 ; GFX1250-GISEL-NEXT:    ; implicit-def: $vgpr0_vgpr1
 ; GFX1250-GISEL-NEXT:    s_and_saveexec_b32 s0, vcc_lo
@@ -4846,9 +4669,8 @@ define amdgpu_ps <2 x float> @flat_cmpxchg_saddr_i64_rtn_neg128(ptr inreg %sbase
 ; GFX1250-GISEL-NEXT:    s_and_not1_saveexec_b32 s0, s0
 ; GFX1250-GISEL-NEXT:    s_cbranch_execz .LBB91_2
 ; GFX1250-GISEL-NEXT:  .LBB91_4: ; %atomicrmw.private
-; GFX1250-GISEL-NEXT:    s_mov_b32 s1, src_flat_scratch_base_lo
 ; GFX1250-GISEL-NEXT:    v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3]
-; GFX1250-GISEL-NEXT:    v_subrev_nc_u32_e32 v0, s1, v2
+; GFX1250-GISEL-NEXT:    v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v2
 ; GFX1250-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1250-GISEL-NEXT:    v_cndmask_b32_e32 v4, -1, v0, vcc_lo
 ; GFX1250-GISEL-NEXT:    scratch_load_b64 v[0:1], v4, off
@@ -4873,13 +4695,12 @@ define amdgpu_ps void @flat_cmpxchg_saddr_i64_nortn(ptr inreg %sbase, i32 %voffs
 ; GFX1250-SDAG-LABEL: flat_cmpxchg_saddr_i64_nortn:
 ; GFX1250-SDAG:       ; %bb.0:
 ; GFX1250-SDAG-NEXT:    v_dual_mov_b32 v7, v2 :: v_dual_mov_b32 v6, v1
-; GFX1250-SDAG-NEXT:    v_mov_b32_e32 v1, 0
-; GFX1250-SDAG-NEXT:    s_mov_b32 s0, src_flat_scratch_base_hi
-; GFX1250-SDAG-NEXT:    v_dual_mov_b32 v5, v4 :: v_dual_mov_b32 v4, v3
+; GFX1250-SDAG-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v5, v4
+; GFX1250-SDAG-NEXT:    v_mov_b32_e32 v4, v3
+; GFX1250-SDAG-NEXT:    s_mov_b32 s0, exec_lo
 ; GFX1250-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1250-SDAG-NEXT:    v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1]
-; GFX1250-SDAG-NEXT:    v_xor_b32_e32 v2, s0, v1
-; GFX1250-SDAG-NEXT:    s_mov_b32 s0, exec_lo
+; GFX1250-SDAG-NEXT:    v_xor_b32_e32 v2, src_flat_scratch_base_hi, v1
 ; GFX1250-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1250-SDAG-NEXT:    v_cmpx_lt_u32_e32 0x3ffffff, v2
 ; GFX1250-SDAG-NEXT:    s_xor_b32 s0, exec_lo, s0
@@ -4901,9 +4722,8 @@ define amdgpu_ps void @flat_cmpxchg_saddr_i64_nortn(ptr inreg %sbase, i32 %voffs
 ; GFX1250-SDAG-NEXT:    s_and_not1_saveexec_b32 s0, s0
 ; GFX1250-SDAG-NEXT:    s_cbranch_execz .LBB92_2
 ; GFX1250-SDAG-NEXT:  .LBB92_4: ; %atomicrmw.private
-; GFX1250-SDAG-NEXT:    s_mov_b32 s0, src_flat_scratch_base_lo
 ; GFX1250-SDAG-NEXT:    v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1]
-; GFX1250-SDAG-NEXT:    v_subrev_nc_u32_e32 v2, s0, v0
+; GFX1250-SDAG-NEXT:    v_subrev_nc_u32_e32 v2, src_flat_scratch_base_lo, v0
 ; GFX1250-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1250-SDAG-NEXT:    v_cndmask_b32_e32 v2, -1, v2, vcc_lo
 ; GFX1250-SDAG-NEXT:    scratch_load_b64 v[0:1], v2, off
@@ -4918,13 +4738,12 @@ define amdgpu_ps void @flat_cmpxchg_saddr_i64_nortn(ptr inreg %sbase, i32 %voffs
 ; GFX1250-GISEL-NEXT:    v_dual_mov_b32 v8, v1 :: v_dual_mov_b32 v9, v2
 ; GFX1250-GISEL-NEXT:    v_dual_mov_b32 v6, v3 :: v_dual_mov_b32 v7, v4
 ; GFX1250-GISEL-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
-; GFX1250-GISEL-NEXT:    s_mov_b32 s0, src_flat_scratch_base_hi
+; GFX1250-GISEL-NEXT:    s_mov_b32 s0, exec_lo
 ; GFX1250-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1250-GISEL-NEXT:    v_add_co_u32 v2, vcc_lo, v2, v0
 ; GFX1250-GISEL-NEXT:    v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo
-; GFX1250-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX1250-GISEL-NEXT:    v_xor_b32_e32 v1, s0, v3
-; GFX1250-GISEL-NEXT:    s_mov_b32 s0, exec_lo
+; GFX1250-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-GISEL-NEXT:    v_xor_b32_e32 v1, src_flat_scratch_base_hi, v3
 ; GFX1250-GISEL-NEXT:    v_cmpx_le_u32_e32 0x4000000, v1
 ; GFX1250-GISEL-NEXT:    s_xor_b32 s0, exec_lo, s0
 ; GFX1250-GISEL-NEXT:    s_cbranch_execnz .LBB92_3
@@ -4945,9 +4764,8 @@ define amdgpu_ps void @flat_cmpxchg_saddr_i64_nortn(ptr inreg %sbase, i32 %voffs
 ; GFX1250-GISEL-NEXT:    s_and_not1_saveexec_b32 s0, s0
 ; GFX1250-GISEL-NEXT:    s_cbranch_execz .LBB92_2
 ; GFX1250-GISEL-NEXT:  .LBB92_4: ; %atomicrmw.private
-; GFX1250-GISEL-NEXT:    s_mov_b32 s0, src_flat_scratch_base_lo
 ; GFX1250-GISEL-NEXT:    v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3]
-; GFX1250-GISEL-NEXT:    v_subrev_nc_u32_e32 v0, s0, v2
+; GFX1250-GISEL-NEXT:    v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v2
 ; GFX1250-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1250-GISEL-NEXT:    v_cndmask_b32_e32 v2, -1, v0, vcc_lo
 ; GFX1250-GISEL-NEXT:    scratch_load_b64 v[0:1], v2, off
@@ -4972,11 +4790,9 @@ define amdgpu_ps void @flat_cmpxchg_saddr_i64_nortn_neg128(ptr inreg %sbase, i32
 ; GFX1250-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1250-SDAG-NEXT:    v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1]
 ; GFX1250-SDAG-NEXT:    v_add_nc_u64_e32 v[0:1], s[0:1], v[0:1]
-; GFX1250-SDAG-NEXT:    s_mov_b32 s0, src_flat_scratch_base_hi
-; GFX1250-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
-; GFX1250-SDAG-NEXT:    v_xor_b32_e32 v2, s0, v1
 ; GFX1250-SDAG-NEXT:    s_mov_b32 s0, exec_lo
-; GFX1250-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-SDAG-NEXT:    v_xor_b32_e32 v2, src_flat_scratch_base_hi, v1
 ; GFX1250-SDAG-NEXT:    v_cmpx_lt_u32_e32 0x3ffffff, v2
 ; GFX1250-SDAG-NEXT:    s_xor_b32 s0, exec_lo, s0
 ; GFX1250-SDAG-NEXT:    s_cbranch_execnz .LBB93_3
@@ -4997,9 +4813,8 @@ define amdgpu_ps void @flat_cmpxchg_saddr_i64_nortn_neg128(ptr inreg %sbase, i32
 ; GFX1250-SDAG-NEXT:    s_and_not1_saveexec_b32 s0, s0
 ; GFX1250-SDAG-NEXT:    s_cbranch_execz .LBB93_2
 ; GFX1250-SDAG-NEXT:  .LBB93_4: ; %atomicrmw.private
-; GFX1250-SDAG-NEXT:    s_mov_b32 s0, src_flat_scratch_base_lo
 ; GFX1250-SDAG-NEXT:    v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1]
-; GFX1250-SDAG-NEXT:    v_subrev_nc_u32_e32 v2, s0, v0
+; GFX1250-SDAG-NEXT:    v_subrev_nc_u32_e32 v2, src_flat_scratch_base_lo, v0
 ; GFX1250-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1250-SDAG-NEXT:    v_cndmask_b32_e32 v2, -1, v2, vcc_lo
 ; GFX1250-SDAG-NEXT:    scratch_load_b64 v[0:1], v2, off
@@ -5014,16 +4829,15 @@ define amdgpu_ps void @flat_cmpxchg_saddr_i64_nortn_neg128(ptr inreg %sbase, i32
 ; GFX1250-GISEL-NEXT:    v_dual_mov_b32 v8, v1 :: v_dual_mov_b32 v9, v2
 ; GFX1250-GISEL-NEXT:    v_dual_mov_b32 v6, v3 :: v_dual_mov_b32 v7, v4
 ; GFX1250-GISEL-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
-; GFX1250-GISEL-NEXT:    s_mov_b32 s0, src_flat_scratch_base_hi
+; GFX1250-GISEL-NEXT:    s_mov_b32 s0, exec_lo
 ; GFX1250-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1250-GISEL-NEXT:    v_add_co_u32 v1, vcc_lo, v2, v0
 ; GFX1250-GISEL-NEXT:    v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo
 ; GFX1250-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1250-GISEL-NEXT:    v_add_co_u32 v2, vcc_lo, 0xffffff80, v1
 ; GFX1250-GISEL-NEXT:    v_add_co_ci_u32_e64 v3, null, -1, v3, vcc_lo
-; GFX1250-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX1250-GISEL-NEXT:    v_xor_b32_e32 v1, s0, v3
-; GFX1250-GISEL-NEXT:    s_mov_b32 s0, exec_lo
+; GFX1250-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-GISEL-NEXT:    v_xor_b32_e32 v1, src_flat_scratch_base_hi, v3
 ; GFX1250-GISEL-NEXT:    v_cmpx_le_u32_e32 0x4000000, v1
 ; GFX1250-GISEL-NEXT:    s_xor_b32 s0, exec_lo, s0
 ; GFX1250-GISEL-NEXT:    s_cbranch_execnz .LBB93_3
@@ -5044,9 +4858,8 @@ define amdgpu_ps void @flat_cmpxchg_saddr_i64_nortn_neg128(ptr inreg %sbase, i32
 ; GFX1250-GISEL-NEXT:    s_and_not1_saveexec_b32 s0, s0
 ; GFX1250-GISEL-NEXT:    s_cbranch_execz .LBB93_2
 ; GFX1250-GISEL-NEXT:  .LBB93_4: ; %atomicrmw.private
-; GFX1250-GISEL-NEXT:    s_mov_b32 s0, src_flat_scratch_base_lo
 ; GFX1250-GISEL-NEXT:    v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3]
-; GFX1250-GISEL-NEXT:    v_subrev_nc_u32_e32 v0, s0, v2
+; GFX1250-GISEL-NEXT:    v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v2
 ; GFX1250-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1250-GISEL-NEXT:    v_cndmask_b32_e32 v2, -1, v0, vcc_lo
 ; GFX1250-GISEL-NEXT:    scratch_load_b64 v[0:1], v2, off
@@ -5120,11 +4933,10 @@ define amdgpu_ps <2 x float> @flat_inc_saddr_i64_rtn(ptr inreg %sbase, i32 %voff
 ; GFX1250-SDAG-LABEL: flat_inc_saddr_i64_rtn:
 ; GFX1250-SDAG:       ; %bb.0:
 ; GFX1250-SDAG-NEXT:    v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1
-; GFX1250-SDAG-NEXT:    s_mov_b32 s0, src_flat_scratch_base_hi
 ; GFX1250-SDAG-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX1250-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1250-SDAG-NEXT:    v_add_nc_u64_e32 v[4:5], s[2:3], v[0:1]
-; GFX1250-SDAG-NEXT:    v_xor_b32_e32 v0, s0, v5
+; GFX1250-SDAG-NEXT:    v_xor_b32_e32 v0, src_flat_scratch_base_hi, v5
 ; GFX1250-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
 ; GFX1250-SDAG-NEXT:    v_cmp_lt_u32_e32 vcc_lo, 0x3ffffff, v0
 ; GFX1250-SDAG-NEXT:    ; implicit-def: $vgpr0_vgpr1
@@ -5146,10 +4958,9 @@ define amdgpu_ps <2 x float> @flat_inc_saddr_i64_rtn(ptr inreg %sbase, i32 %voff
 ; GFX1250-SDAG-NEXT:    s_and_not1_saveexec_b32 s0, s0
 ; GFX1250-SDAG-NEXT:    s_cbranch_execz .LBB98_2
 ; GFX1250-SDAG-NEXT:  .LBB98_4: ; %atomicrmw.private
-; GFX1250-SDAG-NEXT:    s_mov_b32 s1, src_flat_scratch_base_lo
 ; GFX1250-SDAG-NEXT:    v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5]
 ; GFX1250-SDAG-NEXT:    s_wait_loadcnt_dscnt 0x0
-; GFX1250-SDAG-NEXT:    v_subrev_nc_u32_e32 v0, s1, v4
+; GFX1250-SDAG-NEXT:    v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v4
 ; GFX1250-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_2)
 ; GFX1250-SDAG-NEXT:    v_cndmask_b32_e32 v6, -1, v0, vcc_lo
 ; GFX1250-SDAG-NEXT:    scratch_load_b64 v[0:1], v6, off
@@ -5167,12 +4978,11 @@ define amdgpu_ps <2 x float> @flat_inc_saddr_i64_rtn(ptr inreg %sbase, i32 %voff
 ; GFX1250-GISEL:       ; %bb.0:
 ; GFX1250-GISEL-NEXT:    v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v4, v1
 ; GFX1250-GISEL-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX1250-GISEL-NEXT:    s_mov_b32 s0, src_flat_scratch_base_hi
 ; GFX1250-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1250-GISEL-NEXT:    v_add_co_u32 v6, vcc_lo, v0, v3
 ; GFX1250-GISEL-NEXT:    v_add_co_ci_u32_e64 v7, null, 0, v1, vcc_lo
 ; GFX1250-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-GISEL-NEXT:    v_dual_mov_b32 v5, v2 :: v_dual_bitop2_b32 v0, s0, v7 bitop3:0x14
+; GFX1250-GISEL-NEXT:    v_dual_mov_b32 v5, v2 :: v_dual_bitop2_b32 v0, src_flat_scratch_base_hi, v7 bitop3:0x14
 ; GFX1250-GISEL-NEXT:    v_cmp_le_u32_e32 vcc_lo, 0x4000000, v0
 ; GFX1250-GISEL-NEXT:    ; implicit-def: $vgpr0_vgpr1
 ; GFX1250-GISEL-NEXT:    s_and_saveexec_b32 s0, vcc_lo
@@ -5194,10 +5004,9 @@ define amdgpu_ps <2 x float> @flat_inc_saddr_i64_rtn(ptr inreg %sbase, i32 %voff
 ; GFX1250-GISEL-NEXT:    s_and_not1_saveexec_b32 s0, s0
 ; GFX1250-GISEL-NEXT:    s_cbranch_execz .LBB98_2
 ; GFX1250-GISEL-NEXT:  .LBB98_4: ; %atomicrmw.private
-; GFX1250-GISEL-NEXT:    s_mov_b32 s1, src_flat_scratch_base_lo
 ; GFX1250-GISEL-NEXT:    v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7]
 ; GFX1250-GISEL-NEXT:    s_wait_loadcnt_dscnt 0x0
-; GFX1250-GISEL-NEXT:    v_subrev_nc_u32_e32 v0, s1, v6
+; GFX1250-GISEL-NEXT:    v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v6
 ; GFX1250-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_2)
 ; GFX1250-GISEL-NEXT:    v_cndmask_b32_e32 v6, -1, v0, vcc_lo
 ; GFX1250-GISEL-NEXT:    scratch_load_b64 v[0:1], v6, off
@@ -5228,13 +5037,12 @@ define amdgpu_ps <2 x float> @flat_inc_saddr_i64_rtn_neg128(ptr inreg %sbase, i3
 ; GFX1250-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1250-SDAG-NEXT:    v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1]
 ; GFX1250-SDAG-NEXT:    v_add_nc_u64_e32 v[4:5], s[0:1], v[0:1]
-; GFX1250-SDAG-NEXT:    s_mov_b32 s0, src_flat_scratch_base_hi
-; GFX1250-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
-; GFX1250-SDAG-NEXT:    v_xor_b32_e32 v0, s0, v5
-; GFX1250-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1250-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-SDAG-NEXT:    v_xor_b32_e32 v0, src_flat_scratch_base_hi, v5
 ; GFX1250-SDAG-NEXT:    v_cmp_lt_u32_e32 vcc_lo, 0x3ffffff, v0
 ; GFX1250-SDAG-NEXT:    ; implicit-def: $vgpr0_vgpr1
 ; GFX1250-SDAG-NEXT:    s_and_saveexec_b32 s0, vcc_lo
+; GFX1250-SDAG-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX1250-SDAG-NEXT:    s_xor_b32 s0, exec_lo, s0
 ; GFX1250-SDAG-NEXT:    s_cbranch_execnz .LBB99_3
 ; GFX1250-SDAG-NEXT:  ; %bb.1: ; %Flow
@@ -5252,10 +5060,9 @@ define amdgpu_ps <2 x float> @flat_inc_saddr_i64_rtn_neg128(ptr inreg %sbase, i3
 ; GFX1250-SDAG-NEXT:    s_and_not1_saveexec_b32 s0, s0
 ; GFX1250-SDAG-NEXT:    s_cbranch_execz .LBB99_2
 ; GFX1250-SDAG-NEXT:  .LBB99_4: ; %atomicrmw.private
-; GFX1250-SDAG-NEXT:    s_mov_b32 s1, src_flat_scratch_base_lo
 ; GFX1250-SDAG-NEXT:    v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5]
 ; GFX1250-SDAG-NEXT:    s_wait_loadcnt_dscnt 0x0
-; GFX1250-SDAG-NEXT:    v_subrev_nc_u32_e32 v0, s1, v4
+; GFX1250-SDAG-NEXT:    v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v4
 ; GFX1250-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_2)
 ; GFX1250-SDAG-NEXT:    v_cndmask_b32_e32 v6, -1, v0, vcc_lo
 ; GFX1250-SDAG-NEXT:    scratch_load_b64 v[0:1], v6, off
@@ -5273,7 +5080,6 @@ define amdgpu_ps <2 x float> @flat_inc_saddr_i64_rtn_neg128(ptr inreg %sbase, i3
 ; GFX1250-GISEL:       ; %bb.0:
 ; GFX1250-GISEL-NEXT:    v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v4, v1
 ; GFX1250-GISEL-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX1250-GISEL-NEXT:    s_mov_b32 s0, src_flat_scratch_base_hi
 ; GFX1250-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1250-GISEL-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v3
 ; GFX1250-GISEL-NEXT:    v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
@@ -5281,7 +5087,7 @@ define amdgpu_ps <2 x float> @flat_inc_saddr_i64_rtn_neg128(ptr inreg %sbase, i3
 ; GFX1250-GISEL-NEXT:    v_add_co_u32 v6, vcc_lo, 0xffffff80, v0
 ; GFX1250-GISEL-NEXT:    v_add_co_ci_u32_e64 v7, null, -1, v1, vcc_lo
 ; GFX1250-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-GISEL-NEXT:    v_dual_mov_b32 v5, v2 :: v_dual_bitop2_b32 v0, s0, v7 bitop3:0x14
+; GFX1250-GISEL-NEXT:    v_dual_mov_b32 v5, v2 :: v_dual_bitop2_b32 v0, src_flat_scratch_base_hi, v7 bitop3:0x14
 ; GFX1250-GISEL-NEXT:    v_cmp_le_u32_e32 vcc_lo, 0x4000000, v0
 ; GFX1250-GISEL-NEXT:    ; implicit-def: $vgpr0_vgpr1
 ; GFX1250-GISEL-NEXT:    s_and_saveexec_b32 s0, vcc_lo
@@ -5303,10 +5109,9 @@ define amdgpu_ps <2 x float> @flat_inc_saddr_i64_rtn_neg128(ptr inreg %sbase, i3
 ; GFX1250-GISEL-NEXT:    s_and_not1_saveexec_b32 s0, s0
 ; GFX1250-GISEL-NEXT:    s_cbranch_execz .LBB99_2
 ; GFX1250-GISEL-NEXT:  .LBB99_4: ; %atomicrmw.private
-; GFX1250-GISEL-NEXT:    s_mov_b32 s1, src_flat_scratch_base_lo
 ; GFX1250-GISEL-NEXT:    v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7]
 ; GFX1250-GISEL-NEXT:    s_wait_loadcnt_dscnt 0x0
-; GFX1250-GISEL-NEXT:    v_subrev_nc_u32_e32 v0, s1, v6
+; GFX1250-GISEL-NEXT:    v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v6
 ; GFX1250-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_2)
 ; GFX1250-GISEL-NEXT:    v_cndmask_b32_e32 v6, -1, v0, vcc_lo
 ; GFX1250-GISEL-NEXT:    scratch_load_b64 v[0:1], v6, off
@@ -5334,11 +5139,10 @@ define amdgpu_ps void @flat_inc_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset,
 ; GFX1250-SDAG:       ; %bb.0:
 ; GFX1250-SDAG-NEXT:    v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1
 ; GFX1250-SDAG-NEXT:    v_mov_b32_e32 v1, 0
-; GFX1250-SDAG-NEXT:    s_mov_b32 s0, src_flat_scratch_base_hi
+; GFX1250-SDAG-NEXT:    s_mov_b32 s0, exec_lo
 ; GFX1250-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1250-SDAG-NEXT:    v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1]
-; GFX1250-SDAG-NEXT:    v_xor_b32_e32 v4, s0, v1
-; GFX1250-SDAG-NEXT:    s_mov_b32 s0, exec_lo
+; GFX1250-SDAG-NEXT:    v_xor_b32_e32 v4, src_flat_scratch_base_hi, v1
 ; GFX1250-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1250-SDAG-NEXT:    v_cmpx_lt_u32_e32 0x3ffffff, v4
 ; GFX1250-SDAG-NEXT:    s_xor_b32 s0, exec_lo, s0
@@ -5356,9 +5160,8 @@ define amdgpu_ps void @flat_inc_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset,
 ; GFX1250-SDAG-NEXT:    s_and_not1_saveexec_b32 s0, s0
 ; GFX1250-SDAG-NEXT:    s_cbranch_execz .LBB100_2
 ; GFX1250-SDAG-NEXT:  .LBB100_4: ; %atomicrmw.private
-; GFX1250-SDAG-NEXT:    s_mov_b32 s0, src_flat_scratch_base_lo
 ; GFX1250-SDAG-NEXT:    v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1]
-; GFX1250-SDAG-NEXT:    v_subrev_nc_u32_e32 v4, s0, v0
+; GFX1250-SDAG-NEXT:    v_subrev_nc_u32_e32 v4, src_flat_scratch_base_lo, v0
 ; GFX1250-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_2)
 ; GFX1250-SDAG-NEXT:    v_cndmask_b32_e32 v6, -1, v4, vcc_lo
 ; GFX1250-SDAG-NEXT:    scratch_load_b64 v[0:1], v6, off
@@ -5373,13 +5176,12 @@ define amdgpu_ps void @flat_inc_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset,
 ; GFX1250-GISEL:       ; %bb.0:
 ; GFX1250-GISEL-NEXT:    v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2
 ; GFX1250-GISEL-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
-; GFX1250-GISEL-NEXT:    s_mov_b32 s0, src_flat_scratch_base_hi
+; GFX1250-GISEL-NEXT:    s_mov_b32 s0, exec_lo
 ; GFX1250-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1250-GISEL-NEXT:    v_add_co_u32 v2, vcc_lo, v2, v0
 ; GFX1250-GISEL-NEXT:    v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo
-; GFX1250-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX1250-GISEL-NEXT:    v_xor_b32_e32 v1, s0, v3
-; GFX1250-GISEL-NEXT:    s_mov_b32 s0, exec_lo
+; GFX1250-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-GISEL-NEXT:    v_xor_b32_e32 v1, src_flat_scratch_base_hi, v3
 ; GFX1250-GISEL-NEXT:    v_cmpx_le_u32_e32 0x4000000, v1
 ; GFX1250-GISEL-NEXT:    s_xor_b32 s0, exec_lo, s0
 ; GFX1250-GISEL-NEXT:    s_cbranch_execnz .LBB100_3
@@ -5396,9 +5198,8 @@ define amdgpu_ps void @flat_inc_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset,
 ; GFX1250-GISEL-NEXT:    s_and_not1_saveexec_b32 s0, s0
 ; GFX1250-GISEL-NEXT:    s_cbranch_execz .LBB100_2
 ; GFX1250-GISEL-NEXT:  .LBB100_4: ; %atomicrmw.private
-; GFX1250-GISEL-NEXT:    s_mov_b32 s0, src_flat_scratch_base_lo
 ; GFX1250-GISEL-NEXT:    v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3]
-; GFX1250-GISEL-NEXT:    v_subrev_nc_u32_e32 v0, s0, v2
+; GFX1250-GISEL-NEXT:    v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v2
 ; GFX1250-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_2)
 ; GFX1250-GISEL-NEXT:    v_cndmask_b32_e32 v6, -1, v0, vcc_lo
 ; GFX1250-GISEL-NEXT:    scratch_load_b64 v[0:1], v6, off
@@ -5425,11 +5226,9 @@ define amdgpu_ps void @flat_inc_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo
 ; GFX1250-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1250-SDAG-NEXT:    v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1]
 ; GFX1250-SDAG-NEXT:    v_add_nc_u64_e32 v[0:1], s[0:1], v[0:1]
-; GFX1250-SDAG-NEXT:    s_mov_b32 s0, src_flat_scratch_base_hi
-; GFX1250-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
-; GFX1250-SDAG-NEXT:    v_xor_b32_e32 v4, s0, v1
 ; GFX1250-SDAG-NEXT:    s_mov_b32 s0, exec_lo
-; GFX1250-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-SDAG-NEXT:    v_xor_b32_e32 v4, src_flat_scratch_base_hi, v1
 ; GFX1250-SDAG-NEXT:    v_cmpx_lt_u32_e32 0x3ffffff, v4
 ; GFX1250-SDAG-NEXT:    s_xor_b32 s0, exec_lo, s0
 ; GFX1250-SDAG-NEXT:    s_cbranch_execnz .LBB101_3
@@ -5446,9 +5245,8 @@ define amdgpu_ps void @flat_inc_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo
 ; GFX1250-SDAG-NEXT:    s_and_not1_saveexec_b32 s0, s0
 ; GFX1250-SDAG-NEXT:    s_cbranch_execz .LBB101_2
 ; GFX1250-SDAG-NEXT:  .LBB101_4: ; %atomicrmw.private
-; GFX1250-SDAG-NEXT:    s_mov_b32 s0, src_flat_scratch_base_lo
 ; GFX1250-SDAG-NEXT:    v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1]
-; GFX1250-SDAG-NEXT:    v_subrev_nc_u32_e32 v4, s0, v0
+; GFX1250-SDAG-NEXT:    v_subrev_nc_u32_e32 v4, src_flat_scratch_base_lo, v0
 ; GFX1250-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_2)
 ; GFX1250-SDAG-NEXT:    v_cndmask_b32_e32 v6, -1, v4, vcc_lo
 ; GFX1250-SDAG-NEXT:    scratch_load_b64 v[0:1], v6, off
@@ -5463,16 +5261,15 @@ define amdgpu_ps void @flat_inc_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo
 ; GFX1250-GISEL:       ; %bb.0:
 ; GFX1250-GISEL-NEXT:    v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2
 ; GFX1250-GISEL-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
-; GFX1250-GISEL-NEXT:    s_mov_b32 s0, src_flat_scratch_base_hi
+; GFX1250-GISEL-NEXT:    s_mov_b32 s0, exec_lo
 ; GFX1250-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1250-GISEL-NEXT:    v_add_co_u32 v1, vcc_lo, v2, v0
 ; GFX1250-GISEL-NEXT:    v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo
 ; GFX1250-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1250-GISEL-NEXT:    v_add_co_u32 v2, vcc_lo, 0xffffff80, v1
 ; GFX1250-GISEL-NEXT:    v_add_co_ci_u32_e64 v3, null, -1, v3, vcc_lo
-; GFX1250-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX1250-GISEL-NEXT:    v_xor_b32_e32 v1, s0, v3
-; GFX1250-GISEL-NEXT:    s_mov_b32 s0, exec_lo
+; GFX1250-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-GISEL-NEXT:    v_xor_b32_e32 v1, src_flat_scratch_base_hi, v3
 ; GFX1250-GISEL-NEXT:    v_cmpx_le_u32_e32 0x4000000, v1
 ; GFX1250-GISEL-NEXT:    s_xor_b32 s0, exec_lo, s0
 ; GFX1250-GISEL-NEXT:    s_cbranch_execnz .LBB101_3
@@ -5489,9 +5286,8 @@ define amdgpu_ps void @flat_inc_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo
 ; GFX1250-GISEL-NEXT:    s_and_not1_saveexec_b32 s0, s0
 ; GFX1250-GISEL-NEXT:    s_cbranch_execz .LBB101_2
 ; GFX1250-GISEL-NEXT:  .LBB101_4: ; %atomicrmw.private
-; GFX1250-GISEL-NEXT:    s_mov_b32 s0, src_flat_scratch_base_lo
 ; GFX1250-GISEL-NEXT:    v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3]
-; GFX1250-GISEL-NEXT:    v_subrev_nc_u32_e32 v0, s0, v2
+; GFX1250-GISEL-NEXT:    v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v2
 ; GFX1250-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_2)
 ; GFX1250-GISEL-NEXT:    v_cndmask_b32_e32 v6, -1, v0, vcc_lo
 ; GFX1250-GISEL-NEXT:    scratch_load_b64 v[0:1], v6, off
@@ -5569,11 +5365,10 @@ define amdgpu_ps <2 x float> @flat_dec_saddr_i64_rtn(ptr inreg %sbase, i32 %voff
 ; GFX1250-SDAG-LABEL: flat_dec_saddr_i64_rtn:
 ; GFX1250-SDAG:       ; %bb.0:
 ; GFX1250-SDAG-NEXT:    v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1
-; GFX1250-SDAG-NEXT:    s_mov_b32 s0, src_flat_scratch_base_hi
 ; GFX1250-SDAG-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX1250-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1250-SDAG-NEXT:    v_add_nc_u64_e32 v[4:5], s[2:3], v[0:1]
-; GFX1250-SDAG-NEXT:    v_xor_b32_e32 v0, s0, v5
+; GFX1250-SDAG-NEXT:    v_xor_b32_e32 v0, src_flat_scratch_base_hi, v5
 ; GFX1250-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
 ; GFX1250-SDAG-NEXT:    v_cmp_lt_u32_e32 vcc_lo, 0x3ffffff, v0
 ; GFX1250-SDAG-NEXT:    ; implicit-def: $vgpr0_vgpr1
@@ -5595,10 +5390,9 @@ define amdgpu_ps <2 x float> @flat_dec_saddr_i64_rtn(ptr inreg %sbase, i32 %voff
 ; GFX1250-SDAG-NEXT:    s_and_not1_saveexec_b32 s1, s0
 ; GFX1250-SDAG-NEXT:    s_cbranch_execz .LBB106_2
 ; GFX1250-SDAG-NEXT:  .LBB106_4: ; %atomicrmw.private
-; GFX1250-SDAG-NEXT:    s_mov_b32 s0, src_flat_scratch_base_lo
 ; GFX1250-SDAG-NEXT:    v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5]
 ; GFX1250-SDAG-NEXT:    s_wait_loadcnt_dscnt 0x0
-; GFX1250-SDAG-NEXT:    v_subrev_nc_u32_e32 v0, s0, v4
+; GFX1250-SDAG-NEXT:    v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v4
 ; GFX1250-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1250-SDAG-NEXT:    v_cndmask_b32_e32 v6, -1, v0, vcc_lo
 ; GFX1250-SDAG-NEXT:    scratch_load_b64 v[0:1], v6, off
@@ -5619,12 +5413,11 @@ define amdgpu_ps <2 x float> @flat_dec_saddr_i64_rtn(ptr inreg %sbase, i32 %voff
 ; GFX1250-GISEL:       ; %bb.0:
 ; GFX1250-GISEL-NEXT:    v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v4, v1
 ; GFX1250-GISEL-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX1250-GISEL-NEXT:    s_mov_b32 s0, src_flat_scratch_base_hi
 ; GFX1250-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1250-GISEL-NEXT:    v_add_co_u32 v6, vcc_lo, v0, v3
 ; GFX1250-GISEL-NEXT:    v_add_co_ci_u32_e64 v7, null, 0, v1, vcc_lo
 ; GFX1250-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-GISEL-NEXT:    v_dual_mov_b32 v5, v2 :: v_dual_bitop2_b32 v0, s0, v7 bitop3:0x14
+; GFX1250-GISEL-NEXT:    v_dual_mov_b32 v5, v2 :: v_dual_bitop2_b32 v0, src_flat_scratch_base_hi, v7 bitop3:0x14
 ; GFX1250-GISEL-NEXT:    v_cmp_le_u32_e32 vcc_lo, 0x4000000, v0
 ; GFX1250-GISEL-NEXT:    ; implicit-def: $vgpr0_vgpr1
 ; GFX1250-GISEL-NEXT:    s_and_saveexec_b32 s0, vcc_lo
@@ -5646,10 +5439,9 @@ define amdgpu_ps <2 x float> @flat_dec_saddr_i64_rtn(ptr inreg %sbase, i32 %voff
 ; GFX1250-GISEL-NEXT:    s_and_not1_saveexec_b32 s1, s0
 ; GFX1250-GISEL-NEXT:    s_cbranch_execz .LBB106_2
 ; GFX1250-GISEL-NEXT:  .LBB106_4: ; %atomicrmw.private
-; GFX1250-GISEL-NEXT:    s_mov_b32 s0, src_flat_scratch_base_lo
 ; GFX1250-GISEL-NEXT:    v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7]
 ; GFX1250-GISEL-NEXT:    s_wait_loadcnt_dscnt 0x0
-; GFX1250-GISEL-NEXT:    v_subrev_nc_u32_e32 v0, s0, v6
+; GFX1250-GISEL-NEXT:    v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v6
 ; GFX1250-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1250-GISEL-NEXT:    v_cndmask_b32_e32 v6, -1, v0, vcc_lo
 ; GFX1250-GISEL-NEXT:    scratch_load_b64 v[0:1], v6, off
@@ -5681,13 +5473,12 @@ define amdgpu_ps <2 x float> @flat_dec_saddr_i64_rtn_neg128(ptr inreg %sbase, i3
 ; GFX1250-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1250-SDAG-NEXT:    v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1]
 ; GFX1250-SDAG-NEXT:    v_add_nc_u64_e32 v[4:5], s[0:1], v[0:1]
-; GFX1250-SDAG-NEXT:    s_mov_b32 s0, src_flat_scratch_base_hi
-; GFX1250-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
-; GFX1250-SDAG-NEXT:    v_xor_b32_e32 v0, s0, v5
-; GFX1250-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1250-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-SDAG-NEXT:    v_xor_b32_e32 v0, src_flat_scratch_base_hi, v5
 ; GFX1250-SDAG-NEXT:    v_cmp_lt_u32_e32 vcc_lo, 0x3ffffff, v0
 ; GFX1250-SDAG-NEXT:    ; implicit-def: $vgpr0_vgpr1
 ; GFX1250-SDAG-NEXT:    s_and_saveexec_b32 s0, vcc_lo
+; GFX1250-SDAG-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX1250-SDAG-NEXT:    s_xor_b32 s0, exec_lo, s0
 ; GFX1250-SDAG-NEXT:    s_cbranch_execnz .LBB107_3
 ; GFX1250-SDAG-NEXT:  ; %bb.1: ; %Flow
@@ -5705,10 +5496,9 @@ define amdgpu_ps <2 x float> @flat_dec_saddr_i64_rtn_neg128(ptr inreg %sbase, i3
 ; GFX1250-SDAG-NEXT:    s_and_not1_saveexec_b32 s1, s0
 ; GFX1250-SDAG-NEXT:    s_cbranch_execz .LBB107_2
 ; GFX1250-SDAG-NEXT:  .LBB107_4: ; %atomicrmw.private
-; GFX1250-SDAG-NEXT:    s_mov_b32 s0, src_flat_scratch_base_lo
 ; GFX1250-SDAG-NEXT:    v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5]
 ; GFX1250-SDAG-NEXT:    s_wait_loadcnt_dscnt 0x0
-; GFX1250-SDAG-NEXT:    v_subrev_nc_u32_e32 v0, s0, v4
+; GFX1250-SDAG-NEXT:    v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v4
 ; GFX1250-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1250-SDAG-NEXT:    v_cndmask_b32_e32 v6, -1, v0, vcc_lo
 ; GFX1250-SDAG-NEXT:    scratch_load_b64 v[0:1], v6, off
@@ -5729,7 +5519,6 @@ define amdgpu_ps <2 x float> @flat_dec_saddr_i64_rtn_neg128(ptr inreg %sbase, i3
 ; GFX1250-GISEL:       ; %bb.0:
 ; GFX1250-GISEL-NEXT:    v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v4, v1
 ; GFX1250-GISEL-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX1250-GISEL-NEXT:    s_mov_b32 s0, src_flat_scratch_base_hi
 ; GFX1250-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1250-GISEL-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v3
 ; GFX1250-GISEL-NEXT:    v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
@@ -5737,7 +5526,7 @@ define amdgpu_ps <2 x float> @flat_dec_saddr_i64_rtn_neg128(ptr inreg %sbase, i3
 ; GFX1250-GISEL-NEXT:    v_add_co_u32 v6, vcc_lo, 0xffffff80, v0
 ; GFX1250-GISEL-NEXT:    v_add_co_ci_u32_e64 v7, null, -1, v1, vcc_lo
 ; GFX1250-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-GISEL-NEXT:    v_dual_mov_b32 v5, v2 :: v_dual_bitop2_b32 v0, s0, v7 bitop3:0x14
+; GFX1250-GISEL-NEXT:    v_dual_mov_b32 v5, v2 :: v_dual_bitop2_b32 v0, src_flat_scratch_base_hi, v7 bitop3:0x14
 ; GFX1250-GISEL-NEXT:    v_cmp_le_u32_e32 vcc_lo, 0x4000000, v0
 ; GFX1250-GISEL-NEXT:    ; implicit-def: $vgpr0_vgpr1
 ; GFX1250-GISEL-NEXT:    s_and_saveexec_b32 s0, vcc_lo
@@ -5759,10 +5548,9 @@ define amdgpu_ps <2 x float> @flat_dec_saddr_i64_rtn_neg128(ptr inreg %sbase, i3
 ; GFX1250-GISEL-NEXT:    s_and_not1_saveexec_b32 s1, s0
 ; GFX1250-GISEL-NEXT:    s_cbranch_execz .LBB107_2
 ; GFX1250-GISEL-NEXT:  .LBB107_4: ; %atomicrmw.private
-; GFX1250-GISEL-NEXT:    s_mov_b32 s0, src_flat_scratch_base_lo
 ; GFX1250-GISEL-NEXT:    v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7]
 ; GFX1250-GISEL-NEXT:    s_wait_loadcnt_dscnt 0x0
-; GFX1250-GISEL-NEXT:    v_subrev_nc_u32_e32 v0, s0, v6
+; GFX1250-GISEL-NEXT:    v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v6
 ; GFX1250-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1250-GISEL-NEXT:    v_cndmask_b32_e32 v6, -1, v0, vcc_lo
 ; GFX1250-GISEL-NEXT:    scratch_load_b64 v[0:1], v6, off
@@ -5791,11 +5579,10 @@ define amdgpu_ps void @flat_dec_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset,
 ; GFX1250-SDAG:       ; %bb.0:
 ; GFX1250-SDAG-NEXT:    v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1
 ; GFX1250-SDAG-NEXT:    v_mov_b32_e32 v1, 0
-; GFX1250-SDAG-NEXT:    s_mov_b32 s0, src_flat_scratch_base_hi
+; GFX1250-SDAG-NEXT:    s_mov_b32 s0, exec_lo
 ; GFX1250-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1250-SDAG-NEXT:    v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1]
-; GFX1250-SDAG-NEXT:    v_xor_b32_e32 v4, s0, v1
-; GFX1250-SDAG-NEXT:    s_mov_b32 s0, exec_lo
+; GFX1250-SDAG-NEXT:    v_xor_b32_e32 v4, src_flat_scratch_base_hi, v1
 ; GFX1250-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1250-SDAG-NEXT:    v_cmpx_lt_u32_e32 0x3ffffff, v4
 ; GFX1250-SDAG-NEXT:    s_xor_b32 s0, exec_lo, s0
@@ -5813,9 +5600,8 @@ define amdgpu_ps void @flat_dec_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset,
 ; GFX1250-SDAG-NEXT:    s_and_not1_saveexec_b32 s0, s0
 ; GFX1250-SDAG-NEXT:    s_cbranch_execz .LBB108_2
 ; GFX1250-SDAG-NEXT:  .LBB108_4: ; %atomicrmw.private
-; GFX1250-SDAG-NEXT:    s_mov_b32 s0, src_flat_scratch_base_lo
 ; GFX1250-SDAG-NEXT:    v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1]
-; GFX1250-SDAG-NEXT:    v_subrev_nc_u32_e32 v4, s0, v0
+; GFX1250-SDAG-NEXT:    v_subrev_nc_u32_e32 v4, src_flat_scratch_base_lo, v0
 ; GFX1250-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1250-SDAG-NEXT:    v_cndmask_b32_e32 v4, -1, v4, vcc_lo
 ; GFX1250-SDAG-NEXT:    scratch_load_b64 v[0:1], v4, off
@@ -5833,13 +5619,12 @@ define amdgpu_ps void @flat_dec_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset,
 ; GFX1250-GISEL:       ; %bb.0:
 ; GFX1250-GISEL-NEXT:    v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2
 ; GFX1250-GISEL-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
-; GFX1250-GISEL-NEXT:    s_mov_b32 s0, src_flat_scratch_base_hi
+; GFX1250-GISEL-NEXT:    s_mov_b32 s0, exec_lo
 ; GFX1250-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1250-GISEL-NEXT:    v_add_co_u32 v2, vcc_lo, v2, v0
 ; GFX1250-GISEL-NEXT:    v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo
-; GFX1250-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX1250-GISEL-NEXT:    v_xor_b32_e32 v1, s0, v3
-; GFX1250-GISEL-NEXT:    s_mov_b32 s0, exec_lo
+; GFX1250-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-GISEL-NEXT:    v_xor_b32_e32 v1, src_flat_scratch_base_hi, v3
 ; GFX1250-GISEL-NEXT:    v_cmpx_le_u32_e32 0x4000000, v1
 ; GFX1250-GISEL-NEXT:    s_xor_b32 s0, exec_lo, s0
 ; GFX1250-GISEL-NEXT:    s_cbranch_execnz .LBB108_3
@@ -5856,9 +5641,8 @@ define amdgpu_ps void @flat_dec_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset,
 ; GFX1250-GISEL-NEXT:    s_and_not1_saveexec_b32 s0, s0
 ; GFX1250-GISEL-NEXT:    s_cbranch_execz .LBB108_2
 ; GFX1250-GISEL-NEXT:  .LBB108_4: ; %atomicrmw.private
-; GFX1250-GISEL-NEXT:    s_mov_b32 s0, src_flat_scratch_base_lo
 ; GFX1250-GISEL-NEXT:    v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3]
-; GFX1250-GISEL-NEXT:    v_subrev_nc_u32_e32 v0, s0, v2
+; GFX1250-GISEL-NEXT:    v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v2
 ; GFX1250-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1250-GISEL-NEXT:    v_cndmask_b32_e32 v2, -1, v0, vcc_lo
 ; GFX1250-GISEL-NEXT:    scratch_load_b64 v[0:1], v2, off
@@ -5886,11 +5670,9 @@ define amdgpu_ps void @flat_dec_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo
 ; GFX1250-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1250-SDAG-NEXT:    v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1]
 ; GFX1250-SDAG-NEXT:    v_add_nc_u64_e32 v[0:1], s[0:1], v[0:1]
-; GFX1250-SDAG-NEXT:    s_mov_b32 s0, src_flat_scratch_base_hi
-; GFX1250-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
-; GFX1250-SDAG-NEXT:    v_xor_b32_e32 v4, s0, v1
 ; GFX1250-SDAG-NEXT:    s_mov_b32 s0, exec_lo
-; GFX1250-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-SDAG-NEXT:    v_xor_b32_e32 v4, src_flat_scratch_base_hi, v1
 ; GFX1250-SDAG-NEXT:    v_cmpx_lt_u32_e32 0x3ffffff, v4
 ; GFX1250-SDAG-NEXT:    s_xor_b32 s0, exec_lo, s0
 ; GFX1250-SDAG-NEXT:    s_cbranch_execnz .LBB109_3
@@ -5907,9 +5689,8 @@ define amdgpu_ps void @flat_dec_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo
 ; GFX1250-SDAG-NEXT:    s_and_not1_saveexec_b32 s0, s0
 ; GFX1250-SDAG-NEXT:    s_cbranch_execz .LBB109_2
 ; GFX1250-SDAG-NEXT:  .LBB109_4: ; %atomicrmw.private
-; GFX1250-SDAG-NEXT:    s_mov_b32 s0, src_flat_scratch_base_lo
 ; GFX1250-SDAG-NEXT:    v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1]
-; GFX1250-SDAG-NEXT:    v_subrev_nc_u32_e32 v4, s0, v0
+; GFX1250-SDAG-NEXT:    v_subrev_nc_u32_e32 v4, src_flat_scratch_base_lo, v0
 ; GFX1250-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1250-SDAG-NEXT:    v_cndmask_b32_e32 v4, -1, v4, vcc_lo
 ; GFX1250-SDAG-NEXT:    scratch_load_b64 v[0:1], v4, off
@@ -5927,16 +5708,15 @@ define amdgpu_ps void @flat_dec_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo
 ; GFX1250-GISEL:       ; %bb.0:
 ; GFX1250-GISEL-NEXT:    v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2
 ; GFX1250-GISEL-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
-; GFX1250-GISEL-NEXT:    s_mov_b32 s0, src_flat_scratch_base_hi
+; GFX1250-GISEL-NEXT:    s_mov_b32 s0, exec_lo
 ; GFX1250-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1250-GISEL-NEXT:    v_add_co_u32 v1, vcc_lo, v2, v0
 ; GFX1250-GISEL-NEXT:    v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo
 ; GFX1250-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1250-GISEL-NEXT:    v_add_co_u32 v2, vcc_lo, 0xffffff80, v1
 ; GFX1250-GISEL-NEXT:    v_add_co_ci_u32_e64 v3, null, -1, v3, vcc_lo
-; GFX1250-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX1250-GISEL-NEXT:    v_xor_b32_e32 v1, s0, v3
-; GFX1250-GISEL-NEXT:    s_mov_b32 s0, exec_lo
+; GFX1250-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-GISEL-NEXT:    v_xor_b32_e32 v1, src_flat_scratch_base_hi, v3
 ; GFX1250-GISEL-NEXT:    v_cmpx_le_u32_e32 0x4000000, v1
 ; GFX1250-GISEL-NEXT:    s_xor_b32 s0, exec_lo, s0
 ; GFX1250-GISEL-NEXT:    s_cbranch_execnz .LBB109_3
@@ -5953,9 +5733,8 @@ define amdgpu_ps void @flat_dec_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo
 ; GFX1250-GISEL-NEXT:    s_and_not1_saveexec_b32 s0, s0
 ; GFX1250-GISEL-NEXT:    s_cbranch_execz .LBB109_2
 ; GFX1250-GISEL-NEXT:  .LBB109_4: ; %atomicrmw.private
-; GFX1250-GISEL-NEXT:    s_mov_b32 s0, src_flat_scratch_base_lo
 ; GFX1250-GISEL-NEXT:    v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3]
-; GFX1250-GISEL-NEXT:    v_subrev_nc_u32_e32 v0, s0, v2
+; GFX1250-GISEL-NEXT:    v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v2
 ; GFX1250-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1250-GISEL-NEXT:    v_cndmask_b32_e32 v2, -1, v0, vcc_lo
 ; GFX1250-GISEL-NEXT:    scratch_load_b64 v[0:1], v2, off

diff  --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.is.private.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.is.private.ll
index 9e1815b48abfd..f4e88b4b564eb 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.is.private.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.is.private.ll
@@ -2,10 +2,12 @@
 ; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=tahiti < %s | FileCheck -check-prefixes=SI,SI-SDAG %s
 ; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=hawaii < %s | FileCheck -check-prefixes=CI,CI-SDAG %s
 ; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9,GFX9-SDAG %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 < %s | FileCheck -check-prefixes=GFX1250,GFX1250-SDAG %s
 ; RUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=hawaii < %s | FileCheck -check-prefixes=CI,CI-GISEL %s
 ; RUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9,GFX9-GISEL %s
 ; RUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 < %s | FileCheck -check-prefixes=GFX10,GFX10-GISEL %s
 ; RUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GFX11,GFX11-GISEL %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 < %s | FileCheck -check-prefixes=GFX1250,GFX1250-GISEL %s
 
 define amdgpu_kernel void @is_private_vgpr(ptr addrspace(1) %ptr.ptr) {
 ; SI-LABEL: is_private_vgpr:
@@ -57,6 +59,21 @@ define amdgpu_kernel void @is_private_vgpr(ptr addrspace(1) %ptr.ptr) {
 ; GFX9-NEXT:    global_store_dword v[0:1], v0, off
 ; GFX9-NEXT:    s_endpgm
 ;
+; GFX1250-LABEL: is_private_vgpr:
+; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    global_load_b64 v[0:1], v0, s[0:1] scale_offset scope:SCOPE_SYS
+; GFX1250-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-NEXT:    s_wait_xcnt 0x0
+; GFX1250-NEXT:    v_xor_b32_e32 v0, src_flat_scratch_base_hi, v1
+; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-NEXT:    v_cmp_gt_u32_e32 vcc_lo, 0x4000000, v0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX1250-NEXT:    global_store_b32 v[0:1], v0, off
+; GFX1250-NEXT:    s_endpgm
+;
 ; CI-GISEL-LABEL: is_private_vgpr:
 ; CI-GISEL:       ; %bb.0:
 ; CI-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
@@ -170,6 +187,23 @@ define amdgpu_kernel void @is_private_sgpr(ptr %ptr) {
 ; GFX9-SDAG-NEXT:  .LBB1_2: ; %bb1
 ; GFX9-SDAG-NEXT:    s_endpgm
 ;
+; GFX1250-SDAG-LABEL: is_private_sgpr:
+; GFX1250-SDAG:       ; %bb.0:
+; GFX1250-SDAG-NEXT:    s_load_b32 s0, s[4:5], 0x4
+; GFX1250-SDAG-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-SDAG-NEXT:    s_xor_b32 s0, s0, src_flat_scratch_base_hi
+; GFX1250-SDAG-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1250-SDAG-NEXT:    s_cmp_lt_u32 s0, 0x4000000
+; GFX1250-SDAG-NEXT:    s_cselect_b32 s0, -1, 0
+; GFX1250-SDAG-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s0
+; GFX1250-SDAG-NEXT:    s_cbranch_vccnz .LBB1_2
+; GFX1250-SDAG-NEXT:  ; %bb.1: ; %bb0
+; GFX1250-SDAG-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1250-SDAG-NEXT:    global_store_b32 v[0:1], v0, off scope:SCOPE_SYS
+; GFX1250-SDAG-NEXT:    s_wait_storecnt 0x0
+; GFX1250-SDAG-NEXT:  .LBB1_2: ; %bb1
+; GFX1250-SDAG-NEXT:    s_endpgm
+;
 ; CI-GISEL-LABEL: is_private_sgpr:
 ; CI-GISEL:       ; %bb.0:
 ; CI-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
@@ -229,6 +263,21 @@ define amdgpu_kernel void @is_private_sgpr(ptr %ptr) {
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-NEXT:  .LBB1_2: ; %bb1
 ; GFX11-NEXT:    s_endpgm
+;
+; GFX1250-GISEL-LABEL: is_private_sgpr:
+; GFX1250-GISEL:       ; %bb.0:
+; GFX1250-GISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-GISEL-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-GISEL-NEXT:    s_xor_b32 s0, s1, src_flat_scratch_base_hi
+; GFX1250-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1250-GISEL-NEXT:    s_cmp_ge_u32 s0, 0x4000000
+; GFX1250-GISEL-NEXT:    s_cbranch_scc1 .LBB1_2
+; GFX1250-GISEL-NEXT:  ; %bb.1: ; %bb0
+; GFX1250-GISEL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1250-GISEL-NEXT:    global_store_b32 v[0:1], v0, off scope:SCOPE_SYS
+; GFX1250-GISEL-NEXT:    s_wait_storecnt 0x0
+; GFX1250-GISEL-NEXT:  .LBB1_2: ; %bb1
+; GFX1250-GISEL-NEXT:    s_endpgm
   %val = call i1 @llvm.amdgcn.is.private(ptr %ptr)
   br i1 %val, label %bb0, label %bb1
 

diff  --git a/llvm/test/CodeGen/AMDGPU/scale-offset-flat.ll b/llvm/test/CodeGen/AMDGPU/scale-offset-flat.ll
index 725d57d852966..788cdd1c89051 100644
--- a/llvm/test/CodeGen/AMDGPU/scale-offset-flat.ll
+++ b/llvm/test/CodeGen/AMDGPU/scale-offset-flat.ll
@@ -337,11 +337,9 @@ define amdgpu_ps <2 x float> @flat_atomicrmw_b64_rtn_idxprom(ptr align 8 inreg %
 ; SDAG-LABEL: flat_atomicrmw_b64_rtn_idxprom:
 ; SDAG:       ; %bb.0: ; %entry
 ; SDAG-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
-; SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; SDAG-NEXT:    v_lshl_add_u64 v[2:3], v[0:1], 3, s[0:1]
-; SDAG-NEXT:    s_mov_b32 s0, src_flat_scratch_base_hi
-; SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
-; SDAG-NEXT:    v_xor_b32_e32 v0, s0, v3
+; SDAG-NEXT:    v_xor_b32_e32 v0, src_flat_scratch_base_hi, v3
 ; SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
 ; SDAG-NEXT:    v_cmp_lt_u32_e32 vcc_lo, 0x3ffffff, v0
 ; SDAG-NEXT:    ; implicit-def: $vgpr0_vgpr1
@@ -363,10 +361,9 @@ define amdgpu_ps <2 x float> @flat_atomicrmw_b64_rtn_idxprom(ptr align 8 inreg %
 ; SDAG-NEXT:    s_and_not1_saveexec_b32 s0, s0
 ; SDAG-NEXT:    s_cbranch_execz .LBB21_2
 ; SDAG-NEXT:  .LBB21_4: ; %atomicrmw.private
-; SDAG-NEXT:    s_mov_b32 s1, src_flat_scratch_base_lo
 ; SDAG-NEXT:    v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3]
 ; SDAG-NEXT:    s_wait_loadcnt_dscnt 0x0
-; SDAG-NEXT:    v_subrev_nc_u32_e32 v0, s1, v2
+; SDAG-NEXT:    v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v2
 ; SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; SDAG-NEXT:    v_cndmask_b32_e32 v4, -1, v0, vcc_lo
 ; SDAG-NEXT:    scratch_load_b64 v[0:1], v4, off
@@ -380,7 +377,6 @@ define amdgpu_ps <2 x float> @flat_atomicrmw_b64_rtn_idxprom(ptr align 8 inreg %
 ;
 ; GISEL-LABEL: flat_atomicrmw_b64_rtn_idxprom:
 ; GISEL:       ; %bb.0: ; %entry
-; GISEL-NEXT:    s_mov_b32 s2, src_flat_scratch_base_hi
 ; GISEL-NEXT:    v_mov_b32_e32 v2, v0
 ; GISEL-NEXT:    v_mov_b64_e32 v[4:5], s[0:1]
 ; GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
@@ -390,7 +386,7 @@ define amdgpu_ps <2 x float> @flat_atomicrmw_b64_rtn_idxprom(ptr align 8 inreg %
 ; GISEL-NEXT:    v_add_co_u32 v4, vcc_lo, v4, v0
 ; GISEL-NEXT:    v_add_co_ci_u32_e64 v5, null, v5, v1, vcc_lo
 ; GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GISEL-NEXT:    v_xor_b32_e32 v0, s2, v5
+; GISEL-NEXT:    v_xor_b32_e32 v0, src_flat_scratch_base_hi, v5
 ; GISEL-NEXT:    v_cmp_le_u32_e32 vcc_lo, 0x4000000, v0
 ; GISEL-NEXT:    ; implicit-def: $vgpr0_vgpr1
 ; GISEL-NEXT:    s_and_saveexec_b32 s2, vcc_lo
@@ -412,10 +408,9 @@ define amdgpu_ps <2 x float> @flat_atomicrmw_b64_rtn_idxprom(ptr align 8 inreg %
 ; GISEL-NEXT:    s_and_not1_saveexec_b32 s0, s2
 ; GISEL-NEXT:    s_cbranch_execz .LBB21_2
 ; GISEL-NEXT:  .LBB21_4: ; %atomicrmw.private
-; GISEL-NEXT:    s_mov_b32 s1, src_flat_scratch_base_lo
 ; GISEL-NEXT:    v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5]
 ; GISEL-NEXT:    s_wait_loadcnt_dscnt 0x0
-; GISEL-NEXT:    v_subrev_nc_u32_e32 v0, s1, v4
+; GISEL-NEXT:    v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v4
 ; GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GISEL-NEXT:    v_cndmask_b32_e32 v4, -1, v0, vcc_lo
 ; GISEL-NEXT:    scratch_load_b64 v[0:1], v4, off


        


More information about the llvm-commits mailing list