[llvm] 76efbc0 - [AMDGPU] Fix codegen to emit COPY instead of S_MOV_B64 for aperture regs (#158754)

Stanislav Mekhanoshin via llvm-commits llvm-commits at lists.llvm.org
Tue Sep 16 02:26:37 PDT 2025


Author: Stanislav Mekhanoshin
Date: 2025-09-16T02:26:32-07:00
New Revision: 76efbc068a6ab6ffa10b801df07155a6a69530e7

URL: https://github.com/llvm/llvm-project/commit/76efbc068a6ab6ffa10b801df07155a6a69530e7
DIFF: https://github.com/llvm/llvm-project/commit/76efbc068a6ab6ffa10b801df07155a6a69530e7.diff

LOG: [AMDGPU] Fix codegen to emit COPY instead of S_MOV_B64 for aperture regs (#158754)

Added: 
    

Modified: 
    llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
    llvm/lib/Target/AMDGPU/SIISelLowering.cpp
    llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch-init.gfx.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/implicit-kernarg-backend-usage-global-isel.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-addrspacecast.mir
    llvm/test/CodeGen/AMDGPU/a-v-flat-atomic-cmpxchg.ll
    llvm/test/CodeGen/AMDGPU/a-v-flat-atomicrmw.ll
    llvm/test/CodeGen/AMDGPU/addrspacecast.ll
    llvm/test/CodeGen/AMDGPU/atomicrmw-expand.ll
    llvm/test/CodeGen/AMDGPU/attributor-flatscratchinit-undefined-behavior2.ll
    llvm/test/CodeGen/AMDGPU/codegen-prepare-addrspacecast-non-null.ll
    llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fadd.ll
    llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmax.ll
    llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmin.ll
    llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fsub.ll
    llvm/test/CodeGen/AMDGPU/flat-saddr-atomics.ll
    llvm/test/CodeGen/AMDGPU/flat_atomics_i64.ll
    llvm/test/CodeGen/AMDGPU/flat_atomics_i64_system.ll
    llvm/test/CodeGen/AMDGPU/gep-const-address-space.ll
    llvm/test/CodeGen/AMDGPU/implicit-kernarg-backend-usage.ll
    llvm/test/CodeGen/AMDGPU/llvm.amdgcn.is.private.ll
    llvm/test/CodeGen/AMDGPU/llvm.amdgcn.is.shared.ll
    llvm/test/CodeGen/AMDGPU/undef-handling-crash-in-ra.ll
    llvm/test/DebugInfo/AMDGPU/debug-loc-copy.ll

Removed: 
    


################################################################################
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
index d8c4cbbc4fa33..c690b2b7129b4 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
@@ -2293,16 +2293,9 @@ Register AMDGPULegalizerInfo::getSegmentAperture(
     assert((ApertureRegNo != AMDGPU::SRC_PRIVATE_BASE ||
             !ST.hasGloballyAddressableScratch()) &&
            "Cannot use src_private_base with globally addressable scratch!");
-    // FIXME: It would be more natural to emit a COPY here, but then copy
-    // coalescing would kick in and it would think it's okay to use the "HI"
-    // subregister (instead of extracting the HI 32 bits) which is an artificial
-    // (unusable) register.
-    //  Register TableGen definitions would need an overhaul to get rid of the
-    //  artificial "HI" aperture registers and prevent this kind of issue from
-    //  happening.
     Register Dst = MRI.createGenericVirtualRegister(S64);
     MRI.setRegClass(Dst, &AMDGPU::SReg_64RegClass);
-    B.buildInstr(AMDGPU::S_MOV_B64, {Dst}, {Register(ApertureRegNo)});
+    B.buildCopy({Dst}, {Register(ApertureRegNo)});
     return B.buildUnmerge(S32, Dst).getReg(1);
   }
 

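For readers following the GlobalISel side, the change above boils down to the sketch below: copy the 64-bit aperture register into an SReg_64-constrained virtual register, then take the high 32 bits with G_UNMERGE_VALUES. The helper name emitApertureHi and its standalone signature are illustrative only; B, MRI, ApertureRegNo and the S32/S64 types correspond to the values already in scope in AMDGPULegalizerInfo::getSegmentAperture, and the usual AMDGPULegalizerInfo.cpp includes (MachineIRBuilder, MachineRegisterInfo, generated AMDGPU register info) are assumed.

// Illustrative sketch of the new GlobalISel pattern; names mirror the
// surrounding getSegmentAperture code and are not a drop-in replacement.
static Register emitApertureHi(MachineIRBuilder &B, MachineRegisterInfo &MRI,
                               unsigned ApertureRegNo) {
  const LLT S32 = LLT::scalar(32);
  const LLT S64 = LLT::scalar(64);

  // A plain COPY from the aperture register (e.g. $src_shared_base) is now
  // used instead of an explicit S_MOV_B64; the old FIXME about copy
  // coalescing picking the artificial "HI" subregister was dropped here.
  Register Dst = MRI.createGenericVirtualRegister(S64);
  MRI.setRegClass(Dst, &AMDGPU::SReg_64RegClass);
  B.buildCopy(Dst, Register(ApertureRegNo));

  // G_UNMERGE_VALUES splits the s64 into two s32 halves; index 1 is the
  // high half, which holds the aperture base.
  return B.buildUnmerge(S32, Dst).getReg(1);
}

Selection still materializes the COPY as s_mov_b64, so the emitted ISA is unchanged; later passes simply see an ordinary COPY instead of a target opcode.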
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 6a4df5eeb9779..1982295dde909 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -8159,25 +8159,14 @@ SDValue SITargetLowering::getSegmentAperture(unsigned AS, const SDLoc &DL,
     // it returns a wrong value (all zeroes?). The real value is in the upper 32
     // bits.
     //
-    // To work around the issue, directly emit a 64 bit mov from this register
+    // To work around the issue, emit a 64 bit copy from this register
     // then extract the high bits. Note that this shouldn't even result in a
     // shift being emitted and simply become a pair of registers (e.g.):
     //    s_mov_b64 s[6:7], src_shared_base
     //    v_mov_b32_e32 v1, s7
-    //
-    // FIXME: It would be more natural to emit a CopyFromReg here, but then copy
-    // coalescing would kick in and it would think it's okay to use the "HI"
-    // subregister directly (instead of extracting the HI 32 bits) which is an
-    // artificial (unusable) register.
-    //  Register TableGen definitions would need an overhaul to get rid of the
-    //  artificial "HI" aperture registers and prevent this kind of issue from
-    //  happening.
-    SDNode *Mov = DAG.getMachineNode(AMDGPU::S_MOV_B64, DL, MVT::i64,
-                                     DAG.getRegister(ApertureRegNo, MVT::i64));
-    return DAG.getNode(
-        ISD::TRUNCATE, DL, MVT::i32,
-        DAG.getNode(ISD::SRL, DL, MVT::i64,
-                    {SDValue(Mov, 0), DAG.getConstant(32, DL, MVT::i64)}));
+    SDValue Copy =
+        DAG.getCopyFromReg(DAG.getEntryNode(), DL, ApertureRegNo, MVT::v2i32);
+    return DAG.getExtractVectorElt(DL, MVT::i32, Copy, 1);
   }
 
   // For code object version 5, private_base and shared_base are passed through

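On the SelectionDAG side the same idea reads roughly as below: read the aperture register as a v2i32 CopyFromReg and extract element 1 (the high 32 bits) instead of emitting S_MOV_B64 followed by a 64-bit SRL and TRUNCATE. The helper name buildApertureHi is hypothetical; DAG, DL and ApertureRegNo are the values already available inside SITargetLowering::getSegmentAperture.

// Illustrative sketch of the new SelectionDAG pattern from the hunk above.
static SDValue buildApertureHi(SelectionDAG &DAG, const SDLoc &DL,
                               Register ApertureRegNo) {
  // Chain the CopyFromReg off the entry node: the aperture register is a
  // constant-like read, so no ordering with other chains is needed.
  SDValue Copy =
      DAG.getCopyFromReg(DAG.getEntryNode(), DL, ApertureRegNo, MVT::v2i32);
  // Element 1 of the <2 x i32> view is the upper half of the 64-bit pair.
  return DAG.getExtractVectorElt(DL, MVT::i32, Copy, 1);
}

Because the copy is no longer pinned to a specific machine opcode, the scheduler and register allocator are free to reorder and renumber around it, which is what the bulk of the test churn below reflects.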
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch-init.gfx.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch-init.gfx.ll
index b520ce1826ec9..3d224f2f6bf05 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch-init.gfx.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch-init.gfx.ll
@@ -9,12 +9,11 @@
 define amdgpu_ps void @amdgpu_ps() {
 ; MESA-LABEL: amdgpu_ps:
 ; MESA:       ; %bb.0:
-; MESA-NEXT:    s_add_u32 flat_scratch_lo, s2, s4
-; MESA-NEXT:    s_addc_u32 flat_scratch_hi, s3, 0
+; MESA-NEXT:    s_mov_b64 s[0:1], src_private_base
 ; MESA-NEXT:    s_mov_b32 s0, 0
-; MESA-NEXT:    s_mov_b64 s[2:3], src_private_base
-; MESA-NEXT:    s_mov_b32 s1, s3
+; MESA-NEXT:    s_add_u32 flat_scratch_lo, s2, s4
 ; MESA-NEXT:    v_mov_b32_e32 v0, s0
+; MESA-NEXT:    s_addc_u32 flat_scratch_hi, s3, 0
 ; MESA-NEXT:    v_mov_b32_e32 v2, 0
 ; MESA-NEXT:    v_mov_b32_e32 v1, s1
 ; MESA-NEXT:    flat_store_dword v[0:1], v2
@@ -30,11 +29,10 @@ define amdgpu_ps void @amdgpu_ps() {
 ; PAL-NEXT:    s_waitcnt lgkmcnt(0)
 ; PAL-NEXT:    s_and_b32 s3, s3, 0xffff
 ; PAL-NEXT:    s_add_u32 flat_scratch_lo, s2, s0
-; PAL-NEXT:    s_addc_u32 flat_scratch_hi, s3, 0
+; PAL-NEXT:    s_mov_b64 s[0:1], src_private_base
 ; PAL-NEXT:    s_mov_b32 s0, 0
-; PAL-NEXT:    s_mov_b64 s[2:3], src_private_base
-; PAL-NEXT:    s_mov_b32 s1, s3
 ; PAL-NEXT:    v_mov_b32_e32 v0, s0
+; PAL-NEXT:    s_addc_u32 flat_scratch_hi, s3, 0
 ; PAL-NEXT:    v_mov_b32_e32 v1, s1
 ; PAL-NEXT:    flat_store_dword v[0:1], v2
 ; PAL-NEXT:    s_waitcnt vmcnt(0)

diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/implicit-kernarg-backend-usage-global-isel.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/implicit-kernarg-backend-usage-global-isel.ll
index 86766e2904619..9539ec465e02f 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/implicit-kernarg-backend-usage-global-isel.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/implicit-kernarg-backend-usage-global-isel.ll
@@ -65,52 +65,52 @@ define amdgpu_kernel void @addrspacecast(ptr addrspace(5) %ptr.private, ptr addr
 ;
 ; GFX9V4-LABEL: addrspacecast:
 ; GFX9V4:       ; %bb.0:
-; GFX9V4-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
+; GFX9V4-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX9V4-NEXT:    s_add_u32 flat_scratch_lo, s12, s17
 ; GFX9V4-NEXT:    s_addc_u32 flat_scratch_hi, s13, 0
-; GFX9V4-NEXT:    s_mov_b64 s[2:3], src_private_base
-; GFX9V4-NEXT:    s_mov_b64 s[4:5], src_shared_base
+; GFX9V4-NEXT:    s_mov_b64 s[0:1], src_private_base
+; GFX9V4-NEXT:    s_mov_b64 s[2:3], src_shared_base
 ; GFX9V4-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9V4-NEXT:    s_mov_b32 s2, s0
-; GFX9V4-NEXT:    s_cmp_lg_u32 s0, -1
+; GFX9V4-NEXT:    s_mov_b32 s0, s4
+; GFX9V4-NEXT:    s_cmp_lg_u32 s4, -1
+; GFX9V4-NEXT:    s_cselect_b64 s[0:1], s[0:1], 0
+; GFX9V4-NEXT:    s_mov_b32 s2, s5
+; GFX9V4-NEXT:    s_cmp_lg_u32 s5, -1
+; GFX9V4-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX9V4-NEXT:    s_cselect_b64 s[2:3], s[2:3], 0
-; GFX9V4-NEXT:    s_mov_b32 s4, s1
-; GFX9V4-NEXT:    s_cmp_lg_u32 s1, -1
-; GFX9V4-NEXT:    v_mov_b32_e32 v0, s2
-; GFX9V4-NEXT:    s_cselect_b64 s[0:1], s[4:5], 0
 ; GFX9V4-NEXT:    v_mov_b32_e32 v2, 1
-; GFX9V4-NEXT:    v_mov_b32_e32 v1, s3
+; GFX9V4-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX9V4-NEXT:    flat_store_dword v[0:1], v2
 ; GFX9V4-NEXT:    s_waitcnt vmcnt(0)
-; GFX9V4-NEXT:    v_mov_b32_e32 v0, s0
+; GFX9V4-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX9V4-NEXT:    v_mov_b32_e32 v2, 2
-; GFX9V4-NEXT:    v_mov_b32_e32 v1, s1
+; GFX9V4-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX9V4-NEXT:    flat_store_dword v[0:1], v2
 ; GFX9V4-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9V4-NEXT:    s_endpgm
 ;
 ; GFX9V5-LABEL: addrspacecast:
 ; GFX9V5:       ; %bb.0:
-; GFX9V5-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
+; GFX9V5-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX9V5-NEXT:    s_add_u32 flat_scratch_lo, s12, s17
 ; GFX9V5-NEXT:    s_addc_u32 flat_scratch_hi, s13, 0
-; GFX9V5-NEXT:    s_mov_b64 s[2:3], src_private_base
-; GFX9V5-NEXT:    s_mov_b64 s[4:5], src_shared_base
+; GFX9V5-NEXT:    s_mov_b64 s[0:1], src_private_base
+; GFX9V5-NEXT:    s_mov_b64 s[2:3], src_shared_base
 ; GFX9V5-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9V5-NEXT:    s_mov_b32 s2, s0
-; GFX9V5-NEXT:    s_cmp_lg_u32 s0, -1
+; GFX9V5-NEXT:    s_mov_b32 s0, s4
+; GFX9V5-NEXT:    s_cmp_lg_u32 s4, -1
+; GFX9V5-NEXT:    s_cselect_b64 s[0:1], s[0:1], 0
+; GFX9V5-NEXT:    s_mov_b32 s2, s5
+; GFX9V5-NEXT:    s_cmp_lg_u32 s5, -1
+; GFX9V5-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX9V5-NEXT:    s_cselect_b64 s[2:3], s[2:3], 0
-; GFX9V5-NEXT:    s_mov_b32 s4, s1
-; GFX9V5-NEXT:    s_cmp_lg_u32 s1, -1
-; GFX9V5-NEXT:    v_mov_b32_e32 v0, s2
-; GFX9V5-NEXT:    s_cselect_b64 s[0:1], s[4:5], 0
 ; GFX9V5-NEXT:    v_mov_b32_e32 v2, 1
-; GFX9V5-NEXT:    v_mov_b32_e32 v1, s3
+; GFX9V5-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX9V5-NEXT:    flat_store_dword v[0:1], v2
 ; GFX9V5-NEXT:    s_waitcnt vmcnt(0)
-; GFX9V5-NEXT:    v_mov_b32_e32 v0, s0
+; GFX9V5-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX9V5-NEXT:    v_mov_b32_e32 v2, 2
-; GFX9V5-NEXT:    v_mov_b32_e32 v1, s1
+; GFX9V5-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX9V5-NEXT:    flat_store_dword v[0:1], v2
 ; GFX9V5-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9V5-NEXT:    s_endpgm
@@ -150,10 +150,10 @@ define amdgpu_kernel void @llvm_amdgcn_is_shared(ptr %ptr) #0 {
 ;
 ; GFX9V4-LABEL: llvm_amdgcn_is_shared:
 ; GFX9V4:       ; %bb.0:
-; GFX9V4-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
-; GFX9V4-NEXT:    s_mov_b64 s[2:3], src_shared_base
+; GFX9V4-NEXT:    s_load_dwordx2 s[2:3], s[8:9], 0x0
+; GFX9V4-NEXT:    s_mov_b64 s[0:1], src_shared_base
 ; GFX9V4-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9V4-NEXT:    s_cmp_eq_u32 s1, s3
+; GFX9V4-NEXT:    s_cmp_eq_u32 s3, s1
 ; GFX9V4-NEXT:    s_cselect_b32 s0, 1, 0
 ; GFX9V4-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX9V4-NEXT:    global_store_dword v[0:1], v0, off
@@ -162,10 +162,10 @@ define amdgpu_kernel void @llvm_amdgcn_is_shared(ptr %ptr) #0 {
 ;
 ; GFX9V5-LABEL: llvm_amdgcn_is_shared:
 ; GFX9V5:       ; %bb.0:
-; GFX9V5-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
-; GFX9V5-NEXT:    s_mov_b64 s[2:3], src_shared_base
+; GFX9V5-NEXT:    s_load_dwordx2 s[2:3], s[8:9], 0x0
+; GFX9V5-NEXT:    s_mov_b64 s[0:1], src_shared_base
 ; GFX9V5-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9V5-NEXT:    s_cmp_eq_u32 s1, s3
+; GFX9V5-NEXT:    s_cmp_eq_u32 s3, s1
 ; GFX9V5-NEXT:    s_cselect_b32 s0, 1, 0
 ; GFX9V5-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX9V5-NEXT:    global_store_dword v[0:1], v0, off
@@ -206,10 +206,10 @@ define amdgpu_kernel void @llvm_amdgcn_is_private(ptr %ptr) #0 {
 ;
 ; GFX9V4-LABEL: llvm_amdgcn_is_private:
 ; GFX9V4:       ; %bb.0:
-; GFX9V4-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
-; GFX9V4-NEXT:    s_mov_b64 s[2:3], src_private_base
+; GFX9V4-NEXT:    s_load_dwordx2 s[2:3], s[8:9], 0x0
+; GFX9V4-NEXT:    s_mov_b64 s[0:1], src_private_base
 ; GFX9V4-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9V4-NEXT:    s_cmp_eq_u32 s1, s3
+; GFX9V4-NEXT:    s_cmp_eq_u32 s3, s1
 ; GFX9V4-NEXT:    s_cselect_b32 s0, 1, 0
 ; GFX9V4-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX9V4-NEXT:    global_store_dword v[0:1], v0, off
@@ -218,10 +218,10 @@ define amdgpu_kernel void @llvm_amdgcn_is_private(ptr %ptr) #0 {
 ;
 ; GFX9V5-LABEL: llvm_amdgcn_is_private:
 ; GFX9V5:       ; %bb.0:
-; GFX9V5-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
-; GFX9V5-NEXT:    s_mov_b64 s[2:3], src_private_base
+; GFX9V5-NEXT:    s_load_dwordx2 s[2:3], s[8:9], 0x0
+; GFX9V5-NEXT:    s_mov_b64 s[0:1], src_private_base
 ; GFX9V5-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9V5-NEXT:    s_cmp_eq_u32 s1, s3
+; GFX9V5-NEXT:    s_cmp_eq_u32 s3, s1
 ; GFX9V5-NEXT:    s_cselect_b32 s0, 1, 0
 ; GFX9V5-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX9V5-NEXT:    global_store_dword v[0:1], v0, off

diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-addrspacecast.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-addrspacecast.mir
index d69a3e1a15bbd..4471980c1ba1c 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-addrspacecast.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-addrspacecast.mir
@@ -158,8 +158,8 @@ body: |
     ; GFX9-NEXT: {{  $}}
     ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0
     ; GFX9-NEXT: [[PTRTOINT:%[0-9]+]]:_(s32) = G_PTRTOINT [[COPY]](p5)
-    ; GFX9-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64(s64) = S_MOV_B64 $src_private_base
-    ; GFX9-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[S_MOV_B64_]](s64)
+    ; GFX9-NEXT: [[COPY1:%[0-9]+]]:sreg_64(s64) = COPY $src_private_base
+    ; GFX9-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY1]](s64)
     ; GFX9-NEXT: [[MV:%[0-9]+]]:_(p0) = G_MERGE_VALUES [[PTRTOINT]](s32), [[UV1]](s32)
     ; GFX9-NEXT: [[C:%[0-9]+]]:_(p5) = G_CONSTANT i32 -1
     ; GFX9-NEXT: [[C1:%[0-9]+]]:_(p0) = G_CONSTANT i64 0
@@ -227,8 +227,8 @@ body: |
     ; GFX9-NEXT: {{  $}}
     ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0
     ; GFX9-NEXT: [[PTRTOINT:%[0-9]+]]:_(s32) = G_PTRTOINT [[COPY]](p3)
-    ; GFX9-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64(s64) = S_MOV_B64 $src_shared_base
-    ; GFX9-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[S_MOV_B64_]](s64)
+    ; GFX9-NEXT: [[COPY1:%[0-9]+]]:sreg_64(s64) = COPY $src_shared_base
+    ; GFX9-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY1]](s64)
     ; GFX9-NEXT: [[MV:%[0-9]+]]:_(p0) = G_MERGE_VALUES [[PTRTOINT]](s32), [[UV1]](s32)
     ; GFX9-NEXT: [[C:%[0-9]+]]:_(p3) = G_CONSTANT i32 -1
     ; GFX9-NEXT: [[C1:%[0-9]+]]:_(p0) = G_CONSTANT i64 0
@@ -380,16 +380,16 @@ body: |
     ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(<2 x p3>) = COPY $vgpr0_vgpr1
     ; GFX9-NEXT: [[UV:%[0-9]+]]:_(p3), [[UV1:%[0-9]+]]:_(p3) = G_UNMERGE_VALUES [[COPY]](<2 x p3>)
     ; GFX9-NEXT: [[PTRTOINT:%[0-9]+]]:_(s32) = G_PTRTOINT [[UV]](p3)
-    ; GFX9-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64(s64) = S_MOV_B64 $src_shared_base
-    ; GFX9-NEXT: [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[S_MOV_B64_]](s64)
+    ; GFX9-NEXT: [[COPY1:%[0-9]+]]:sreg_64(s64) = COPY $src_shared_base
+    ; GFX9-NEXT: [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY1]](s64)
     ; GFX9-NEXT: [[MV:%[0-9]+]]:_(p0) = G_MERGE_VALUES [[PTRTOINT]](s32), [[UV3]](s32)
     ; GFX9-NEXT: [[C:%[0-9]+]]:_(p3) = G_CONSTANT i32 -1
     ; GFX9-NEXT: [[C1:%[0-9]+]]:_(p0) = G_CONSTANT i64 0
     ; GFX9-NEXT: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(ne), [[UV]](p3), [[C]]
     ; GFX9-NEXT: [[SELECT:%[0-9]+]]:_(p0) = G_SELECT [[ICMP]](s1), [[MV]], [[C1]]
     ; GFX9-NEXT: [[PTRTOINT1:%[0-9]+]]:_(s32) = G_PTRTOINT [[UV1]](p3)
-    ; GFX9-NEXT: [[S_MOV_B64_1:%[0-9]+]]:sreg_64(s64) = S_MOV_B64 $src_shared_base
-    ; GFX9-NEXT: [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[S_MOV_B64_1]](s64)
+    ; GFX9-NEXT: [[COPY2:%[0-9]+]]:sreg_64(s64) = COPY $src_shared_base
+    ; GFX9-NEXT: [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY2]](s64)
     ; GFX9-NEXT: [[MV1:%[0-9]+]]:_(p0) = G_MERGE_VALUES [[PTRTOINT1]](s32), [[UV5]](s32)
     ; GFX9-NEXT: [[ICMP1:%[0-9]+]]:_(s1) = G_ICMP intpred(ne), [[UV1]](p3), [[C]]
     ; GFX9-NEXT: [[SELECT1:%[0-9]+]]:_(p0) = G_SELECT [[ICMP1]](s1), [[MV1]], [[C1]]
@@ -517,8 +517,8 @@ body: |
     ; GFX9-LABEL: name: test_addrspacecast_p5_fi_to_p0
     ; GFX9: [[FRAME_INDEX:%[0-9]+]]:_(p5) = G_FRAME_INDEX %stack.0
     ; GFX9-NEXT: [[PTRTOINT:%[0-9]+]]:_(s32) = G_PTRTOINT [[FRAME_INDEX]](p5)
-    ; GFX9-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64(s64) = S_MOV_B64 $src_private_base
-    ; GFX9-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[S_MOV_B64_]](s64)
+    ; GFX9-NEXT: [[COPY:%[0-9]+]]:sreg_64(s64) = COPY $src_private_base
+    ; GFX9-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](s64)
     ; GFX9-NEXT: [[MV:%[0-9]+]]:_(p0) = G_MERGE_VALUES [[PTRTOINT]](s32), [[UV1]](s32)
     ; GFX9-NEXT: $vgpr0_vgpr1 = COPY [[MV]](p0)
     %0:_(p5) = G_FRAME_INDEX %stack.0

diff --git a/llvm/test/CodeGen/AMDGPU/a-v-flat-atomic-cmpxchg.ll b/llvm/test/CodeGen/AMDGPU/a-v-flat-atomic-cmpxchg.ll
index 58f3ffb0492e0..bc341f2baa804 100644
--- a/llvm/test/CodeGen/AMDGPU/a-v-flat-atomic-cmpxchg.ll
+++ b/llvm/test/CodeGen/AMDGPU/a-v-flat-atomic-cmpxchg.ll
@@ -361,8 +361,8 @@ define void @flat_atomic_cmpxchg_i64_ret_av_av__av(ptr %ptr) #0 {
 ; CHECK:       ; %bb.0:
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    v_add_co_u32_e32 v6, vcc, 0x50, v0
-; CHECK-NEXT:    v_addc_co_u32_e32 v7, vcc, 0, v1, vcc
 ; CHECK-NEXT:    s_mov_b64 s[4:5], src_private_base
+; CHECK-NEXT:    v_addc_co_u32_e32 v7, vcc, 0, v1, vcc
 ; CHECK-NEXT:    v_cmp_ne_u32_e32 vcc, s5, v7
 ; CHECK-NEXT:    ;;#ASMSTART
 ; CHECK-NEXT:    ; def v[2:3]
@@ -417,8 +417,8 @@ define void @flat_atomic_cmpxchg_i64_ret_av_av__v(ptr %ptr) #0 {
 ; CHECK:       ; %bb.0:
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    v_add_co_u32_e32 v6, vcc, 0x50, v0
-; CHECK-NEXT:    v_addc_co_u32_e32 v7, vcc, 0, v1, vcc
 ; CHECK-NEXT:    s_mov_b64 s[4:5], src_private_base
+; CHECK-NEXT:    v_addc_co_u32_e32 v7, vcc, 0, v1, vcc
 ; CHECK-NEXT:    v_cmp_ne_u32_e32 vcc, s5, v7
 ; CHECK-NEXT:    ;;#ASMSTART
 ; CHECK-NEXT:    ; def v[2:3]
@@ -473,8 +473,8 @@ define void @flat_atomic_cmpxchg_i64_ret_av_av__a(ptr %ptr) #0 {
 ; CHECK:       ; %bb.0:
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    v_add_co_u32_e32 v4, vcc, 0x50, v0
-; CHECK-NEXT:    v_addc_co_u32_e32 v5, vcc, 0, v1, vcc
 ; CHECK-NEXT:    s_mov_b64 s[4:5], src_private_base
+; CHECK-NEXT:    v_addc_co_u32_e32 v5, vcc, 0, v1, vcc
 ; CHECK-NEXT:    v_cmp_ne_u32_e32 vcc, s5, v5
 ; CHECK-NEXT:    ;;#ASMSTART
 ; CHECK-NEXT:    ; def v[2:3]
@@ -538,13 +538,13 @@ define void @flat_atomic_cmpxchg_i64_ret_a_a__a(ptr %ptr) #0 {
 ; CHECK-NEXT:    ; def a[0:1]
 ; CHECK-NEXT:    ;;#ASMEND
 ; CHECK-NEXT:    v_accvgpr_read_b32 v3, a1
+; CHECK-NEXT:    s_mov_b64 s[4:5], src_private_base
 ; CHECK-NEXT:    v_addc_co_u32_e32 v5, vcc, 0, v1, vcc
 ; CHECK-NEXT:    v_accvgpr_read_b32 v2, a0
 ; CHECK-NEXT:    ;;#ASMSTART
 ; CHECK-NEXT:    ; def a[0:1]
 ; CHECK-NEXT:    ;;#ASMEND
 ; CHECK-NEXT:    v_accvgpr_read_b32 v0, a0
-; CHECK-NEXT:    s_mov_b64 s[4:5], src_private_base
 ; CHECK-NEXT:    v_accvgpr_read_b32 v1, a1
 ; CHECK-NEXT:    v_cmp_ne_u32_e32 vcc, s5, v5
 ; CHECK-NEXT:    ; implicit-def: $agpr0_agpr1
@@ -603,13 +603,13 @@ define void @flat_atomic_cmpxchg_i64_ret_a_a__v(ptr %ptr) #0 {
 ; CHECK-NEXT:    ; def a[0:1]
 ; CHECK-NEXT:    ;;#ASMEND
 ; CHECK-NEXT:    v_accvgpr_read_b32 v3, a1
+; CHECK-NEXT:    s_mov_b64 s[4:5], src_private_base
 ; CHECK-NEXT:    v_addc_co_u32_e32 v7, vcc, 0, v1, vcc
 ; CHECK-NEXT:    v_accvgpr_read_b32 v2, a0
 ; CHECK-NEXT:    ;;#ASMSTART
 ; CHECK-NEXT:    ; def a[0:1]
 ; CHECK-NEXT:    ;;#ASMEND
 ; CHECK-NEXT:    v_accvgpr_read_b32 v0, a0
-; CHECK-NEXT:    s_mov_b64 s[4:5], src_private_base
 ; CHECK-NEXT:    v_accvgpr_read_b32 v1, a1
 ; CHECK-NEXT:    v_cmp_ne_u32_e32 vcc, s5, v7
 ; CHECK-NEXT:    ; implicit-def: $vgpr4_vgpr5
@@ -659,12 +659,12 @@ define void @flat_atomic_cmpxchg_i64_ret_v_a__v(ptr %ptr) #0 {
 ; CHECK:       ; %bb.0:
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    v_add_co_u32_e32 v6, vcc, 0x50, v0
+; CHECK-NEXT:    s_mov_b64 s[4:5], src_private_base
 ; CHECK-NEXT:    v_addc_co_u32_e32 v7, vcc, 0, v1, vcc
 ; CHECK-NEXT:    ;;#ASMSTART
 ; CHECK-NEXT:    ; def a[0:1]
 ; CHECK-NEXT:    ;;#ASMEND
 ; CHECK-NEXT:    v_accvgpr_read_b32 v0, a0
-; CHECK-NEXT:    s_mov_b64 s[4:5], src_private_base
 ; CHECK-NEXT:    v_accvgpr_read_b32 v1, a1
 ; CHECK-NEXT:    v_cmp_ne_u32_e32 vcc, s5, v7
 ; CHECK-NEXT:    ;;#ASMSTART
@@ -717,12 +717,12 @@ define void @flat_atomic_cmpxchg_i64_ret_a_v__v(ptr %ptr) #0 {
 ; CHECK:       ; %bb.0:
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    v_add_co_u32_e32 v6, vcc, 0x50, v0
+; CHECK-NEXT:    s_mov_b64 s[4:5], src_private_base
 ; CHECK-NEXT:    v_addc_co_u32_e32 v7, vcc, 0, v1, vcc
 ; CHECK-NEXT:    ;;#ASMSTART
 ; CHECK-NEXT:    ; def a[0:1]
 ; CHECK-NEXT:    ;;#ASMEND
 ; CHECK-NEXT:    v_accvgpr_read_b32 v3, a1
-; CHECK-NEXT:    s_mov_b64 s[4:5], src_private_base
 ; CHECK-NEXT:    v_accvgpr_read_b32 v2, a0
 ; CHECK-NEXT:    v_cmp_ne_u32_e32 vcc, s5, v7
 ; CHECK-NEXT:    ;;#ASMSTART
@@ -775,8 +775,8 @@ define void @flat_atomic_cmpxchg_i64_ret_v_v__a(ptr %ptr) #0 {
 ; CHECK:       ; %bb.0:
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    v_add_co_u32_e32 v4, vcc, 0x50, v0
-; CHECK-NEXT:    v_addc_co_u32_e32 v5, vcc, 0, v1, vcc
 ; CHECK-NEXT:    s_mov_b64 s[4:5], src_private_base
+; CHECK-NEXT:    v_addc_co_u32_e32 v5, vcc, 0, v1, vcc
 ; CHECK-NEXT:    v_cmp_ne_u32_e32 vcc, s5, v5
 ; CHECK-NEXT:    ;;#ASMSTART
 ; CHECK-NEXT:    ; def v[2:3]
@@ -836,8 +836,8 @@ define void @flat_atomic_cmpxchg_i64_ret_av_v__av(ptr %ptr) #0 {
 ; CHECK:       ; %bb.0:
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    v_add_co_u32_e32 v6, vcc, 0x50, v0
-; CHECK-NEXT:    v_addc_co_u32_e32 v7, vcc, 0, v1, vcc
 ; CHECK-NEXT:    s_mov_b64 s[4:5], src_private_base
+; CHECK-NEXT:    v_addc_co_u32_e32 v7, vcc, 0, v1, vcc
 ; CHECK-NEXT:    v_cmp_ne_u32_e32 vcc, s5, v7
 ; CHECK-NEXT:    ;;#ASMSTART
 ; CHECK-NEXT:    ; def v[2:3]
@@ -892,8 +892,8 @@ define void @flat_atomic_cmpxchg_i64_ret_v_av__av(ptr %ptr) #0 {
 ; CHECK:       ; %bb.0:
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    v_add_co_u32_e32 v6, vcc, 0x50, v0
-; CHECK-NEXT:    v_addc_co_u32_e32 v7, vcc, 0, v1, vcc
 ; CHECK-NEXT:    s_mov_b64 s[4:5], src_private_base
+; CHECK-NEXT:    v_addc_co_u32_e32 v7, vcc, 0, v1, vcc
 ; CHECK-NEXT:    v_cmp_ne_u32_e32 vcc, s5, v7
 ; CHECK-NEXT:    ;;#ASMSTART
 ; CHECK-NEXT:    ; def v[2:3]
@@ -948,12 +948,12 @@ define void @flat_atomic_cmpxchg_i64_ret_av_a__av(ptr %ptr) #0 {
 ; CHECK:       ; %bb.0:
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    v_add_co_u32_e32 v6, vcc, 0x50, v0
+; CHECK-NEXT:    s_mov_b64 s[4:5], src_private_base
 ; CHECK-NEXT:    v_addc_co_u32_e32 v7, vcc, 0, v1, vcc
 ; CHECK-NEXT:    ;;#ASMSTART
 ; CHECK-NEXT:    ; def a[0:1]
 ; CHECK-NEXT:    ;;#ASMEND
 ; CHECK-NEXT:    v_accvgpr_read_b32 v0, a0
-; CHECK-NEXT:    s_mov_b64 s[4:5], src_private_base
 ; CHECK-NEXT:    v_accvgpr_read_b32 v1, a1
 ; CHECK-NEXT:    v_cmp_ne_u32_e32 vcc, s5, v7
 ; CHECK-NEXT:    ;;#ASMSTART
@@ -1006,12 +1006,12 @@ define void @flat_atomic_cmpxchg_i64_ret_a_av__av(ptr %ptr) #0 {
 ; CHECK:       ; %bb.0:
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    v_add_co_u32_e32 v6, vcc, 0x50, v0
+; CHECK-NEXT:    s_mov_b64 s[4:5], src_private_base
 ; CHECK-NEXT:    v_addc_co_u32_e32 v7, vcc, 0, v1, vcc
 ; CHECK-NEXT:    ;;#ASMSTART
 ; CHECK-NEXT:    ; def a[0:1]
 ; CHECK-NEXT:    ;;#ASMEND
 ; CHECK-NEXT:    v_accvgpr_read_b32 v3, a1
-; CHECK-NEXT:    s_mov_b64 s[4:5], src_private_base
 ; CHECK-NEXT:    v_accvgpr_read_b32 v2, a0
 ; CHECK-NEXT:    v_cmp_ne_u32_e32 vcc, s5, v7
 ; CHECK-NEXT:    ;;#ASMSTART

diff --git a/llvm/test/CodeGen/AMDGPU/a-v-flat-atomicrmw.ll b/llvm/test/CodeGen/AMDGPU/a-v-flat-atomicrmw.ll
index 6b6eb43baf856..d053425afbb6d 100644
--- a/llvm/test/CodeGen/AMDGPU/a-v-flat-atomicrmw.ll
+++ b/llvm/test/CodeGen/AMDGPU/a-v-flat-atomicrmw.ll
@@ -641,12 +641,12 @@ define void @flat_atomic_xchg_i64_ret_a_a(ptr %ptr) #0 {
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    v_add_co_u32_e32 v0, vcc, 0x50, v0
+; GFX90A-NEXT:    s_mov_b64 s[4:5], src_private_base
 ; GFX90A-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def a[0:1]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    v_accvgpr_read_b32 v3, a1
-; GFX90A-NEXT:    s_mov_b64 s[4:5], src_private_base
 ; GFX90A-NEXT:    v_accvgpr_read_b32 v2, a0
 ; GFX90A-NEXT:    v_cmp_ne_u32_e32 vcc, s5, v1
 ; GFX90A-NEXT:    ; implicit-def: $agpr0_agpr1
@@ -686,15 +686,15 @@ define void @flat_atomic_xchg_i64_ret_a_a(ptr %ptr) #0 {
 ; GFX950-LABEL: flat_atomic_xchg_i64_ret_a_a:
 ; GFX950:       ; %bb.0:
 ; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX950-NEXT:    s_mov_b64 s[0:1], 0x50
-; GFX950-NEXT:    v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1]
+; GFX950-NEXT:    s_mov_b64 s[2:3], 0x50
+; GFX950-NEXT:    s_mov_b64 s[0:1], src_private_base
+; GFX950-NEXT:    v_lshl_add_u64 v[0:1], v[0:1], 0, s[2:3]
 ; GFX950-NEXT:    ;;#ASMSTART
 ; GFX950-NEXT:    ; def a[2:3]
 ; GFX950-NEXT:    ;;#ASMEND
-; GFX950-NEXT:    s_mov_b64 s[0:1], src_private_base
+; GFX950-NEXT:    v_cmp_ne_u32_e32 vcc, s1, v1
 ; GFX950-NEXT:    v_accvgpr_read_b32 v2, a2
 ; GFX950-NEXT:    v_accvgpr_read_b32 v3, a3
-; GFX950-NEXT:    v_cmp_ne_u32_e32 vcc, s1, v1
 ; GFX950-NEXT:    ; implicit-def: $agpr0_agpr1
 ; GFX950-NEXT:    s_and_saveexec_b64 s[0:1], vcc
 ; GFX950-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
@@ -739,12 +739,12 @@ define void @flat_atomic_xchg_i64_ret_a_v(ptr %ptr) #0 {
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    v_add_co_u32_e32 v2, vcc, 0x50, v0
+; GFX90A-NEXT:    s_mov_b64 s[4:5], src_private_base
 ; GFX90A-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v1, vcc
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def a[0:1]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    v_accvgpr_read_b32 v5, a1
-; GFX90A-NEXT:    s_mov_b64 s[4:5], src_private_base
 ; GFX90A-NEXT:    v_accvgpr_read_b32 v4, a0
 ; GFX90A-NEXT:    v_cmp_ne_u32_e32 vcc, s5, v3
 ; GFX90A-NEXT:    ; implicit-def: $vgpr0_vgpr1
@@ -782,15 +782,15 @@ define void @flat_atomic_xchg_i64_ret_a_v(ptr %ptr) #0 {
 ; GFX950-LABEL: flat_atomic_xchg_i64_ret_a_v:
 ; GFX950:       ; %bb.0:
 ; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX950-NEXT:    s_mov_b64 s[0:1], 0x50
-; GFX950-NEXT:    v_lshl_add_u64 v[2:3], v[0:1], 0, s[0:1]
+; GFX950-NEXT:    s_mov_b64 s[2:3], 0x50
+; GFX950-NEXT:    s_mov_b64 s[0:1], src_private_base
+; GFX950-NEXT:    v_lshl_add_u64 v[2:3], v[0:1], 0, s[2:3]
 ; GFX950-NEXT:    ;;#ASMSTART
 ; GFX950-NEXT:    ; def a[0:1]
 ; GFX950-NEXT:    ;;#ASMEND
-; GFX950-NEXT:    s_mov_b64 s[0:1], src_private_base
+; GFX950-NEXT:    v_cmp_ne_u32_e32 vcc, s1, v3
 ; GFX950-NEXT:    v_accvgpr_read_b32 v5, a1
 ; GFX950-NEXT:    v_accvgpr_read_b32 v4, a0
-; GFX950-NEXT:    v_cmp_ne_u32_e32 vcc, s1, v3
 ; GFX950-NEXT:    ; implicit-def: $vgpr0_vgpr1
 ; GFX950-NEXT:    s_and_saveexec_b64 s[0:1], vcc
 ; GFX950-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
@@ -833,8 +833,8 @@ define void @flat_atomic_xchg_i64_ret_v_a(ptr %ptr) #0 {
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    v_add_co_u32_e32 v0, vcc, 0x50, v0
-; GFX90A-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
 ; GFX90A-NEXT:    s_mov_b64 s[4:5], src_private_base
+; GFX90A-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
 ; GFX90A-NEXT:    v_cmp_ne_u32_e32 vcc, s5, v1
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def v[2:3]
@@ -876,9 +876,9 @@ define void @flat_atomic_xchg_i64_ret_v_a(ptr %ptr) #0 {
 ; GFX950-LABEL: flat_atomic_xchg_i64_ret_v_a:
 ; GFX950:       ; %bb.0:
 ; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX950-NEXT:    s_mov_b64 s[0:1], 0x50
-; GFX950-NEXT:    v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1]
+; GFX950-NEXT:    s_mov_b64 s[2:3], 0x50
 ; GFX950-NEXT:    s_mov_b64 s[0:1], src_private_base
+; GFX950-NEXT:    v_lshl_add_u64 v[0:1], v[0:1], 0, s[2:3]
 ; GFX950-NEXT:    v_cmp_ne_u32_e32 vcc, s1, v1
 ; GFX950-NEXT:    ;;#ASMSTART
 ; GFX950-NEXT:    ; def v[2:3]
@@ -927,8 +927,8 @@ define void @flat_atomic_xchg_i64_ret_av_av(ptr %ptr) #0 {
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    v_add_co_u32_e32 v2, vcc, 0x50, v0
-; GFX90A-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v1, vcc
 ; GFX90A-NEXT:    s_mov_b64 s[4:5], src_private_base
+; GFX90A-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v1, vcc
 ; GFX90A-NEXT:    v_cmp_ne_u32_e32 vcc, s5, v3
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def v[4:5]
@@ -968,9 +968,9 @@ define void @flat_atomic_xchg_i64_ret_av_av(ptr %ptr) #0 {
 ; GFX950-LABEL: flat_atomic_xchg_i64_ret_av_av:
 ; GFX950:       ; %bb.0:
 ; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX950-NEXT:    s_mov_b64 s[0:1], 0x50
-; GFX950-NEXT:    v_lshl_add_u64 v[2:3], v[0:1], 0, s[0:1]
+; GFX950-NEXT:    s_mov_b64 s[2:3], 0x50
 ; GFX950-NEXT:    s_mov_b64 s[0:1], src_private_base
+; GFX950-NEXT:    v_lshl_add_u64 v[2:3], v[0:1], 0, s[2:3]
 ; GFX950-NEXT:    v_cmp_ne_u32_e32 vcc, s1, v3
 ; GFX950-NEXT:    ;;#ASMSTART
 ; GFX950-NEXT:    ; def v[4:5]
@@ -1017,8 +1017,8 @@ define void @flat_atomic_xchg_i64_ret_av_v(ptr %ptr) #0 {
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    v_add_co_u32_e32 v2, vcc, 0x50, v0
-; GFX90A-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v1, vcc
 ; GFX90A-NEXT:    s_mov_b64 s[4:5], src_private_base
+; GFX90A-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v1, vcc
 ; GFX90A-NEXT:    v_cmp_ne_u32_e32 vcc, s5, v3
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def v[4:5]
@@ -1058,9 +1058,9 @@ define void @flat_atomic_xchg_i64_ret_av_v(ptr %ptr) #0 {
 ; GFX950-LABEL: flat_atomic_xchg_i64_ret_av_v:
 ; GFX950:       ; %bb.0:
 ; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX950-NEXT:    s_mov_b64 s[0:1], 0x50
-; GFX950-NEXT:    v_lshl_add_u64 v[2:3], v[0:1], 0, s[0:1]
+; GFX950-NEXT:    s_mov_b64 s[2:3], 0x50
 ; GFX950-NEXT:    s_mov_b64 s[0:1], src_private_base
+; GFX950-NEXT:    v_lshl_add_u64 v[2:3], v[0:1], 0, s[2:3]
 ; GFX950-NEXT:    v_cmp_ne_u32_e32 vcc, s1, v3
 ; GFX950-NEXT:    ;;#ASMSTART
 ; GFX950-NEXT:    ; def v[4:5]
@@ -1107,8 +1107,8 @@ define void @flat_atomic_xchg_i64_ret_av_a(ptr %ptr) #0 {
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    v_add_co_u32_e32 v0, vcc, 0x50, v0
-; GFX90A-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
 ; GFX90A-NEXT:    s_mov_b64 s[4:5], src_private_base
+; GFX90A-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
 ; GFX90A-NEXT:    v_cmp_ne_u32_e32 vcc, s5, v1
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def v[2:3]
@@ -1150,9 +1150,9 @@ define void @flat_atomic_xchg_i64_ret_av_a(ptr %ptr) #0 {
 ; GFX950-LABEL: flat_atomic_xchg_i64_ret_av_a:
 ; GFX950:       ; %bb.0:
 ; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX950-NEXT:    s_mov_b64 s[0:1], 0x50
-; GFX950-NEXT:    v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1]
+; GFX950-NEXT:    s_mov_b64 s[2:3], 0x50
 ; GFX950-NEXT:    s_mov_b64 s[0:1], src_private_base
+; GFX950-NEXT:    v_lshl_add_u64 v[0:1], v[0:1], 0, s[2:3]
 ; GFX950-NEXT:    v_cmp_ne_u32_e32 vcc, s1, v1
 ; GFX950-NEXT:    ;;#ASMSTART
 ; GFX950-NEXT:    ; def v[2:3]
@@ -1201,12 +1201,12 @@ define void @flat_atomic_xchg_i64_ret_a_av(ptr %ptr) #0 {
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    v_add_co_u32_e32 v2, vcc, 0x50, v0
+; GFX90A-NEXT:    s_mov_b64 s[4:5], src_private_base
 ; GFX90A-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v1, vcc
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def a[0:1]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    v_accvgpr_read_b32 v5, a1
-; GFX90A-NEXT:    s_mov_b64 s[4:5], src_private_base
 ; GFX90A-NEXT:    v_accvgpr_read_b32 v4, a0
 ; GFX90A-NEXT:    v_cmp_ne_u32_e32 vcc, s5, v3
 ; GFX90A-NEXT:    ; implicit-def: $vgpr0_vgpr1
@@ -1244,15 +1244,15 @@ define void @flat_atomic_xchg_i64_ret_a_av(ptr %ptr) #0 {
 ; GFX950-LABEL: flat_atomic_xchg_i64_ret_a_av:
 ; GFX950:       ; %bb.0:
 ; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX950-NEXT:    s_mov_b64 s[0:1], 0x50
-; GFX950-NEXT:    v_lshl_add_u64 v[2:3], v[0:1], 0, s[0:1]
+; GFX950-NEXT:    s_mov_b64 s[2:3], 0x50
+; GFX950-NEXT:    s_mov_b64 s[0:1], src_private_base
+; GFX950-NEXT:    v_lshl_add_u64 v[2:3], v[0:1], 0, s[2:3]
 ; GFX950-NEXT:    ;;#ASMSTART
 ; GFX950-NEXT:    ; def a[0:1]
 ; GFX950-NEXT:    ;;#ASMEND
-; GFX950-NEXT:    s_mov_b64 s[0:1], src_private_base
+; GFX950-NEXT:    v_cmp_ne_u32_e32 vcc, s1, v3
 ; GFX950-NEXT:    v_accvgpr_read_b32 v5, a1
 ; GFX950-NEXT:    v_accvgpr_read_b32 v4, a0
-; GFX950-NEXT:    v_cmp_ne_u32_e32 vcc, s1, v3
 ; GFX950-NEXT:    ; implicit-def: $vgpr0_vgpr1
 ; GFX950-NEXT:    s_and_saveexec_b64 s[0:1], vcc
 ; GFX950-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
@@ -1295,8 +1295,8 @@ define void @flat_atomic_xchg_i64_ret_v_av(ptr %ptr) #0 {
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    v_add_co_u32_e32 v2, vcc, 0x50, v0
-; GFX90A-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v1, vcc
 ; GFX90A-NEXT:    s_mov_b64 s[4:5], src_private_base
+; GFX90A-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v1, vcc
 ; GFX90A-NEXT:    v_cmp_ne_u32_e32 vcc, s5, v3
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def v[4:5]
@@ -1336,9 +1336,9 @@ define void @flat_atomic_xchg_i64_ret_v_av(ptr %ptr) #0 {
 ; GFX950-LABEL: flat_atomic_xchg_i64_ret_v_av:
 ; GFX950:       ; %bb.0:
 ; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX950-NEXT:    s_mov_b64 s[0:1], 0x50
-; GFX950-NEXT:    v_lshl_add_u64 v[2:3], v[0:1], 0, s[0:1]
+; GFX950-NEXT:    s_mov_b64 s[2:3], 0x50
 ; GFX950-NEXT:    s_mov_b64 s[0:1], src_private_base
+; GFX950-NEXT:    v_lshl_add_u64 v[2:3], v[0:1], 0, s[2:3]
 ; GFX950-NEXT:    v_cmp_ne_u32_e32 vcc, s1, v3
 ; GFX950-NEXT:    ;;#ASMSTART
 ; GFX950-NEXT:    ; def v[4:5]
@@ -1383,11 +1383,11 @@ define void @flat_atomic_xchg_i64_noret_a(ptr %ptr) #0 {
 ; GFX90A-LABEL: flat_atomic_xchg_i64_noret_a:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    s_mov_b64 s[4:5], src_private_base
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def a[0:1]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    v_accvgpr_read_b32 v3, a1
-; GFX90A-NEXT:    s_mov_b64 s[4:5], src_private_base
 ; GFX90A-NEXT:    v_accvgpr_read_b32 v2, a0
 ; GFX90A-NEXT:    v_cmp_ne_u32_e32 vcc, s5, v1
 ; GFX90A-NEXT:    s_and_saveexec_b64 s[4:5], vcc
@@ -2487,11 +2487,11 @@ define void @flat_atomic_xor_expansion_i64_ret_a_a(ptr %ptr) #0 {
 ; GFX90A-LABEL: flat_atomic_xor_expansion_i64_ret_a_a:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    s_mov_b64 s[4:5], src_private_base
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def a[0:1]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    v_accvgpr_read_b32 v7, a1
-; GFX90A-NEXT:    s_mov_b64 s[4:5], src_private_base
 ; GFX90A-NEXT:    v_accvgpr_read_b32 v6, a0
 ; GFX90A-NEXT:    v_cmp_ne_u32_e32 vcc, s5, v1
 ; GFX90A-NEXT:    ; implicit-def: $agpr0_agpr1
@@ -2549,13 +2549,13 @@ define void @flat_atomic_xor_expansion_i64_ret_a_a(ptr %ptr) #0 {
 ; GFX950-LABEL: flat_atomic_xor_expansion_i64_ret_a_a:
 ; GFX950:       ; %bb.0:
 ; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT:    s_mov_b64 s[0:1], src_private_base
 ; GFX950-NEXT:    ;;#ASMSTART
 ; GFX950-NEXT:    ; def a[0:1]
 ; GFX950-NEXT:    ;;#ASMEND
-; GFX950-NEXT:    s_mov_b64 s[0:1], src_private_base
+; GFX950-NEXT:    v_cmp_ne_u32_e32 vcc, s1, v1
 ; GFX950-NEXT:    v_accvgpr_read_b32 v7, a1
 ; GFX950-NEXT:    v_accvgpr_read_b32 v6, a0
-; GFX950-NEXT:    v_cmp_ne_u32_e32 vcc, s1, v1
 ; GFX950-NEXT:    ; implicit-def: $agpr0_agpr1
 ; GFX950-NEXT:    s_and_saveexec_b64 s[0:1], vcc
 ; GFX950-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
@@ -2616,11 +2616,11 @@ define void @flat_atomic_xor_expansion_i64_ret_a_v(ptr %ptr) #0 {
 ; GFX90A-LABEL: flat_atomic_xor_expansion_i64_ret_a_v:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    s_mov_b64 s[4:5], src_private_base
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def a[0:1]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    v_accvgpr_read_b32 v3, a1
-; GFX90A-NEXT:    s_mov_b64 s[4:5], src_private_base
 ; GFX90A-NEXT:    v_accvgpr_read_b32 v2, a0
 ; GFX90A-NEXT:    v_cmp_ne_u32_e32 vcc, s5, v1
 ; GFX90A-NEXT:    ; implicit-def: $vgpr4_vgpr5
@@ -2674,13 +2674,13 @@ define void @flat_atomic_xor_expansion_i64_ret_a_v(ptr %ptr) #0 {
 ; GFX950-LABEL: flat_atomic_xor_expansion_i64_ret_a_v:
 ; GFX950:       ; %bb.0:
 ; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT:    s_mov_b64 s[0:1], src_private_base
 ; GFX950-NEXT:    ;;#ASMSTART
 ; GFX950-NEXT:    ; def a[0:1]
 ; GFX950-NEXT:    ;;#ASMEND
-; GFX950-NEXT:    s_mov_b64 s[0:1], src_private_base
+; GFX950-NEXT:    v_cmp_ne_u32_e32 vcc, s1, v1
 ; GFX950-NEXT:    v_accvgpr_read_b32 v3, a1
 ; GFX950-NEXT:    v_accvgpr_read_b32 v2, a0
-; GFX950-NEXT:    v_cmp_ne_u32_e32 vcc, s1, v1
 ; GFX950-NEXT:    ; implicit-def: $vgpr4_vgpr5
 ; GFX950-NEXT:    s_and_saveexec_b64 s[0:1], vcc
 ; GFX950-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
@@ -3221,11 +3221,11 @@ define void @flat_atomic_xor_expansion_i64_ret_a_av(ptr %ptr) #0 {
 ; GFX90A-LABEL: flat_atomic_xor_expansion_i64_ret_a_av:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    s_mov_b64 s[4:5], src_private_base
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def a[0:1]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    v_accvgpr_read_b32 v3, a1
-; GFX90A-NEXT:    s_mov_b64 s[4:5], src_private_base
 ; GFX90A-NEXT:    v_accvgpr_read_b32 v2, a0
 ; GFX90A-NEXT:    v_cmp_ne_u32_e32 vcc, s5, v1
 ; GFX90A-NEXT:    ; implicit-def: $vgpr4_vgpr5
@@ -3279,13 +3279,13 @@ define void @flat_atomic_xor_expansion_i64_ret_a_av(ptr %ptr) #0 {
 ; GFX950-LABEL: flat_atomic_xor_expansion_i64_ret_a_av:
 ; GFX950:       ; %bb.0:
 ; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT:    s_mov_b64 s[0:1], src_private_base
 ; GFX950-NEXT:    ;;#ASMSTART
 ; GFX950-NEXT:    ; def a[0:1]
 ; GFX950-NEXT:    ;;#ASMEND
-; GFX950-NEXT:    s_mov_b64 s[0:1], src_private_base
+; GFX950-NEXT:    v_cmp_ne_u32_e32 vcc, s1, v1
 ; GFX950-NEXT:    v_accvgpr_read_b32 v3, a1
 ; GFX950-NEXT:    v_accvgpr_read_b32 v2, a0
-; GFX950-NEXT:    v_cmp_ne_u32_e32 vcc, s1, v1
 ; GFX950-NEXT:    ; implicit-def: $vgpr4_vgpr5
 ; GFX950-NEXT:    s_and_saveexec_b64 s[0:1], vcc
 ; GFX950-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
@@ -3458,11 +3458,11 @@ define void @flat_atomic_xor_expansion_i64_noret_a(ptr %ptr) #0 {
 ; GFX90A-LABEL: flat_atomic_xor_expansion_i64_noret_a:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    s_mov_b64 s[4:5], src_private_base
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def a[0:1]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    v_accvgpr_read_b32 v7, a1
-; GFX90A-NEXT:    s_mov_b64 s[4:5], src_private_base
 ; GFX90A-NEXT:    v_accvgpr_read_b32 v6, a0
 ; GFX90A-NEXT:    v_cmp_ne_u32_e32 vcc, s5, v1
 ; GFX90A-NEXT:    s_and_saveexec_b64 s[4:5], vcc
@@ -3516,13 +3516,13 @@ define void @flat_atomic_xor_expansion_i64_noret_a(ptr %ptr) #0 {
 ; GFX950-LABEL: flat_atomic_xor_expansion_i64_noret_a:
 ; GFX950:       ; %bb.0:
 ; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT:    s_mov_b64 s[0:1], src_private_base
 ; GFX950-NEXT:    ;;#ASMSTART
 ; GFX950-NEXT:    ; def a[0:1]
 ; GFX950-NEXT:    ;;#ASMEND
-; GFX950-NEXT:    s_mov_b64 s[0:1], src_private_base
+; GFX950-NEXT:    v_cmp_ne_u32_e32 vcc, s1, v1
 ; GFX950-NEXT:    v_accvgpr_read_b32 v7, a1
 ; GFX950-NEXT:    v_accvgpr_read_b32 v6, a0
-; GFX950-NEXT:    v_cmp_ne_u32_e32 vcc, s1, v1
 ; GFX950-NEXT:    s_and_saveexec_b64 s[0:1], vcc
 ; GFX950-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
 ; GFX950-NEXT:    s_cbranch_execnz .LBB40_3
@@ -4304,11 +4304,11 @@ define void @flat_atomic_xor_i64_ret_a_a(ptr %ptr) #0 {
 ; GFX90A-LABEL: flat_atomic_xor_i64_ret_a_a:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    s_mov_b64 s[4:5], src_private_base
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def a[0:1]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    v_accvgpr_read_b32 v3, a1
-; GFX90A-NEXT:    s_mov_b64 s[4:5], src_private_base
 ; GFX90A-NEXT:    v_accvgpr_read_b32 v2, a0
 ; GFX90A-NEXT:    v_cmp_ne_u32_e32 vcc, s5, v1
 ; GFX90A-NEXT:    ; implicit-def: $agpr0_agpr1
@@ -4350,13 +4350,13 @@ define void @flat_atomic_xor_i64_ret_a_a(ptr %ptr) #0 {
 ; GFX950-LABEL: flat_atomic_xor_i64_ret_a_a:
 ; GFX950:       ; %bb.0:
 ; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT:    s_mov_b64 s[0:1], src_private_base
 ; GFX950-NEXT:    ;;#ASMSTART
 ; GFX950-NEXT:    ; def a[0:1]
 ; GFX950-NEXT:    ;;#ASMEND
-; GFX950-NEXT:    s_mov_b64 s[0:1], src_private_base
+; GFX950-NEXT:    v_cmp_ne_u32_e32 vcc, s1, v1
 ; GFX950-NEXT:    v_accvgpr_read_b32 v3, a1
 ; GFX950-NEXT:    v_accvgpr_read_b32 v2, a0
-; GFX950-NEXT:    v_cmp_ne_u32_e32 vcc, s1, v1
 ; GFX950-NEXT:    ; implicit-def: $agpr0_agpr1
 ; GFX950-NEXT:    s_and_saveexec_b64 s[0:1], vcc
 ; GFX950-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
@@ -4403,11 +4403,11 @@ define void @flat_atomic_xor_i64_ret_a_v(ptr %ptr) #0 {
 ; GFX90A-LABEL: flat_atomic_xor_i64_ret_a_v:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    s_mov_b64 s[4:5], src_private_base
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def a[0:1]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    v_accvgpr_read_b32 v5, a1
-; GFX90A-NEXT:    s_mov_b64 s[4:5], src_private_base
 ; GFX90A-NEXT:    v_accvgpr_read_b32 v4, a0
 ; GFX90A-NEXT:    v_cmp_ne_u32_e32 vcc, s5, v1
 ; GFX90A-NEXT:    ; implicit-def: $vgpr2_vgpr3
@@ -4445,13 +4445,13 @@ define void @flat_atomic_xor_i64_ret_a_v(ptr %ptr) #0 {
 ; GFX950-LABEL: flat_atomic_xor_i64_ret_a_v:
 ; GFX950:       ; %bb.0:
 ; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT:    s_mov_b64 s[0:1], src_private_base
 ; GFX950-NEXT:    ;;#ASMSTART
 ; GFX950-NEXT:    ; def a[0:1]
 ; GFX950-NEXT:    ;;#ASMEND
-; GFX950-NEXT:    s_mov_b64 s[0:1], src_private_base
+; GFX950-NEXT:    v_cmp_ne_u32_e32 vcc, s1, v1
 ; GFX950-NEXT:    v_accvgpr_read_b32 v5, a1
 ; GFX950-NEXT:    v_accvgpr_read_b32 v4, a0
-; GFX950-NEXT:    v_cmp_ne_u32_e32 vcc, s1, v1
 ; GFX950-NEXT:    ; implicit-def: $vgpr2_vgpr3
 ; GFX950-NEXT:    s_and_saveexec_b64 s[0:1], vcc
 ; GFX950-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
@@ -4858,11 +4858,11 @@ define void @flat_atomic_xor_i64_ret_a_av(ptr %ptr) #0 {
 ; GFX90A-LABEL: flat_atomic_xor_i64_ret_a_av:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    s_mov_b64 s[4:5], src_private_base
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def a[0:1]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    v_accvgpr_read_b32 v5, a1
-; GFX90A-NEXT:    s_mov_b64 s[4:5], src_private_base
 ; GFX90A-NEXT:    v_accvgpr_read_b32 v4, a0
 ; GFX90A-NEXT:    v_cmp_ne_u32_e32 vcc, s5, v1
 ; GFX90A-NEXT:    ; implicit-def: $vgpr2_vgpr3
@@ -4900,13 +4900,13 @@ define void @flat_atomic_xor_i64_ret_a_av(ptr %ptr) #0 {
 ; GFX950-LABEL: flat_atomic_xor_i64_ret_a_av:
 ; GFX950:       ; %bb.0:
 ; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT:    s_mov_b64 s[0:1], src_private_base
 ; GFX950-NEXT:    ;;#ASMSTART
 ; GFX950-NEXT:    ; def a[0:1]
 ; GFX950-NEXT:    ;;#ASMEND
-; GFX950-NEXT:    s_mov_b64 s[0:1], src_private_base
+; GFX950-NEXT:    v_cmp_ne_u32_e32 vcc, s1, v1
 ; GFX950-NEXT:    v_accvgpr_read_b32 v5, a1
 ; GFX950-NEXT:    v_accvgpr_read_b32 v4, a0
-; GFX950-NEXT:    v_cmp_ne_u32_e32 vcc, s1, v1
 ; GFX950-NEXT:    ; implicit-def: $vgpr2_vgpr3
 ; GFX950-NEXT:    s_and_saveexec_b64 s[0:1], vcc
 ; GFX950-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
@@ -5035,11 +5035,11 @@ define void @flat_atomic_xor_i64_noret_a(ptr %ptr) #0 {
 ; GFX90A-LABEL: flat_atomic_xor_i64_noret_a:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    s_mov_b64 s[4:5], src_private_base
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def a[0:1]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    v_accvgpr_read_b32 v3, a1
-; GFX90A-NEXT:    s_mov_b64 s[4:5], src_private_base
 ; GFX90A-NEXT:    v_accvgpr_read_b32 v2, a0
 ; GFX90A-NEXT:    v_cmp_ne_u32_e32 vcc, s5, v1
 ; GFX90A-NEXT:    s_and_saveexec_b64 s[4:5], vcc
@@ -5077,13 +5077,13 @@ define void @flat_atomic_xor_i64_noret_a(ptr %ptr) #0 {
 ; GFX950-LABEL: flat_atomic_xor_i64_noret_a:
 ; GFX950:       ; %bb.0:
 ; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT:    s_mov_b64 s[0:1], src_private_base
 ; GFX950-NEXT:    ;;#ASMSTART
 ; GFX950-NEXT:    ; def a[0:1]
 ; GFX950-NEXT:    ;;#ASMEND
-; GFX950-NEXT:    s_mov_b64 s[0:1], src_private_base
+; GFX950-NEXT:    v_cmp_ne_u32_e32 vcc, s1, v1
 ; GFX950-NEXT:    v_accvgpr_read_b32 v3, a1
 ; GFX950-NEXT:    v_accvgpr_read_b32 v2, a0
-; GFX950-NEXT:    v_cmp_ne_u32_e32 vcc, s1, v1
 ; GFX950-NEXT:    s_and_saveexec_b64 s[0:1], vcc
 ; GFX950-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
 ; GFX950-NEXT:    s_cbranch_execnz .LBB61_3
@@ -6306,12 +6306,12 @@ define void @flat_atomic_add_i64_ret_a_a(ptr %ptr) #0 {
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    v_add_co_u32_e32 v0, vcc, 0x50, v0
+; GFX90A-NEXT:    s_mov_b64 s[4:5], src_private_base
 ; GFX90A-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def a[0:1]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    v_accvgpr_read_b32 v3, a1
-; GFX90A-NEXT:    s_mov_b64 s[4:5], src_private_base
 ; GFX90A-NEXT:    v_accvgpr_read_b32 v2, a0
 ; GFX90A-NEXT:    v_cmp_ne_u32_e32 vcc, s5, v1
 ; GFX90A-NEXT:    ; implicit-def: $agpr0_agpr1
@@ -6352,15 +6352,15 @@ define void @flat_atomic_add_i64_ret_a_a(ptr %ptr) #0 {
 ; GFX950-LABEL: flat_atomic_add_i64_ret_a_a:
 ; GFX950:       ; %bb.0:
 ; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX950-NEXT:    s_mov_b64 s[0:1], 0x50
-; GFX950-NEXT:    v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1]
+; GFX950-NEXT:    s_mov_b64 s[2:3], 0x50
+; GFX950-NEXT:    s_mov_b64 s[0:1], src_private_base
+; GFX950-NEXT:    v_lshl_add_u64 v[0:1], v[0:1], 0, s[2:3]
 ; GFX950-NEXT:    ;;#ASMSTART
 ; GFX950-NEXT:    ; def a[0:1]
 ; GFX950-NEXT:    ;;#ASMEND
-; GFX950-NEXT:    s_mov_b64 s[0:1], src_private_base
+; GFX950-NEXT:    v_cmp_ne_u32_e32 vcc, s1, v1
 ; GFX950-NEXT:    v_accvgpr_read_b32 v3, a1
 ; GFX950-NEXT:    v_accvgpr_read_b32 v2, a0
-; GFX950-NEXT:    v_cmp_ne_u32_e32 vcc, s1, v1
 ; GFX950-NEXT:    ; implicit-def: $agpr0_agpr1
 ; GFX950-NEXT:    s_and_saveexec_b64 s[0:1], vcc
 ; GFX950-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
@@ -6404,8 +6404,8 @@ define void @flat_atomic_add_i64_ret_av_av(ptr %ptr) #0 {
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    v_add_co_u32_e32 v2, vcc, 0x50, v0
-; GFX90A-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v1, vcc
 ; GFX90A-NEXT:    s_mov_b64 s[4:5], src_private_base
+; GFX90A-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v1, vcc
 ; GFX90A-NEXT:    v_cmp_ne_u32_e32 vcc, s5, v3
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def v[4:5]
@@ -6443,9 +6443,9 @@ define void @flat_atomic_add_i64_ret_av_av(ptr %ptr) #0 {
 ; GFX950-LABEL: flat_atomic_add_i64_ret_av_av:
 ; GFX950:       ; %bb.0:
 ; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX950-NEXT:    s_mov_b64 s[0:1], 0x50
-; GFX950-NEXT:    v_lshl_add_u64 v[2:3], v[0:1], 0, s[0:1]
+; GFX950-NEXT:    s_mov_b64 s[2:3], 0x50
 ; GFX950-NEXT:    s_mov_b64 s[0:1], src_private_base
+; GFX950-NEXT:    v_lshl_add_u64 v[2:3], v[0:1], 0, s[2:3]
 ; GFX950-NEXT:    v_cmp_ne_u32_e32 vcc, s1, v3
 ; GFX950-NEXT:    ;;#ASMSTART
 ; GFX950-NEXT:    ; def v[4:5]
@@ -6489,12 +6489,12 @@ define void @flat_atomic_sub_i64_ret_a_a(ptr %ptr) #0 {
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    v_add_co_u32_e32 v0, vcc, 0x50, v0
+; GFX90A-NEXT:    s_mov_b64 s[4:5], src_private_base
 ; GFX90A-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def a[0:1]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    v_accvgpr_read_b32 v3, a1
-; GFX90A-NEXT:    s_mov_b64 s[4:5], src_private_base
 ; GFX90A-NEXT:    v_accvgpr_read_b32 v2, a0
 ; GFX90A-NEXT:    v_cmp_ne_u32_e32 vcc, s5, v1
 ; GFX90A-NEXT:    ; implicit-def: $agpr0_agpr1
@@ -6535,15 +6535,15 @@ define void @flat_atomic_sub_i64_ret_a_a(ptr %ptr) #0 {
 ; GFX950-LABEL: flat_atomic_sub_i64_ret_a_a:
 ; GFX950:       ; %bb.0:
 ; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX950-NEXT:    s_mov_b64 s[0:1], 0x50
-; GFX950-NEXT:    v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1]
+; GFX950-NEXT:    s_mov_b64 s[2:3], 0x50
+; GFX950-NEXT:    s_mov_b64 s[0:1], src_private_base
+; GFX950-NEXT:    v_lshl_add_u64 v[0:1], v[0:1], 0, s[2:3]
 ; GFX950-NEXT:    ;;#ASMSTART
 ; GFX950-NEXT:    ; def a[0:1]
 ; GFX950-NEXT:    ;;#ASMEND
-; GFX950-NEXT:    s_mov_b64 s[0:1], src_private_base
+; GFX950-NEXT:    v_cmp_ne_u32_e32 vcc, s1, v1
 ; GFX950-NEXT:    v_accvgpr_read_b32 v3, a1
 ; GFX950-NEXT:    v_accvgpr_read_b32 v2, a0
-; GFX950-NEXT:    v_cmp_ne_u32_e32 vcc, s1, v1
 ; GFX950-NEXT:    ; implicit-def: $agpr0_agpr1
 ; GFX950-NEXT:    s_and_saveexec_b64 s[0:1], vcc
 ; GFX950-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
@@ -6589,8 +6589,8 @@ define void @flat_atomic_sub_i64_ret_av_av(ptr %ptr) #0 {
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    v_add_co_u32_e32 v2, vcc, 0x50, v0
-; GFX90A-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v1, vcc
 ; GFX90A-NEXT:    s_mov_b64 s[4:5], src_private_base
+; GFX90A-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v1, vcc
 ; GFX90A-NEXT:    v_cmp_ne_u32_e32 vcc, s5, v3
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def v[4:5]
@@ -6628,9 +6628,9 @@ define void @flat_atomic_sub_i64_ret_av_av(ptr %ptr) #0 {
 ; GFX950-LABEL: flat_atomic_sub_i64_ret_av_av:
 ; GFX950:       ; %bb.0:
 ; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX950-NEXT:    s_mov_b64 s[0:1], 0x50
-; GFX950-NEXT:    v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1]
+; GFX950-NEXT:    s_mov_b64 s[2:3], 0x50
 ; GFX950-NEXT:    s_mov_b64 s[0:1], src_private_base
+; GFX950-NEXT:    v_lshl_add_u64 v[4:5], v[0:1], 0, s[2:3]
 ; GFX950-NEXT:    v_cmp_ne_u32_e32 vcc, s1, v5
 ; GFX950-NEXT:    ;;#ASMSTART
 ; GFX950-NEXT:    ; def v[2:3]
@@ -6676,12 +6676,12 @@ define void @flat_atomic_and_i64_ret_a_a(ptr %ptr) #0 {
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    v_add_co_u32_e32 v0, vcc, 0x50, v0
+; GFX90A-NEXT:    s_mov_b64 s[4:5], src_private_base
 ; GFX90A-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def a[0:1]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    v_accvgpr_read_b32 v3, a1
-; GFX90A-NEXT:    s_mov_b64 s[4:5], src_private_base
 ; GFX90A-NEXT:    v_accvgpr_read_b32 v2, a0
 ; GFX90A-NEXT:    v_cmp_ne_u32_e32 vcc, s5, v1
 ; GFX90A-NEXT:    ; implicit-def: $agpr0_agpr1
@@ -6722,15 +6722,15 @@ define void @flat_atomic_and_i64_ret_a_a(ptr %ptr) #0 {
 ; GFX950-LABEL: flat_atomic_and_i64_ret_a_a:
 ; GFX950:       ; %bb.0:
 ; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX950-NEXT:    s_mov_b64 s[0:1], 0x50
-; GFX950-NEXT:    v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1]
+; GFX950-NEXT:    s_mov_b64 s[2:3], 0x50
+; GFX950-NEXT:    s_mov_b64 s[0:1], src_private_base
+; GFX950-NEXT:    v_lshl_add_u64 v[0:1], v[0:1], 0, s[2:3]
 ; GFX950-NEXT:    ;;#ASMSTART
 ; GFX950-NEXT:    ; def a[0:1]
 ; GFX950-NEXT:    ;;#ASMEND
-; GFX950-NEXT:    s_mov_b64 s[0:1], src_private_base
+; GFX950-NEXT:    v_cmp_ne_u32_e32 vcc, s1, v1
 ; GFX950-NEXT:    v_accvgpr_read_b32 v3, a1
 ; GFX950-NEXT:    v_accvgpr_read_b32 v2, a0
-; GFX950-NEXT:    v_cmp_ne_u32_e32 vcc, s1, v1
 ; GFX950-NEXT:    ; implicit-def: $agpr0_agpr1
 ; GFX950-NEXT:    s_and_saveexec_b64 s[0:1], vcc
 ; GFX950-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
@@ -6775,8 +6775,8 @@ define void @flat_atomic_and_i64_ret_av_av(ptr %ptr) #0 {
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    v_add_co_u32_e32 v4, vcc, 0x50, v0
-; GFX90A-NEXT:    v_addc_co_u32_e32 v5, vcc, 0, v1, vcc
 ; GFX90A-NEXT:    s_mov_b64 s[4:5], src_private_base
+; GFX90A-NEXT:    v_addc_co_u32_e32 v5, vcc, 0, v1, vcc
 ; GFX90A-NEXT:    v_cmp_ne_u32_e32 vcc, s5, v5
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def v[2:3]
@@ -6814,9 +6814,9 @@ define void @flat_atomic_and_i64_ret_av_av(ptr %ptr) #0 {
 ; GFX950-LABEL: flat_atomic_and_i64_ret_av_av:
 ; GFX950:       ; %bb.0:
 ; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX950-NEXT:    s_mov_b64 s[0:1], 0x50
-; GFX950-NEXT:    v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1]
+; GFX950-NEXT:    s_mov_b64 s[2:3], 0x50
 ; GFX950-NEXT:    s_mov_b64 s[0:1], src_private_base
+; GFX950-NEXT:    v_lshl_add_u64 v[4:5], v[0:1], 0, s[2:3]
 ; GFX950-NEXT:    v_cmp_ne_u32_e32 vcc, s1, v5
 ; GFX950-NEXT:    ;;#ASMSTART
 ; GFX950-NEXT:    ; def v[2:3]
@@ -6861,12 +6861,12 @@ define void @flat_atomic_nand_i64_ret_a_a(ptr %ptr) #0 {
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    v_add_co_u32_e32 v4, vcc, 0x50, v0
+; GFX90A-NEXT:    s_mov_b64 s[4:5], src_private_base
 ; GFX90A-NEXT:    v_addc_co_u32_e32 v5, vcc, 0, v1, vcc
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def a[0:1]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    v_accvgpr_read_b32 v7, a1
-; GFX90A-NEXT:    s_mov_b64 s[4:5], src_private_base
 ; GFX90A-NEXT:    v_accvgpr_read_b32 v6, a0
 ; GFX90A-NEXT:    v_cmp_ne_u32_e32 vcc, s5, v5
 ; GFX90A-NEXT:    ; implicit-def: $agpr0_agpr1
@@ -6925,15 +6925,15 @@ define void @flat_atomic_nand_i64_ret_a_a(ptr %ptr) #0 {
 ; GFX950-LABEL: flat_atomic_nand_i64_ret_a_a:
 ; GFX950:       ; %bb.0:
 ; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX950-NEXT:    s_mov_b64 s[0:1], 0x50
-; GFX950-NEXT:    v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1]
+; GFX950-NEXT:    s_mov_b64 s[2:3], 0x50
+; GFX950-NEXT:    s_mov_b64 s[0:1], src_private_base
+; GFX950-NEXT:    v_lshl_add_u64 v[4:5], v[0:1], 0, s[2:3]
 ; GFX950-NEXT:    ;;#ASMSTART
 ; GFX950-NEXT:    ; def a[0:1]
 ; GFX950-NEXT:    ;;#ASMEND
-; GFX950-NEXT:    s_mov_b64 s[0:1], src_private_base
+; GFX950-NEXT:    v_cmp_ne_u32_e32 vcc, s1, v5
 ; GFX950-NEXT:    v_accvgpr_read_b32 v7, a1
 ; GFX950-NEXT:    v_accvgpr_read_b32 v6, a0
-; GFX950-NEXT:    v_cmp_ne_u32_e32 vcc, s1, v5
 ; GFX950-NEXT:    ; implicit-def: $agpr0_agpr1
 ; GFX950-NEXT:    s_and_saveexec_b64 s[0:1], vcc
 ; GFX950-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
@@ -6996,8 +6996,8 @@ define void @flat_atomic_nand_i64_ret_av_av(ptr %ptr) #0 {
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    v_add_co_u32_e32 v2, vcc, 0x50, v0
-; GFX90A-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v1, vcc
 ; GFX90A-NEXT:    s_mov_b64 s[4:5], src_private_base
+; GFX90A-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v1, vcc
 ; GFX90A-NEXT:    v_cmp_ne_u32_e32 vcc, s5, v3
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def v[0:1]
@@ -7054,9 +7054,9 @@ define void @flat_atomic_nand_i64_ret_av_av(ptr %ptr) #0 {
 ; GFX950-LABEL: flat_atomic_nand_i64_ret_av_av:
 ; GFX950:       ; %bb.0:
 ; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX950-NEXT:    s_mov_b64 s[0:1], 0x50
-; GFX950-NEXT:    v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1]
+; GFX950-NEXT:    s_mov_b64 s[2:3], 0x50
 ; GFX950-NEXT:    s_mov_b64 s[0:1], src_private_base
+; GFX950-NEXT:    v_lshl_add_u64 v[4:5], v[0:1], 0, s[2:3]
 ; GFX950-NEXT:    v_cmp_ne_u32_e32 vcc, s1, v5
 ; GFX950-NEXT:    ;;#ASMSTART
 ; GFX950-NEXT:    ; def v[0:1]
@@ -7119,12 +7119,12 @@ define void @flat_atomic_or_i64_ret_a_a(ptr %ptr) #0 {
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    v_add_co_u32_e32 v0, vcc, 0x50, v0
+; GFX90A-NEXT:    s_mov_b64 s[4:5], src_private_base
 ; GFX90A-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def a[0:1]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    v_accvgpr_read_b32 v3, a1
-; GFX90A-NEXT:    s_mov_b64 s[4:5], src_private_base
 ; GFX90A-NEXT:    v_accvgpr_read_b32 v2, a0
 ; GFX90A-NEXT:    v_cmp_ne_u32_e32 vcc, s5, v1
 ; GFX90A-NEXT:    ; implicit-def: $agpr0_agpr1
@@ -7165,15 +7165,15 @@ define void @flat_atomic_or_i64_ret_a_a(ptr %ptr) #0 {
 ; GFX950-LABEL: flat_atomic_or_i64_ret_a_a:
 ; GFX950:       ; %bb.0:
 ; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX950-NEXT:    s_mov_b64 s[0:1], 0x50
-; GFX950-NEXT:    v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1]
+; GFX950-NEXT:    s_mov_b64 s[2:3], 0x50
+; GFX950-NEXT:    s_mov_b64 s[0:1], src_private_base
+; GFX950-NEXT:    v_lshl_add_u64 v[0:1], v[0:1], 0, s[2:3]
 ; GFX950-NEXT:    ;;#ASMSTART
 ; GFX950-NEXT:    ; def a[0:1]
 ; GFX950-NEXT:    ;;#ASMEND
-; GFX950-NEXT:    s_mov_b64 s[0:1], src_private_base
+; GFX950-NEXT:    v_cmp_ne_u32_e32 vcc, s1, v1
 ; GFX950-NEXT:    v_accvgpr_read_b32 v3, a1
 ; GFX950-NEXT:    v_accvgpr_read_b32 v2, a0
-; GFX950-NEXT:    v_cmp_ne_u32_e32 vcc, s1, v1
 ; GFX950-NEXT:    ; implicit-def: $agpr0_agpr1
 ; GFX950-NEXT:    s_and_saveexec_b64 s[0:1], vcc
 ; GFX950-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
@@ -7218,8 +7218,8 @@ define void @flat_atomic_or_i64_ret_av_av(ptr %ptr) #0 {
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    v_add_co_u32_e32 v4, vcc, 0x50, v0
-; GFX90A-NEXT:    v_addc_co_u32_e32 v5, vcc, 0, v1, vcc
 ; GFX90A-NEXT:    s_mov_b64 s[4:5], src_private_base
+; GFX90A-NEXT:    v_addc_co_u32_e32 v5, vcc, 0, v1, vcc
 ; GFX90A-NEXT:    v_cmp_ne_u32_e32 vcc, s5, v5
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def v[2:3]
@@ -7257,9 +7257,9 @@ define void @flat_atomic_or_i64_ret_av_av(ptr %ptr) #0 {
 ; GFX950-LABEL: flat_atomic_or_i64_ret_av_av:
 ; GFX950:       ; %bb.0:
 ; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX950-NEXT:    s_mov_b64 s[0:1], 0x50
-; GFX950-NEXT:    v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1]
+; GFX950-NEXT:    s_mov_b64 s[2:3], 0x50
 ; GFX950-NEXT:    s_mov_b64 s[0:1], src_private_base
+; GFX950-NEXT:    v_lshl_add_u64 v[4:5], v[0:1], 0, s[2:3]
 ; GFX950-NEXT:    v_cmp_ne_u32_e32 vcc, s1, v5
 ; GFX950-NEXT:    ;;#ASMSTART
 ; GFX950-NEXT:    ; def v[2:3]
@@ -7304,12 +7304,12 @@ define void @flat_atomic_max_i64_ret_a_a(ptr %ptr) #0 {
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    v_add_co_u32_e32 v2, vcc, 0x50, v0
+; GFX90A-NEXT:    s_mov_b64 s[4:5], src_private_base
 ; GFX90A-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v1, vcc
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def a[0:1]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    v_accvgpr_read_b32 v0, a0
-; GFX90A-NEXT:    s_mov_b64 s[4:5], src_private_base
 ; GFX90A-NEXT:    v_accvgpr_read_b32 v1, a1
 ; GFX90A-NEXT:    v_cmp_ne_u32_e32 vcc, s5, v3
 ; GFX90A-NEXT:    ; implicit-def: $agpr0_agpr1
@@ -7351,15 +7351,15 @@ define void @flat_atomic_max_i64_ret_a_a(ptr %ptr) #0 {
 ; GFX950-LABEL: flat_atomic_max_i64_ret_a_a:
 ; GFX950:       ; %bb.0:
 ; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX950-NEXT:    s_mov_b64 s[0:1], 0x50
-; GFX950-NEXT:    v_lshl_add_u64 v[2:3], v[0:1], 0, s[0:1]
+; GFX950-NEXT:    s_mov_b64 s[2:3], 0x50
+; GFX950-NEXT:    s_mov_b64 s[0:1], src_private_base
+; GFX950-NEXT:    v_lshl_add_u64 v[2:3], v[0:1], 0, s[2:3]
 ; GFX950-NEXT:    ;;#ASMSTART
 ; GFX950-NEXT:    ; def a[0:1]
 ; GFX950-NEXT:    ;;#ASMEND
-; GFX950-NEXT:    s_mov_b64 s[0:1], src_private_base
+; GFX950-NEXT:    v_cmp_ne_u32_e32 vcc, s1, v3
 ; GFX950-NEXT:    v_accvgpr_read_b32 v0, a0
 ; GFX950-NEXT:    v_accvgpr_read_b32 v1, a1
-; GFX950-NEXT:    v_cmp_ne_u32_e32 vcc, s1, v3
 ; GFX950-NEXT:    ; implicit-def: $agpr0_agpr1
 ; GFX950-NEXT:    s_and_saveexec_b64 s[0:1], vcc
 ; GFX950-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
@@ -7406,8 +7406,8 @@ define void @flat_atomic_max_i64_ret_av_av(ptr %ptr) #0 {
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    v_add_co_u32_e32 v4, vcc, 0x50, v0
-; GFX90A-NEXT:    v_addc_co_u32_e32 v5, vcc, 0, v1, vcc
 ; GFX90A-NEXT:    s_mov_b64 s[4:5], src_private_base
+; GFX90A-NEXT:    v_addc_co_u32_e32 v5, vcc, 0, v1, vcc
 ; GFX90A-NEXT:    v_cmp_ne_u32_e32 vcc, s5, v5
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def v[2:3]
@@ -7446,9 +7446,9 @@ define void @flat_atomic_max_i64_ret_av_av(ptr %ptr) #0 {
 ; GFX950-LABEL: flat_atomic_max_i64_ret_av_av:
 ; GFX950:       ; %bb.0:
 ; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX950-NEXT:    s_mov_b64 s[0:1], 0x50
-; GFX950-NEXT:    v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1]
+; GFX950-NEXT:    s_mov_b64 s[2:3], 0x50
 ; GFX950-NEXT:    s_mov_b64 s[0:1], src_private_base
+; GFX950-NEXT:    v_lshl_add_u64 v[4:5], v[0:1], 0, s[2:3]
 ; GFX950-NEXT:    v_cmp_ne_u32_e32 vcc, s1, v5
 ; GFX950-NEXT:    ;;#ASMSTART
 ; GFX950-NEXT:    ; def v[2:3]
@@ -7495,12 +7495,12 @@ define void @flat_atomic_min_i64_ret_a_a(ptr %ptr) #0 {
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    v_add_co_u32_e32 v2, vcc, 0x50, v0
+; GFX90A-NEXT:    s_mov_b64 s[4:5], src_private_base
 ; GFX90A-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v1, vcc
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def a[0:1]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    v_accvgpr_read_b32 v0, a0
-; GFX90A-NEXT:    s_mov_b64 s[4:5], src_private_base
 ; GFX90A-NEXT:    v_accvgpr_read_b32 v1, a1
 ; GFX90A-NEXT:    v_cmp_ne_u32_e32 vcc, s5, v3
 ; GFX90A-NEXT:    ; implicit-def: $agpr0_agpr1
@@ -7542,15 +7542,15 @@ define void @flat_atomic_min_i64_ret_a_a(ptr %ptr) #0 {
 ; GFX950-LABEL: flat_atomic_min_i64_ret_a_a:
 ; GFX950:       ; %bb.0:
 ; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX950-NEXT:    s_mov_b64 s[0:1], 0x50
-; GFX950-NEXT:    v_lshl_add_u64 v[2:3], v[0:1], 0, s[0:1]
+; GFX950-NEXT:    s_mov_b64 s[2:3], 0x50
+; GFX950-NEXT:    s_mov_b64 s[0:1], src_private_base
+; GFX950-NEXT:    v_lshl_add_u64 v[2:3], v[0:1], 0, s[2:3]
 ; GFX950-NEXT:    ;;#ASMSTART
 ; GFX950-NEXT:    ; def a[0:1]
 ; GFX950-NEXT:    ;;#ASMEND
-; GFX950-NEXT:    s_mov_b64 s[0:1], src_private_base
+; GFX950-NEXT:    v_cmp_ne_u32_e32 vcc, s1, v3
 ; GFX950-NEXT:    v_accvgpr_read_b32 v0, a0
 ; GFX950-NEXT:    v_accvgpr_read_b32 v1, a1
-; GFX950-NEXT:    v_cmp_ne_u32_e32 vcc, s1, v3
 ; GFX950-NEXT:    ; implicit-def: $agpr0_agpr1
 ; GFX950-NEXT:    s_and_saveexec_b64 s[0:1], vcc
 ; GFX950-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
@@ -7597,8 +7597,8 @@ define void @flat_atomic_min_i64_ret_av_av(ptr %ptr) #0 {
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    v_add_co_u32_e32 v4, vcc, 0x50, v0
-; GFX90A-NEXT:    v_addc_co_u32_e32 v5, vcc, 0, v1, vcc
 ; GFX90A-NEXT:    s_mov_b64 s[4:5], src_private_base
+; GFX90A-NEXT:    v_addc_co_u32_e32 v5, vcc, 0, v1, vcc
 ; GFX90A-NEXT:    v_cmp_ne_u32_e32 vcc, s5, v5
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def v[2:3]
@@ -7637,9 +7637,9 @@ define void @flat_atomic_min_i64_ret_av_av(ptr %ptr) #0 {
 ; GFX950-LABEL: flat_atomic_min_i64_ret_av_av:
 ; GFX950:       ; %bb.0:
 ; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX950-NEXT:    s_mov_b64 s[0:1], 0x50
-; GFX950-NEXT:    v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1]
+; GFX950-NEXT:    s_mov_b64 s[2:3], 0x50
 ; GFX950-NEXT:    s_mov_b64 s[0:1], src_private_base
+; GFX950-NEXT:    v_lshl_add_u64 v[4:5], v[0:1], 0, s[2:3]
 ; GFX950-NEXT:    v_cmp_ne_u32_e32 vcc, s1, v5
 ; GFX950-NEXT:    ;;#ASMSTART
 ; GFX950-NEXT:    ; def v[2:3]
@@ -7686,12 +7686,12 @@ define void @flat_atomic_umax_i64_ret_a_a(ptr %ptr) #0 {
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    v_add_co_u32_e32 v2, vcc, 0x50, v0
+; GFX90A-NEXT:    s_mov_b64 s[4:5], src_private_base
 ; GFX90A-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v1, vcc
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def a[0:1]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    v_accvgpr_read_b32 v0, a0
-; GFX90A-NEXT:    s_mov_b64 s[4:5], src_private_base
 ; GFX90A-NEXT:    v_accvgpr_read_b32 v1, a1
 ; GFX90A-NEXT:    v_cmp_ne_u32_e32 vcc, s5, v3
 ; GFX90A-NEXT:    ; implicit-def: $agpr0_agpr1
@@ -7733,15 +7733,15 @@ define void @flat_atomic_umax_i64_ret_a_a(ptr %ptr) #0 {
 ; GFX950-LABEL: flat_atomic_umax_i64_ret_a_a:
 ; GFX950:       ; %bb.0:
 ; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX950-NEXT:    s_mov_b64 s[0:1], 0x50
-; GFX950-NEXT:    v_lshl_add_u64 v[2:3], v[0:1], 0, s[0:1]
+; GFX950-NEXT:    s_mov_b64 s[2:3], 0x50
+; GFX950-NEXT:    s_mov_b64 s[0:1], src_private_base
+; GFX950-NEXT:    v_lshl_add_u64 v[2:3], v[0:1], 0, s[2:3]
 ; GFX950-NEXT:    ;;#ASMSTART
 ; GFX950-NEXT:    ; def a[0:1]
 ; GFX950-NEXT:    ;;#ASMEND
-; GFX950-NEXT:    s_mov_b64 s[0:1], src_private_base
+; GFX950-NEXT:    v_cmp_ne_u32_e32 vcc, s1, v3
 ; GFX950-NEXT:    v_accvgpr_read_b32 v0, a0
 ; GFX950-NEXT:    v_accvgpr_read_b32 v1, a1
-; GFX950-NEXT:    v_cmp_ne_u32_e32 vcc, s1, v3
 ; GFX950-NEXT:    ; implicit-def: $agpr0_agpr1
 ; GFX950-NEXT:    s_and_saveexec_b64 s[0:1], vcc
 ; GFX950-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
@@ -7788,8 +7788,8 @@ define void @flat_atomic_umax_i64_ret_av_av(ptr %ptr) #0 {
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    v_add_co_u32_e32 v4, vcc, 0x50, v0
-; GFX90A-NEXT:    v_addc_co_u32_e32 v5, vcc, 0, v1, vcc
 ; GFX90A-NEXT:    s_mov_b64 s[4:5], src_private_base
+; GFX90A-NEXT:    v_addc_co_u32_e32 v5, vcc, 0, v1, vcc
 ; GFX90A-NEXT:    v_cmp_ne_u32_e32 vcc, s5, v5
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def v[2:3]
@@ -7828,9 +7828,9 @@ define void @flat_atomic_umax_i64_ret_av_av(ptr %ptr) #0 {
 ; GFX950-LABEL: flat_atomic_umax_i64_ret_av_av:
 ; GFX950:       ; %bb.0:
 ; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX950-NEXT:    s_mov_b64 s[0:1], 0x50
-; GFX950-NEXT:    v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1]
+; GFX950-NEXT:    s_mov_b64 s[2:3], 0x50
 ; GFX950-NEXT:    s_mov_b64 s[0:1], src_private_base
+; GFX950-NEXT:    v_lshl_add_u64 v[4:5], v[0:1], 0, s[2:3]
 ; GFX950-NEXT:    v_cmp_ne_u32_e32 vcc, s1, v5
 ; GFX950-NEXT:    ;;#ASMSTART
 ; GFX950-NEXT:    ; def v[2:3]
@@ -7877,12 +7877,12 @@ define void @flat_atomic_umin_i64_ret_a_a(ptr %ptr) #0 {
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    v_add_co_u32_e32 v2, vcc, 0x50, v0
+; GFX90A-NEXT:    s_mov_b64 s[4:5], src_private_base
 ; GFX90A-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v1, vcc
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def a[0:1]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    v_accvgpr_read_b32 v0, a0
-; GFX90A-NEXT:    s_mov_b64 s[4:5], src_private_base
 ; GFX90A-NEXT:    v_accvgpr_read_b32 v1, a1
 ; GFX90A-NEXT:    v_cmp_ne_u32_e32 vcc, s5, v3
 ; GFX90A-NEXT:    ; implicit-def: $agpr0_agpr1
@@ -7924,15 +7924,15 @@ define void @flat_atomic_umin_i64_ret_a_a(ptr %ptr) #0 {
 ; GFX950-LABEL: flat_atomic_umin_i64_ret_a_a:
 ; GFX950:       ; %bb.0:
 ; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX950-NEXT:    s_mov_b64 s[0:1], 0x50
-; GFX950-NEXT:    v_lshl_add_u64 v[2:3], v[0:1], 0, s[0:1]
+; GFX950-NEXT:    s_mov_b64 s[2:3], 0x50
+; GFX950-NEXT:    s_mov_b64 s[0:1], src_private_base
+; GFX950-NEXT:    v_lshl_add_u64 v[2:3], v[0:1], 0, s[2:3]
 ; GFX950-NEXT:    ;;#ASMSTART
 ; GFX950-NEXT:    ; def a[0:1]
 ; GFX950-NEXT:    ;;#ASMEND
-; GFX950-NEXT:    s_mov_b64 s[0:1], src_private_base
+; GFX950-NEXT:    v_cmp_ne_u32_e32 vcc, s1, v3
 ; GFX950-NEXT:    v_accvgpr_read_b32 v0, a0
 ; GFX950-NEXT:    v_accvgpr_read_b32 v1, a1
-; GFX950-NEXT:    v_cmp_ne_u32_e32 vcc, s1, v3
 ; GFX950-NEXT:    ; implicit-def: $agpr0_agpr1
 ; GFX950-NEXT:    s_and_saveexec_b64 s[0:1], vcc
 ; GFX950-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
@@ -7979,8 +7979,8 @@ define void @flat_atomic_umin_i64_ret_av_av(ptr %ptr) #0 {
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    v_add_co_u32_e32 v4, vcc, 0x50, v0
-; GFX90A-NEXT:    v_addc_co_u32_e32 v5, vcc, 0, v1, vcc
 ; GFX90A-NEXT:    s_mov_b64 s[4:5], src_private_base
+; GFX90A-NEXT:    v_addc_co_u32_e32 v5, vcc, 0, v1, vcc
 ; GFX90A-NEXT:    v_cmp_ne_u32_e32 vcc, s5, v5
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def v[2:3]
@@ -8019,9 +8019,9 @@ define void @flat_atomic_umin_i64_ret_av_av(ptr %ptr) #0 {
 ; GFX950-LABEL: flat_atomic_umin_i64_ret_av_av:
 ; GFX950:       ; %bb.0:
 ; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX950-NEXT:    s_mov_b64 s[0:1], 0x50
-; GFX950-NEXT:    v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1]
+; GFX950-NEXT:    s_mov_b64 s[2:3], 0x50
 ; GFX950-NEXT:    s_mov_b64 s[0:1], src_private_base
+; GFX950-NEXT:    v_lshl_add_u64 v[4:5], v[0:1], 0, s[2:3]
 ; GFX950-NEXT:    v_cmp_ne_u32_e32 vcc, s1, v5
 ; GFX950-NEXT:    ;;#ASMSTART
 ; GFX950-NEXT:    ; def v[2:3]
@@ -8068,12 +8068,12 @@ define void @flat_atomic_uinc_wrap_i64_ret_a_a(ptr %ptr) #0 {
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    v_add_co_u32_e32 v2, vcc, 0x50, v0
+; GFX90A-NEXT:    s_mov_b64 s[4:5], src_private_base
 ; GFX90A-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v1, vcc
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def a[0:1]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    v_accvgpr_read_b32 v0, a0
-; GFX90A-NEXT:    s_mov_b64 s[4:5], src_private_base
 ; GFX90A-NEXT:    v_accvgpr_read_b32 v1, a1
 ; GFX90A-NEXT:    v_cmp_ne_u32_e32 vcc, s5, v3
 ; GFX90A-NEXT:    ; implicit-def: $agpr0_agpr1
@@ -8117,15 +8117,15 @@ define void @flat_atomic_uinc_wrap_i64_ret_a_a(ptr %ptr) #0 {
 ; GFX950-LABEL: flat_atomic_uinc_wrap_i64_ret_a_a:
 ; GFX950:       ; %bb.0:
 ; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX950-NEXT:    s_mov_b64 s[0:1], 0x50
-; GFX950-NEXT:    v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1]
+; GFX950-NEXT:    s_mov_b64 s[2:3], 0x50
+; GFX950-NEXT:    s_mov_b64 s[0:1], src_private_base
+; GFX950-NEXT:    v_lshl_add_u64 v[0:1], v[0:1], 0, s[2:3]
 ; GFX950-NEXT:    ;;#ASMSTART
 ; GFX950-NEXT:    ; def a[0:1]
 ; GFX950-NEXT:    ;;#ASMEND
-; GFX950-NEXT:    s_mov_b64 s[0:1], src_private_base
+; GFX950-NEXT:    v_cmp_ne_u32_e32 vcc, s1, v1
 ; GFX950-NEXT:    v_accvgpr_read_b32 v3, a1
 ; GFX950-NEXT:    v_accvgpr_read_b32 v2, a0
-; GFX950-NEXT:    v_cmp_ne_u32_e32 vcc, s1, v1
 ; GFX950-NEXT:    ; implicit-def: $agpr0_agpr1
 ; GFX950-NEXT:    s_and_saveexec_b64 s[0:1], vcc
 ; GFX950-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
@@ -8172,8 +8172,8 @@ define void @flat_atomic_uinc_wrap_i64_ret_av_av(ptr %ptr) #0 {
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    v_add_co_u32_e32 v4, vcc, 0x50, v0
-; GFX90A-NEXT:    v_addc_co_u32_e32 v5, vcc, 0, v1, vcc
 ; GFX90A-NEXT:    s_mov_b64 s[4:5], src_private_base
+; GFX90A-NEXT:    v_addc_co_u32_e32 v5, vcc, 0, v1, vcc
 ; GFX90A-NEXT:    v_cmp_ne_u32_e32 vcc, s5, v5
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def v[2:3]
@@ -8214,9 +8214,9 @@ define void @flat_atomic_uinc_wrap_i64_ret_av_av(ptr %ptr) #0 {
 ; GFX950-LABEL: flat_atomic_uinc_wrap_i64_ret_av_av:
 ; GFX950:       ; %bb.0:
 ; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX950-NEXT:    s_mov_b64 s[0:1], 0x50
-; GFX950-NEXT:    v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1]
+; GFX950-NEXT:    s_mov_b64 s[2:3], 0x50
 ; GFX950-NEXT:    s_mov_b64 s[0:1], src_private_base
+; GFX950-NEXT:    v_lshl_add_u64 v[4:5], v[0:1], 0, s[2:3]
 ; GFX950-NEXT:    v_cmp_ne_u32_e32 vcc, s1, v5
 ; GFX950-NEXT:    ;;#ASMSTART
 ; GFX950-NEXT:    ; def v[2:3]
@@ -8264,12 +8264,12 @@ define void @flat_atomic_udec_wrap_i64_ret_a_a(ptr %ptr) #0 {
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    v_add_co_u32_e32 v2, vcc, 0x50, v0
+; GFX90A-NEXT:    s_mov_b64 s[4:5], src_private_base
 ; GFX90A-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v1, vcc
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def a[0:1]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    v_accvgpr_read_b32 v0, a0
-; GFX90A-NEXT:    s_mov_b64 s[4:5], src_private_base
 ; GFX90A-NEXT:    v_accvgpr_read_b32 v1, a1
 ; GFX90A-NEXT:    v_cmp_ne_u32_e32 vcc, s5, v3
 ; GFX90A-NEXT:    ; implicit-def: $agpr0_agpr1
@@ -8315,15 +8315,15 @@ define void @flat_atomic_udec_wrap_i64_ret_a_a(ptr %ptr) #0 {
 ; GFX950-LABEL: flat_atomic_udec_wrap_i64_ret_a_a:
 ; GFX950:       ; %bb.0:
 ; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX950-NEXT:    s_mov_b64 s[0:1], 0x50
-; GFX950-NEXT:    v_lshl_add_u64 v[2:3], v[0:1], 0, s[0:1]
+; GFX950-NEXT:    s_mov_b64 s[2:3], 0x50
+; GFX950-NEXT:    s_mov_b64 s[0:1], src_private_base
+; GFX950-NEXT:    v_lshl_add_u64 v[2:3], v[0:1], 0, s[2:3]
 ; GFX950-NEXT:    ;;#ASMSTART
 ; GFX950-NEXT:    ; def a[0:1]
 ; GFX950-NEXT:    ;;#ASMEND
-; GFX950-NEXT:    s_mov_b64 s[0:1], src_private_base
+; GFX950-NEXT:    v_cmp_ne_u32_e32 vcc, s1, v3
 ; GFX950-NEXT:    v_accvgpr_read_b32 v0, a0
 ; GFX950-NEXT:    v_accvgpr_read_b32 v1, a1
-; GFX950-NEXT:    v_cmp_ne_u32_e32 vcc, s1, v3
 ; GFX950-NEXT:    ; implicit-def: $agpr0_agpr1
 ; GFX950-NEXT:    s_and_saveexec_b64 s[0:1], vcc
 ; GFX950-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
@@ -8372,8 +8372,8 @@ define void @flat_atomic_udec_wrap_i64_ret_av_av(ptr %ptr) #0 {
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    v_add_co_u32_e32 v4, vcc, 0x50, v0
-; GFX90A-NEXT:    v_addc_co_u32_e32 v5, vcc, 0, v1, vcc
 ; GFX90A-NEXT:    s_mov_b64 s[4:5], src_private_base
+; GFX90A-NEXT:    v_addc_co_u32_e32 v5, vcc, 0, v1, vcc
 ; GFX90A-NEXT:    v_cmp_ne_u32_e32 vcc, s5, v5
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def v[2:3]
@@ -8416,9 +8416,9 @@ define void @flat_atomic_udec_wrap_i64_ret_av_av(ptr %ptr) #0 {
 ; GFX950-LABEL: flat_atomic_udec_wrap_i64_ret_av_av:
 ; GFX950:       ; %bb.0:
 ; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX950-NEXT:    s_mov_b64 s[0:1], 0x50
-; GFX950-NEXT:    v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1]
+; GFX950-NEXT:    s_mov_b64 s[2:3], 0x50
 ; GFX950-NEXT:    s_mov_b64 s[0:1], src_private_base
+; GFX950-NEXT:    v_lshl_add_u64 v[4:5], v[0:1], 0, s[2:3]
 ; GFX950-NEXT:    v_cmp_ne_u32_e32 vcc, s1, v5
 ; GFX950-NEXT:    ;;#ASMSTART
 ; GFX950-NEXT:    ; def v[2:3]
@@ -8467,12 +8467,12 @@ define void @flat_atomic_usub_cond_i64_ret_a_a(ptr %ptr) #0 {
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    v_add_co_u32_e32 v6, vcc, 0x50, v0
+; GFX90A-NEXT:    s_mov_b64 s[4:5], src_private_base
 ; GFX90A-NEXT:    v_addc_co_u32_e32 v7, vcc, 0, v1, vcc
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def a[0:1]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    v_accvgpr_read_b32 v5, a1
-; GFX90A-NEXT:    s_mov_b64 s[4:5], src_private_base
 ; GFX90A-NEXT:    v_accvgpr_read_b32 v4, a0
 ; GFX90A-NEXT:    v_cmp_ne_u32_e32 vcc, s5, v7
 ; GFX90A-NEXT:    ; implicit-def: $agpr0_agpr1
@@ -8533,15 +8533,15 @@ define void @flat_atomic_usub_cond_i64_ret_a_a(ptr %ptr) #0 {
 ; GFX950-LABEL: flat_atomic_usub_cond_i64_ret_a_a:
 ; GFX950:       ; %bb.0:
 ; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX950-NEXT:    s_mov_b64 s[0:1], 0x50
-; GFX950-NEXT:    v_lshl_add_u64 v[6:7], v[0:1], 0, s[0:1]
+; GFX950-NEXT:    s_mov_b64 s[2:3], 0x50
+; GFX950-NEXT:    s_mov_b64 s[0:1], src_private_base
+; GFX950-NEXT:    v_lshl_add_u64 v[6:7], v[0:1], 0, s[2:3]
 ; GFX950-NEXT:    ;;#ASMSTART
 ; GFX950-NEXT:    ; def a[0:1]
 ; GFX950-NEXT:    ;;#ASMEND
-; GFX950-NEXT:    s_mov_b64 s[0:1], src_private_base
+; GFX950-NEXT:    v_cmp_ne_u32_e32 vcc, s1, v7
 ; GFX950-NEXT:    v_accvgpr_read_b32 v5, a1
 ; GFX950-NEXT:    v_accvgpr_read_b32 v4, a0
-; GFX950-NEXT:    v_cmp_ne_u32_e32 vcc, s1, v7
 ; GFX950-NEXT:    ; implicit-def: $agpr0_agpr1
 ; GFX950-NEXT:    s_and_saveexec_b64 s[0:1], vcc
 ; GFX950-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
@@ -8609,8 +8609,8 @@ define void @flat_atomic_usub_cond_i64_ret_av_av(ptr %ptr) #0 {
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    v_add_co_u32_e32 v2, vcc, 0x50, v0
-; GFX90A-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v1, vcc
 ; GFX90A-NEXT:    s_mov_b64 s[4:5], src_private_base
+; GFX90A-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v1, vcc
 ; GFX90A-NEXT:    v_cmp_ne_u32_e32 vcc, s5, v3
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def v[0:1]
@@ -8669,9 +8669,9 @@ define void @flat_atomic_usub_cond_i64_ret_av_av(ptr %ptr) #0 {
 ; GFX950-LABEL: flat_atomic_usub_cond_i64_ret_av_av:
 ; GFX950:       ; %bb.0:
 ; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX950-NEXT:    s_mov_b64 s[0:1], 0x50
-; GFX950-NEXT:    v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1]
+; GFX950-NEXT:    s_mov_b64 s[2:3], 0x50
 ; GFX950-NEXT:    s_mov_b64 s[0:1], src_private_base
+; GFX950-NEXT:    v_lshl_add_u64 v[4:5], v[0:1], 0, s[2:3]
 ; GFX950-NEXT:    v_cmp_ne_u32_e32 vcc, s1, v5
 ; GFX950-NEXT:    ;;#ASMSTART
 ; GFX950-NEXT:    ; def v[0:1]
@@ -8740,12 +8740,12 @@ define void @flat_atomic_usub_sat_i64_ret_a_a(ptr %ptr) #0 {
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    v_add_co_u32_e32 v4, vcc, 0x50, v0
+; GFX90A-NEXT:    s_mov_b64 s[4:5], src_private_base
 ; GFX90A-NEXT:    v_addc_co_u32_e32 v5, vcc, 0, v1, vcc
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def a[0:1]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    v_accvgpr_read_b32 v7, a1
-; GFX90A-NEXT:    s_mov_b64 s[4:5], src_private_base
 ; GFX90A-NEXT:    v_accvgpr_read_b32 v6, a0
 ; GFX90A-NEXT:    v_cmp_ne_u32_e32 vcc, s5, v5
 ; GFX90A-NEXT:    ; implicit-def: $agpr0_agpr1
@@ -8806,15 +8806,15 @@ define void @flat_atomic_usub_sat_i64_ret_a_a(ptr %ptr) #0 {
 ; GFX950-LABEL: flat_atomic_usub_sat_i64_ret_a_a:
 ; GFX950:       ; %bb.0:
 ; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX950-NEXT:    s_mov_b64 s[0:1], 0x50
-; GFX950-NEXT:    v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1]
+; GFX950-NEXT:    s_mov_b64 s[2:3], 0x50
+; GFX950-NEXT:    s_mov_b64 s[0:1], src_private_base
+; GFX950-NEXT:    v_lshl_add_u64 v[4:5], v[0:1], 0, s[2:3]
 ; GFX950-NEXT:    ;;#ASMSTART
 ; GFX950-NEXT:    ; def a[0:1]
 ; GFX950-NEXT:    ;;#ASMEND
-; GFX950-NEXT:    s_mov_b64 s[0:1], src_private_base
+; GFX950-NEXT:    v_cmp_ne_u32_e32 vcc, s1, v5
 ; GFX950-NEXT:    v_accvgpr_read_b32 v7, a1
 ; GFX950-NEXT:    v_accvgpr_read_b32 v6, a0
-; GFX950-NEXT:    v_cmp_ne_u32_e32 vcc, s1, v5
 ; GFX950-NEXT:    ; implicit-def: $agpr0_agpr1
 ; GFX950-NEXT:    s_and_saveexec_b64 s[0:1], vcc
 ; GFX950-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
@@ -8882,8 +8882,8 @@ define void @flat_atomic_usub_sat_i64_ret_av_av(ptr %ptr) #0 {
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    v_add_co_u32_e32 v0, vcc, 0x50, v0
-; GFX90A-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
 ; GFX90A-NEXT:    s_mov_b64 s[4:5], src_private_base
+; GFX90A-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
 ; GFX90A-NEXT:    v_cmp_ne_u32_e32 vcc, s5, v1
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def v[2:3]
@@ -8942,9 +8942,9 @@ define void @flat_atomic_usub_sat_i64_ret_av_av(ptr %ptr) #0 {
 ; GFX950-LABEL: flat_atomic_usub_sat_i64_ret_av_av:
 ; GFX950:       ; %bb.0:
 ; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX950-NEXT:    s_mov_b64 s[0:1], 0x50
-; GFX950-NEXT:    v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1]
+; GFX950-NEXT:    s_mov_b64 s[2:3], 0x50
 ; GFX950-NEXT:    s_mov_b64 s[0:1], src_private_base
+; GFX950-NEXT:    v_lshl_add_u64 v[4:5], v[0:1], 0, s[2:3]
 ; GFX950-NEXT:    v_cmp_ne_u32_e32 vcc, s1, v5
 ; GFX950-NEXT:    ;;#ASMSTART
 ; GFX950-NEXT:    ; def v[0:1]
@@ -9017,8 +9017,8 @@ define void @flat_atomic_fadd_f32_ret_a_a(ptr %ptr) #0 {
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    v_add_co_u32_e32 v0, vcc, 40, v0
-; GFX90A-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
 ; GFX90A-NEXT:    s_mov_b64 s[4:5], src_shared_base
+; GFX90A-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def a0
 ; GFX90A-NEXT:    ;;#ASMEND
@@ -9100,8 +9100,8 @@ define void @flat_atomic_fadd_f32_ret_av_av(ptr %ptr) #0 {
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    v_add_co_u32_e32 v0, vcc, 40, v0
-; GFX90A-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
 ; GFX90A-NEXT:    s_mov_b64 s[4:5], src_shared_base
+; GFX90A-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
 ; GFX90A-NEXT:    v_cmp_ne_u32_e32 vcc, s5, v1
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def v3
@@ -9818,12 +9818,12 @@ define void @flat_atomic_fadd_f64_ret_a_a(ptr %ptr) #0 {
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    v_add_co_u32_e32 v0, vcc, 0x50, v0
+; GFX90A-NEXT:    s_mov_b64 s[4:5], src_shared_base
 ; GFX90A-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def a[0:1]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    v_accvgpr_read_b32 v3, a1
-; GFX90A-NEXT:    s_mov_b64 s[4:5], src_shared_base
 ; GFX90A-NEXT:    v_accvgpr_read_b32 v2, a0
 ; GFX90A-NEXT:    v_cmp_ne_u32_e32 vcc, s5, v1
 ; GFX90A-NEXT:    ; implicit-def: $agpr0_agpr1
@@ -9884,15 +9884,15 @@ define void @flat_atomic_fadd_f64_ret_a_a(ptr %ptr) #0 {
 ; GFX950-LABEL: flat_atomic_fadd_f64_ret_a_a:
 ; GFX950:       ; %bb.0:
 ; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX950-NEXT:    s_mov_b64 s[0:1], 0x50
-; GFX950-NEXT:    v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1]
+; GFX950-NEXT:    s_mov_b64 s[2:3], 0x50
+; GFX950-NEXT:    s_mov_b64 s[0:1], src_shared_base
+; GFX950-NEXT:    v_lshl_add_u64 v[0:1], v[0:1], 0, s[2:3]
 ; GFX950-NEXT:    ;;#ASMSTART
 ; GFX950-NEXT:    ; def a[0:1]
 ; GFX950-NEXT:    ;;#ASMEND
-; GFX950-NEXT:    s_mov_b64 s[0:1], src_shared_base
+; GFX950-NEXT:    v_cmp_ne_u32_e32 vcc, s1, v1
 ; GFX950-NEXT:    v_accvgpr_read_b32 v3, a1
 ; GFX950-NEXT:    v_accvgpr_read_b32 v2, a0
-; GFX950-NEXT:    v_cmp_ne_u32_e32 vcc, s1, v1
 ; GFX950-NEXT:    ; implicit-def: $agpr0_agpr1
 ; GFX950-NEXT:    s_and_saveexec_b64 s[0:1], vcc
 ; GFX950-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
@@ -9958,8 +9958,8 @@ define void @flat_atomic_fadd_f64_ret_av_av(ptr %ptr) #0 {
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    v_add_co_u32_e32 v2, vcc, 0x50, v0
-; GFX90A-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v1, vcc
 ; GFX90A-NEXT:    s_mov_b64 s[4:5], src_shared_base
+; GFX90A-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v1, vcc
 ; GFX90A-NEXT:    v_cmp_ne_u32_e32 vcc, s5, v3
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def v[4:5]
@@ -10015,9 +10015,9 @@ define void @flat_atomic_fadd_f64_ret_av_av(ptr %ptr) #0 {
 ; GFX950-LABEL: flat_atomic_fadd_f64_ret_av_av:
 ; GFX950:       ; %bb.0:
 ; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX950-NEXT:    s_mov_b64 s[0:1], 0x50
-; GFX950-NEXT:    v_lshl_add_u64 v[2:3], v[0:1], 0, s[0:1]
+; GFX950-NEXT:    s_mov_b64 s[2:3], 0x50
 ; GFX950-NEXT:    s_mov_b64 s[0:1], src_shared_base
+; GFX950-NEXT:    v_lshl_add_u64 v[2:3], v[0:1], 0, s[2:3]
 ; GFX950-NEXT:    v_cmp_ne_u32_e32 vcc, s1, v3
 ; GFX950-NEXT:    ;;#ASMSTART
 ; GFX950-NEXT:    ; def v[4:5]
@@ -10081,12 +10081,12 @@ define void @flat_atomic_fsub_f64_ret_a_a(ptr %ptr) #0 {
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    v_add_co_u32_e32 v4, vcc, 0x50, v0
+; GFX90A-NEXT:    s_mov_b64 s[4:5], src_private_base
 ; GFX90A-NEXT:    v_addc_co_u32_e32 v5, vcc, 0, v1, vcc
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def a[0:1]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    v_accvgpr_read_b32 v7, a1
-; GFX90A-NEXT:    s_mov_b64 s[4:5], src_private_base
 ; GFX90A-NEXT:    v_accvgpr_read_b32 v6, a0
 ; GFX90A-NEXT:    v_cmp_ne_u32_e32 vcc, s5, v5
 ; GFX90A-NEXT:    ; implicit-def: $agpr0_agpr1
@@ -10139,15 +10139,15 @@ define void @flat_atomic_fsub_f64_ret_a_a(ptr %ptr) #0 {
 ; GFX950-LABEL: flat_atomic_fsub_f64_ret_a_a:
 ; GFX950:       ; %bb.0:
 ; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX950-NEXT:    s_mov_b64 s[0:1], 0x50
-; GFX950-NEXT:    v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1]
+; GFX950-NEXT:    s_mov_b64 s[2:3], 0x50
+; GFX950-NEXT:    s_mov_b64 s[0:1], src_private_base
+; GFX950-NEXT:    v_lshl_add_u64 v[4:5], v[0:1], 0, s[2:3]
 ; GFX950-NEXT:    ;;#ASMSTART
 ; GFX950-NEXT:    ; def a[0:1]
 ; GFX950-NEXT:    ;;#ASMEND
-; GFX950-NEXT:    s_mov_b64 s[0:1], src_private_base
+; GFX950-NEXT:    v_cmp_ne_u32_e32 vcc, s1, v5
 ; GFX950-NEXT:    v_accvgpr_read_b32 v7, a1
 ; GFX950-NEXT:    v_accvgpr_read_b32 v6, a0
-; GFX950-NEXT:    v_cmp_ne_u32_e32 vcc, s1, v5
 ; GFX950-NEXT:    ; implicit-def: $agpr0_agpr1
 ; GFX950-NEXT:    s_and_saveexec_b64 s[0:1], vcc
 ; GFX950-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
@@ -10204,8 +10204,8 @@ define void @flat_atomic_fsub_f64_ret_av_av(ptr %ptr) #0 {
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    v_add_co_u32_e32 v0, vcc, 0x50, v0
-; GFX90A-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
 ; GFX90A-NEXT:    s_mov_b64 s[4:5], src_private_base
+; GFX90A-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
 ; GFX90A-NEXT:    v_cmp_ne_u32_e32 vcc, s5, v1
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def v[4:5]
@@ -10255,9 +10255,9 @@ define void @flat_atomic_fsub_f64_ret_av_av(ptr %ptr) #0 {
 ; GFX950-LABEL: flat_atomic_fsub_f64_ret_av_av:
 ; GFX950:       ; %bb.0:
 ; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX950-NEXT:    s_mov_b64 s[0:1], 0x50
-; GFX950-NEXT:    v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1]
+; GFX950-NEXT:    s_mov_b64 s[2:3], 0x50
 ; GFX950-NEXT:    s_mov_b64 s[0:1], src_private_base
+; GFX950-NEXT:    v_lshl_add_u64 v[0:1], v[0:1], 0, s[2:3]
 ; GFX950-NEXT:    v_cmp_ne_u32_e32 vcc, s1, v1
 ; GFX950-NEXT:    ;;#ASMSTART
 ; GFX950-NEXT:    ; def v[2:3]
@@ -10314,12 +10314,12 @@ define void @flat_atomic_fmax_f64_ret_a_a(ptr %ptr) #0 {
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    v_add_co_u32_e32 v0, vcc, 0x50, v0
+; GFX90A-NEXT:    s_mov_b64 s[4:5], src_private_base
 ; GFX90A-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def a[0:1]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    v_accvgpr_read_b32 v3, a1
-; GFX90A-NEXT:    s_mov_b64 s[4:5], src_private_base
 ; GFX90A-NEXT:    v_accvgpr_read_b32 v2, a0
 ; GFX90A-NEXT:    v_cmp_ne_u32_e32 vcc, s5, v1
 ; GFX90A-NEXT:    ; implicit-def: $agpr0_agpr1
@@ -10361,15 +10361,15 @@ define void @flat_atomic_fmax_f64_ret_a_a(ptr %ptr) #0 {
 ; GFX950-LABEL: flat_atomic_fmax_f64_ret_a_a:
 ; GFX950:       ; %bb.0:
 ; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX950-NEXT:    s_mov_b64 s[0:1], 0x50
-; GFX950-NEXT:    v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1]
+; GFX950-NEXT:    s_mov_b64 s[2:3], 0x50
+; GFX950-NEXT:    s_mov_b64 s[0:1], src_private_base
+; GFX950-NEXT:    v_lshl_add_u64 v[0:1], v[0:1], 0, s[2:3]
 ; GFX950-NEXT:    ;;#ASMSTART
 ; GFX950-NEXT:    ; def a[0:1]
 ; GFX950-NEXT:    ;;#ASMEND
-; GFX950-NEXT:    s_mov_b64 s[0:1], src_private_base
+; GFX950-NEXT:    v_cmp_ne_u32_e32 vcc, s1, v1
 ; GFX950-NEXT:    v_accvgpr_read_b32 v3, a1
 ; GFX950-NEXT:    v_accvgpr_read_b32 v2, a0
-; GFX950-NEXT:    v_cmp_ne_u32_e32 vcc, s1, v1
 ; GFX950-NEXT:    ; implicit-def: $agpr0_agpr1
 ; GFX950-NEXT:    s_and_saveexec_b64 s[0:1], vcc
 ; GFX950-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
@@ -10415,8 +10415,8 @@ define void @flat_atomic_fmax_f64_ret_av_av(ptr %ptr) #0 {
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    v_add_co_u32_e32 v2, vcc, 0x50, v0
-; GFX90A-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v1, vcc
 ; GFX90A-NEXT:    s_mov_b64 s[4:5], src_private_base
+; GFX90A-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v1, vcc
 ; GFX90A-NEXT:    v_cmp_ne_u32_e32 vcc, s5, v3
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def v[4:5]
@@ -10455,9 +10455,9 @@ define void @flat_atomic_fmax_f64_ret_av_av(ptr %ptr) #0 {
 ; GFX950-LABEL: flat_atomic_fmax_f64_ret_av_av:
 ; GFX950:       ; %bb.0:
 ; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX950-NEXT:    s_mov_b64 s[0:1], 0x50
-; GFX950-NEXT:    v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1]
+; GFX950-NEXT:    s_mov_b64 s[2:3], 0x50
 ; GFX950-NEXT:    s_mov_b64 s[0:1], src_private_base
+; GFX950-NEXT:    v_lshl_add_u64 v[4:5], v[0:1], 0, s[2:3]
 ; GFX950-NEXT:    v_cmp_ne_u32_e32 vcc, s1, v5
 ; GFX950-NEXT:    ;;#ASMSTART
 ; GFX950-NEXT:    ; def v[2:3]
@@ -10503,12 +10503,12 @@ define void @flat_atomic_fmin_f64_ret_a_a(ptr %ptr) #0 {
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    v_add_co_u32_e32 v0, vcc, 0x50, v0
+; GFX90A-NEXT:    s_mov_b64 s[4:5], src_private_base
 ; GFX90A-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def a[0:1]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    v_accvgpr_read_b32 v3, a1
-; GFX90A-NEXT:    s_mov_b64 s[4:5], src_private_base
 ; GFX90A-NEXT:    v_accvgpr_read_b32 v2, a0
 ; GFX90A-NEXT:    v_cmp_ne_u32_e32 vcc, s5, v1
 ; GFX90A-NEXT:    ; implicit-def: $agpr0_agpr1
@@ -10550,15 +10550,15 @@ define void @flat_atomic_fmin_f64_ret_a_a(ptr %ptr) #0 {
 ; GFX950-LABEL: flat_atomic_fmin_f64_ret_a_a:
 ; GFX950:       ; %bb.0:
 ; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX950-NEXT:    s_mov_b64 s[0:1], 0x50
-; GFX950-NEXT:    v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1]
+; GFX950-NEXT:    s_mov_b64 s[2:3], 0x50
+; GFX950-NEXT:    s_mov_b64 s[0:1], src_private_base
+; GFX950-NEXT:    v_lshl_add_u64 v[0:1], v[0:1], 0, s[2:3]
 ; GFX950-NEXT:    ;;#ASMSTART
 ; GFX950-NEXT:    ; def a[0:1]
 ; GFX950-NEXT:    ;;#ASMEND
-; GFX950-NEXT:    s_mov_b64 s[0:1], src_private_base
+; GFX950-NEXT:    v_cmp_ne_u32_e32 vcc, s1, v1
 ; GFX950-NEXT:    v_accvgpr_read_b32 v3, a1
 ; GFX950-NEXT:    v_accvgpr_read_b32 v2, a0
-; GFX950-NEXT:    v_cmp_ne_u32_e32 vcc, s1, v1
 ; GFX950-NEXT:    ; implicit-def: $agpr0_agpr1
 ; GFX950-NEXT:    s_and_saveexec_b64 s[0:1], vcc
 ; GFX950-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
@@ -10604,8 +10604,8 @@ define void @flat_atomic_fmin_f64_ret_av_av(ptr %ptr) #0 {
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    v_add_co_u32_e32 v2, vcc, 0x50, v0
-; GFX90A-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v1, vcc
 ; GFX90A-NEXT:    s_mov_b64 s[4:5], src_private_base
+; GFX90A-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v1, vcc
 ; GFX90A-NEXT:    v_cmp_ne_u32_e32 vcc, s5, v3
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def v[4:5]
@@ -10644,9 +10644,9 @@ define void @flat_atomic_fmin_f64_ret_av_av(ptr %ptr) #0 {
 ; GFX950-LABEL: flat_atomic_fmin_f64_ret_av_av:
 ; GFX950:       ; %bb.0:
 ; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX950-NEXT:    s_mov_b64 s[0:1], 0x50
-; GFX950-NEXT:    v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1]
+; GFX950-NEXT:    s_mov_b64 s[2:3], 0x50
 ; GFX950-NEXT:    s_mov_b64 s[0:1], src_private_base
+; GFX950-NEXT:    v_lshl_add_u64 v[4:5], v[0:1], 0, s[2:3]
 ; GFX950-NEXT:    v_cmp_ne_u32_e32 vcc, s1, v5
 ; GFX950-NEXT:    ;;#ASMSTART
 ; GFX950-NEXT:    ; def v[2:3]
@@ -10692,12 +10692,12 @@ define void @flat_atomic_fmaximum_f64_ret_a_a(ptr %ptr) #0 {
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    v_add_co_u32_e32 v6, vcc, 0x50, v0
+; GFX90A-NEXT:    s_mov_b64 s[4:5], src_private_base
 ; GFX90A-NEXT:    v_addc_co_u32_e32 v7, vcc, 0, v1, vcc
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def a[0:1]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    v_accvgpr_read_b32 v5, a1
-; GFX90A-NEXT:    s_mov_b64 s[4:5], src_private_base
 ; GFX90A-NEXT:    v_accvgpr_read_b32 v4, a0
 ; GFX90A-NEXT:    v_cmp_ne_u32_e32 vcc, s5, v7
 ; GFX90A-NEXT:    ; implicit-def: $agpr0_agpr1
@@ -10758,15 +10758,15 @@ define void @flat_atomic_fmaximum_f64_ret_a_a(ptr %ptr) #0 {
 ; GFX950-LABEL: flat_atomic_fmaximum_f64_ret_a_a:
 ; GFX950:       ; %bb.0:
 ; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX950-NEXT:    s_mov_b64 s[0:1], 0x50
-; GFX950-NEXT:    v_lshl_add_u64 v[6:7], v[0:1], 0, s[0:1]
+; GFX950-NEXT:    s_mov_b64 s[2:3], 0x50
+; GFX950-NEXT:    s_mov_b64 s[0:1], src_private_base
+; GFX950-NEXT:    v_lshl_add_u64 v[6:7], v[0:1], 0, s[2:3]
 ; GFX950-NEXT:    ;;#ASMSTART
 ; GFX950-NEXT:    ; def a[0:1]
 ; GFX950-NEXT:    ;;#ASMEND
-; GFX950-NEXT:    s_mov_b64 s[0:1], src_private_base
+; GFX950-NEXT:    v_cmp_ne_u32_e32 vcc, s1, v7
 ; GFX950-NEXT:    v_accvgpr_read_b32 v5, a1
 ; GFX950-NEXT:    v_accvgpr_read_b32 v4, a0
-; GFX950-NEXT:    v_cmp_ne_u32_e32 vcc, s1, v7
 ; GFX950-NEXT:    ; implicit-def: $agpr0_agpr1
 ; GFX950-NEXT:    s_and_saveexec_b64 s[0:1], vcc
 ; GFX950-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
@@ -10832,8 +10832,8 @@ define void @flat_atomic_fmaximum_f64_ret_av_av(ptr %ptr) #0 {
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    v_add_co_u32_e32 v4, vcc, 0x50, v0
-; GFX90A-NEXT:    v_addc_co_u32_e32 v5, vcc, 0, v1, vcc
 ; GFX90A-NEXT:    s_mov_b64 s[4:5], src_private_base
+; GFX90A-NEXT:    v_addc_co_u32_e32 v5, vcc, 0, v1, vcc
 ; GFX90A-NEXT:    v_cmp_ne_u32_e32 vcc, s5, v5
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def v[0:1]
@@ -10891,9 +10891,9 @@ define void @flat_atomic_fmaximum_f64_ret_av_av(ptr %ptr) #0 {
 ; GFX950-LABEL: flat_atomic_fmaximum_f64_ret_av_av:
 ; GFX950:       ; %bb.0:
 ; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX950-NEXT:    s_mov_b64 s[0:1], 0x50
-; GFX950-NEXT:    v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1]
+; GFX950-NEXT:    s_mov_b64 s[2:3], 0x50
 ; GFX950-NEXT:    s_mov_b64 s[0:1], src_private_base
+; GFX950-NEXT:    v_lshl_add_u64 v[4:5], v[0:1], 0, s[2:3]
 ; GFX950-NEXT:    v_cmp_ne_u32_e32 vcc, s1, v5
 ; GFX950-NEXT:    ;;#ASMSTART
 ; GFX950-NEXT:    ; def v[0:1]
@@ -10960,12 +10960,12 @@ define void @flat_atomic_fminimum_f64_ret_a_a(ptr %ptr) #0 {
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    v_add_co_u32_e32 v6, vcc, 0x50, v0
+; GFX90A-NEXT:    s_mov_b64 s[4:5], src_private_base
 ; GFX90A-NEXT:    v_addc_co_u32_e32 v7, vcc, 0, v1, vcc
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def a[0:1]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    v_accvgpr_read_b32 v5, a1
-; GFX90A-NEXT:    s_mov_b64 s[4:5], src_private_base
 ; GFX90A-NEXT:    v_accvgpr_read_b32 v4, a0
 ; GFX90A-NEXT:    v_cmp_ne_u32_e32 vcc, s5, v7
 ; GFX90A-NEXT:    ; implicit-def: $agpr0_agpr1
@@ -11026,15 +11026,15 @@ define void @flat_atomic_fminimum_f64_ret_a_a(ptr %ptr) #0 {
 ; GFX950-LABEL: flat_atomic_fminimum_f64_ret_a_a:
 ; GFX950:       ; %bb.0:
 ; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX950-NEXT:    s_mov_b64 s[0:1], 0x50
-; GFX950-NEXT:    v_lshl_add_u64 v[6:7], v[0:1], 0, s[0:1]
+; GFX950-NEXT:    s_mov_b64 s[2:3], 0x50
+; GFX950-NEXT:    s_mov_b64 s[0:1], src_private_base
+; GFX950-NEXT:    v_lshl_add_u64 v[6:7], v[0:1], 0, s[2:3]
 ; GFX950-NEXT:    ;;#ASMSTART
 ; GFX950-NEXT:    ; def a[0:1]
 ; GFX950-NEXT:    ;;#ASMEND
-; GFX950-NEXT:    s_mov_b64 s[0:1], src_private_base
+; GFX950-NEXT:    v_cmp_ne_u32_e32 vcc, s1, v7
 ; GFX950-NEXT:    v_accvgpr_read_b32 v5, a1
 ; GFX950-NEXT:    v_accvgpr_read_b32 v4, a0
-; GFX950-NEXT:    v_cmp_ne_u32_e32 vcc, s1, v7
 ; GFX950-NEXT:    ; implicit-def: $agpr0_agpr1
 ; GFX950-NEXT:    s_and_saveexec_b64 s[0:1], vcc
 ; GFX950-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
@@ -11100,8 +11100,8 @@ define void @flat_atomic_fminimum_f64_ret_av_av(ptr %ptr) #0 {
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    v_add_co_u32_e32 v4, vcc, 0x50, v0
-; GFX90A-NEXT:    v_addc_co_u32_e32 v5, vcc, 0, v1, vcc
 ; GFX90A-NEXT:    s_mov_b64 s[4:5], src_private_base
+; GFX90A-NEXT:    v_addc_co_u32_e32 v5, vcc, 0, v1, vcc
 ; GFX90A-NEXT:    v_cmp_ne_u32_e32 vcc, s5, v5
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def v[0:1]
@@ -11159,9 +11159,9 @@ define void @flat_atomic_fminimum_f64_ret_av_av(ptr %ptr) #0 {
 ; GFX950-LABEL: flat_atomic_fminimum_f64_ret_av_av:
 ; GFX950:       ; %bb.0:
 ; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX950-NEXT:    s_mov_b64 s[0:1], 0x50
-; GFX950-NEXT:    v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1]
+; GFX950-NEXT:    s_mov_b64 s[2:3], 0x50
 ; GFX950-NEXT:    s_mov_b64 s[0:1], src_private_base
+; GFX950-NEXT:    v_lshl_add_u64 v[4:5], v[0:1], 0, s[2:3]
 ; GFX950-NEXT:    v_cmp_ne_u32_e32 vcc, s1, v5
 ; GFX950-NEXT:    ;;#ASMSTART
 ; GFX950-NEXT:    ; def v[0:1]
@@ -14340,8 +14340,8 @@ define void @flat_atomic_xchg_i64_saddr_ret_a_a(ptr inreg %ptr) #0 {
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    s_add_u32 s4, s16, 0x50
-; GFX90A-NEXT:    s_addc_u32 s5, s17, 0
 ; GFX90A-NEXT:    s_mov_b64 s[6:7], src_private_base
+; GFX90A-NEXT:    s_addc_u32 s5, s17, 0
 ; GFX90A-NEXT:    s_cmp_eq_u32 s5, s7
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def a[0:1]
@@ -14383,8 +14383,8 @@ define void @flat_atomic_xchg_i64_saddr_ret_a_a(ptr inreg %ptr) #0 {
 ; GFX950:       ; %bb.0:
 ; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX950-NEXT:    s_add_u32 s0, s0, 0x50
-; GFX950-NEXT:    s_addc_u32 s1, s1, 0
 ; GFX950-NEXT:    s_mov_b64 s[2:3], src_private_base
+; GFX950-NEXT:    s_addc_u32 s1, s1, 0
 ; GFX950-NEXT:    s_cmp_eq_u32 s1, s3
 ; GFX950-NEXT:    s_cselect_b64 s[2:3], -1, 0
 ; GFX950-NEXT:    s_andn2_b64 vcc, exec, s[2:3]
@@ -14430,8 +14430,8 @@ define void @flat_atomic_xchg_i64_saddr_ret_av_av(ptr inreg %ptr) #0 {
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    s_add_u32 s4, s16, 0x50
-; GFX90A-NEXT:    s_addc_u32 s5, s17, 0
 ; GFX90A-NEXT:    s_mov_b64 s[6:7], src_private_base
+; GFX90A-NEXT:    s_addc_u32 s5, s17, 0
 ; GFX90A-NEXT:    s_cmp_eq_u32 s5, s7
 ; GFX90A-NEXT:    s_cselect_b64 s[6:7], -1, 0
 ; GFX90A-NEXT:    s_andn2_b64 vcc, exec, s[6:7]
@@ -14467,8 +14467,8 @@ define void @flat_atomic_xchg_i64_saddr_ret_av_av(ptr inreg %ptr) #0 {
 ; GFX950:       ; %bb.0:
 ; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX950-NEXT:    s_add_u32 s0, s0, 0x50
-; GFX950-NEXT:    s_addc_u32 s1, s1, 0
 ; GFX950-NEXT:    s_mov_b64 s[2:3], src_private_base
+; GFX950-NEXT:    s_addc_u32 s1, s1, 0
 ; GFX950-NEXT:    s_cmp_eq_u32 s1, s3
 ; GFX950-NEXT:    s_cselect_b64 s[2:3], -1, 0
 ; GFX950-NEXT:    s_andn2_b64 vcc, exec, s[2:3]
@@ -14508,8 +14508,8 @@ define void @flat_atomic_add_i64_saddr_ret_a_a(ptr inreg %ptr) #0 {
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    s_add_u32 s4, s16, 0x50
-; GFX90A-NEXT:    s_addc_u32 s5, s17, 0
 ; GFX90A-NEXT:    s_mov_b64 s[6:7], src_private_base
+; GFX90A-NEXT:    s_addc_u32 s5, s17, 0
 ; GFX90A-NEXT:    s_cmp_eq_u32 s5, s7
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def a[0:1]
@@ -14555,8 +14555,8 @@ define void @flat_atomic_add_i64_saddr_ret_a_a(ptr inreg %ptr) #0 {
 ; GFX950:       ; %bb.0:
 ; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX950-NEXT:    s_add_u32 s0, s0, 0x50
-; GFX950-NEXT:    s_addc_u32 s1, s1, 0
 ; GFX950-NEXT:    s_mov_b64 s[2:3], src_private_base
+; GFX950-NEXT:    s_addc_u32 s1, s1, 0
 ; GFX950-NEXT:    s_cmp_eq_u32 s1, s3
 ; GFX950-NEXT:    ;;#ASMSTART
 ; GFX950-NEXT:    ; def a[0:1]
@@ -14604,8 +14604,8 @@ define void @flat_atomic_add_i64_saddr_ret_av_av(ptr inreg %ptr) #0 {
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    s_add_u32 s4, s16, 0x50
-; GFX90A-NEXT:    s_addc_u32 s5, s17, 0
 ; GFX90A-NEXT:    s_mov_b64 s[6:7], src_private_base
+; GFX90A-NEXT:    s_addc_u32 s5, s17, 0
 ; GFX90A-NEXT:    s_cmp_eq_u32 s5, s7
 ; GFX90A-NEXT:    s_cselect_b64 s[6:7], -1, 0
 ; GFX90A-NEXT:    s_andn2_b64 vcc, exec, s[6:7]
@@ -14643,8 +14643,8 @@ define void @flat_atomic_add_i64_saddr_ret_av_av(ptr inreg %ptr) #0 {
 ; GFX950:       ; %bb.0:
 ; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX950-NEXT:    s_add_u32 s0, s0, 0x50
-; GFX950-NEXT:    s_addc_u32 s1, s1, 0
 ; GFX950-NEXT:    s_mov_b64 s[2:3], src_private_base
+; GFX950-NEXT:    s_addc_u32 s1, s1, 0
 ; GFX950-NEXT:    s_cmp_eq_u32 s1, s3
 ; GFX950-NEXT:    s_cselect_b64 s[2:3], -1, 0
 ; GFX950-NEXT:    s_andn2_b64 vcc, exec, s[2:3]
@@ -14685,8 +14685,8 @@ define void @flat_atomic_sub_i64_saddr_ret_a_a(ptr inreg %ptr) #0 {
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    s_add_u32 s4, s16, 0x50
-; GFX90A-NEXT:    s_addc_u32 s5, s17, 0
 ; GFX90A-NEXT:    s_mov_b64 s[6:7], src_private_base
+; GFX90A-NEXT:    s_addc_u32 s5, s17, 0
 ; GFX90A-NEXT:    s_cmp_eq_u32 s5, s7
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def a[0:1]
@@ -14732,8 +14732,8 @@ define void @flat_atomic_sub_i64_saddr_ret_a_a(ptr inreg %ptr) #0 {
 ; GFX950:       ; %bb.0:
 ; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX950-NEXT:    s_add_u32 s0, s0, 0x50
-; GFX950-NEXT:    s_addc_u32 s1, s1, 0
 ; GFX950-NEXT:    s_mov_b64 s[2:3], src_private_base
+; GFX950-NEXT:    s_addc_u32 s1, s1, 0
 ; GFX950-NEXT:    s_cmp_eq_u32 s1, s3
 ; GFX950-NEXT:    ;;#ASMSTART
 ; GFX950-NEXT:    ; def a[0:1]
@@ -14783,8 +14783,8 @@ define void @flat_atomic_sub_i64_saddr_ret_av_av(ptr inreg %ptr) #0 {
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    s_add_u32 s4, s16, 0x50
-; GFX90A-NEXT:    s_addc_u32 s5, s17, 0
 ; GFX90A-NEXT:    s_mov_b64 s[6:7], src_private_base
+; GFX90A-NEXT:    s_addc_u32 s5, s17, 0
 ; GFX90A-NEXT:    s_cmp_eq_u32 s5, s7
 ; GFX90A-NEXT:    s_cselect_b64 s[6:7], -1, 0
 ; GFX90A-NEXT:    s_andn2_b64 vcc, exec, s[6:7]
@@ -14822,8 +14822,8 @@ define void @flat_atomic_sub_i64_saddr_ret_av_av(ptr inreg %ptr) #0 {
 ; GFX950:       ; %bb.0:
 ; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX950-NEXT:    s_add_u32 s0, s0, 0x50
-; GFX950-NEXT:    s_addc_u32 s1, s1, 0
 ; GFX950-NEXT:    s_mov_b64 s[2:3], src_private_base
+; GFX950-NEXT:    s_addc_u32 s1, s1, 0
 ; GFX950-NEXT:    s_cmp_eq_u32 s1, s3
 ; GFX950-NEXT:    s_cselect_b64 s[2:3], -1, 0
 ; GFX950-NEXT:    s_andn2_b64 vcc, exec, s[2:3]
@@ -14866,8 +14866,8 @@ define void @flat_atomic_and_i64_saddr_ret_a_a(ptr inreg %ptr) #0 {
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    s_add_u32 s4, s16, 0x50
-; GFX90A-NEXT:    s_addc_u32 s5, s17, 0
 ; GFX90A-NEXT:    s_mov_b64 s[6:7], src_private_base
+; GFX90A-NEXT:    s_addc_u32 s5, s17, 0
 ; GFX90A-NEXT:    s_cmp_eq_u32 s5, s7
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def a[0:1]
@@ -14913,8 +14913,8 @@ define void @flat_atomic_and_i64_saddr_ret_a_a(ptr inreg %ptr) #0 {
 ; GFX950:       ; %bb.0:
 ; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX950-NEXT:    s_add_u32 s0, s0, 0x50
-; GFX950-NEXT:    s_addc_u32 s1, s1, 0
 ; GFX950-NEXT:    s_mov_b64 s[2:3], src_private_base
+; GFX950-NEXT:    s_addc_u32 s1, s1, 0
 ; GFX950-NEXT:    s_cmp_eq_u32 s1, s3
 ; GFX950-NEXT:    ;;#ASMSTART
 ; GFX950-NEXT:    ; def a[0:1]
@@ -14963,8 +14963,8 @@ define void @flat_atomic_and_i64_saddr_ret_av_av(ptr inreg %ptr) #0 {
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    s_add_u32 s4, s16, 0x50
-; GFX90A-NEXT:    s_addc_u32 s5, s17, 0
 ; GFX90A-NEXT:    s_mov_b64 s[6:7], src_private_base
+; GFX90A-NEXT:    s_addc_u32 s5, s17, 0
 ; GFX90A-NEXT:    s_cmp_eq_u32 s5, s7
 ; GFX90A-NEXT:    s_cselect_b64 s[6:7], -1, 0
 ; GFX90A-NEXT:    s_andn2_b64 vcc, exec, s[6:7]
@@ -15002,8 +15002,8 @@ define void @flat_atomic_and_i64_saddr_ret_av_av(ptr inreg %ptr) #0 {
 ; GFX950:       ; %bb.0:
 ; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX950-NEXT:    s_add_u32 s0, s0, 0x50
-; GFX950-NEXT:    s_addc_u32 s1, s1, 0
 ; GFX950-NEXT:    s_mov_b64 s[2:3], src_private_base
+; GFX950-NEXT:    s_addc_u32 s1, s1, 0
 ; GFX950-NEXT:    s_cmp_eq_u32 s1, s3
 ; GFX950-NEXT:    s_cselect_b64 s[2:3], -1, 0
 ; GFX950-NEXT:    s_andn2_b64 vcc, exec, s[2:3]
@@ -15045,8 +15045,8 @@ define void @flat_atomic_nand_i64_saddr_ret_a_a(ptr inreg %ptr) #0 {
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    s_add_u32 s4, s16, 0x50
-; GFX90A-NEXT:    s_addc_u32 s5, s17, 0
 ; GFX90A-NEXT:    s_mov_b64 s[6:7], src_private_base
+; GFX90A-NEXT:    s_addc_u32 s5, s17, 0
 ; GFX90A-NEXT:    s_cmp_eq_u32 s5, s7
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def a[0:1]
@@ -15109,8 +15109,8 @@ define void @flat_atomic_nand_i64_saddr_ret_a_a(ptr inreg %ptr) #0 {
 ; GFX950:       ; %bb.0:
 ; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX950-NEXT:    s_add_u32 s0, s0, 0x50
-; GFX950-NEXT:    s_addc_u32 s1, s1, 0
 ; GFX950-NEXT:    s_mov_b64 s[2:3], src_private_base
+; GFX950-NEXT:    s_addc_u32 s1, s1, 0
 ; GFX950-NEXT:    s_cmp_eq_u32 s1, s3
 ; GFX950-NEXT:    ;;#ASMSTART
 ; GFX950-NEXT:    ; def a[0:1]
@@ -15176,8 +15176,8 @@ define void @flat_atomic_nand_i64_saddr_ret_av_av(ptr inreg %ptr) #0 {
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    s_add_u32 s4, s16, 0x50
-; GFX90A-NEXT:    s_addc_u32 s5, s17, 0
 ; GFX90A-NEXT:    s_mov_b64 s[6:7], src_private_base
+; GFX90A-NEXT:    s_addc_u32 s5, s17, 0
 ; GFX90A-NEXT:    s_cmp_eq_u32 s5, s7
 ; GFX90A-NEXT:    s_cselect_b64 s[6:7], -1, 0
 ; GFX90A-NEXT:    s_andn2_b64 vcc, exec, s[6:7]
@@ -15234,8 +15234,8 @@ define void @flat_atomic_nand_i64_saddr_ret_av_av(ptr inreg %ptr) #0 {
 ; GFX950:       ; %bb.0:
 ; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX950-NEXT:    s_add_u32 s0, s0, 0x50
-; GFX950-NEXT:    s_addc_u32 s1, s1, 0
 ; GFX950-NEXT:    s_mov_b64 s[2:3], src_private_base
+; GFX950-NEXT:    s_addc_u32 s1, s1, 0
 ; GFX950-NEXT:    s_cmp_eq_u32 s1, s3
 ; GFX950-NEXT:    s_cselect_b64 s[2:3], -1, 0
 ; GFX950-NEXT:    s_andn2_b64 vcc, exec, s[2:3]
@@ -15295,8 +15295,8 @@ define void @flat_atomic_or_i64_saddr_ret_a_a(ptr inreg %ptr) #0 {
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    s_add_u32 s4, s16, 0x50
-; GFX90A-NEXT:    s_addc_u32 s5, s17, 0
 ; GFX90A-NEXT:    s_mov_b64 s[6:7], src_private_base
+; GFX90A-NEXT:    s_addc_u32 s5, s17, 0
 ; GFX90A-NEXT:    s_cmp_eq_u32 s5, s7
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def a[0:1]
@@ -15342,8 +15342,8 @@ define void @flat_atomic_or_i64_saddr_ret_a_a(ptr inreg %ptr) #0 {
 ; GFX950:       ; %bb.0:
 ; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX950-NEXT:    s_add_u32 s0, s0, 0x50
-; GFX950-NEXT:    s_addc_u32 s1, s1, 0
 ; GFX950-NEXT:    s_mov_b64 s[2:3], src_private_base
+; GFX950-NEXT:    s_addc_u32 s1, s1, 0
 ; GFX950-NEXT:    s_cmp_eq_u32 s1, s3
 ; GFX950-NEXT:    ;;#ASMSTART
 ; GFX950-NEXT:    ; def a[0:1]
@@ -15392,8 +15392,8 @@ define void @flat_atomic_or_i64_saddr_ret_av_av(ptr inreg %ptr) #0 {
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    s_add_u32 s4, s16, 0x50
-; GFX90A-NEXT:    s_addc_u32 s5, s17, 0
 ; GFX90A-NEXT:    s_mov_b64 s[6:7], src_private_base
+; GFX90A-NEXT:    s_addc_u32 s5, s17, 0
 ; GFX90A-NEXT:    s_cmp_eq_u32 s5, s7
 ; GFX90A-NEXT:    s_cselect_b64 s[6:7], -1, 0
 ; GFX90A-NEXT:    s_andn2_b64 vcc, exec, s[6:7]
@@ -15431,8 +15431,8 @@ define void @flat_atomic_or_i64_saddr_ret_av_av(ptr inreg %ptr) #0 {
 ; GFX950:       ; %bb.0:
 ; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX950-NEXT:    s_add_u32 s0, s0, 0x50
-; GFX950-NEXT:    s_addc_u32 s1, s1, 0
 ; GFX950-NEXT:    s_mov_b64 s[2:3], src_private_base
+; GFX950-NEXT:    s_addc_u32 s1, s1, 0
 ; GFX950-NEXT:    s_cmp_eq_u32 s1, s3
 ; GFX950-NEXT:    s_cselect_b64 s[2:3], -1, 0
 ; GFX950-NEXT:    s_andn2_b64 vcc, exec, s[2:3]
@@ -15474,8 +15474,8 @@ define void @flat_atomic_xor_i64_saddr_ret_a_a(ptr inreg %ptr) #0 {
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    s_add_u32 s4, s16, 0x50
-; GFX90A-NEXT:    s_addc_u32 s5, s17, 0
 ; GFX90A-NEXT:    s_mov_b64 s[6:7], src_private_base
+; GFX90A-NEXT:    s_addc_u32 s5, s17, 0
 ; GFX90A-NEXT:    s_cmp_eq_u32 s5, s7
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def a[0:1]
@@ -15521,8 +15521,8 @@ define void @flat_atomic_xor_i64_saddr_ret_a_a(ptr inreg %ptr) #0 {
 ; GFX950:       ; %bb.0:
 ; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX950-NEXT:    s_add_u32 s0, s0, 0x50
-; GFX950-NEXT:    s_addc_u32 s1, s1, 0
 ; GFX950-NEXT:    s_mov_b64 s[2:3], src_private_base
+; GFX950-NEXT:    s_addc_u32 s1, s1, 0
 ; GFX950-NEXT:    s_cmp_eq_u32 s1, s3
 ; GFX950-NEXT:    ;;#ASMSTART
 ; GFX950-NEXT:    ; def a[0:1]
@@ -15571,8 +15571,8 @@ define void @flat_atomic_xor_i64_saddr_ret_av_av(ptr inreg %ptr) #0 {
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    s_add_u32 s4, s16, 0x50
-; GFX90A-NEXT:    s_addc_u32 s5, s17, 0
 ; GFX90A-NEXT:    s_mov_b64 s[6:7], src_private_base
+; GFX90A-NEXT:    s_addc_u32 s5, s17, 0
 ; GFX90A-NEXT:    s_cmp_eq_u32 s5, s7
 ; GFX90A-NEXT:    s_cselect_b64 s[6:7], -1, 0
 ; GFX90A-NEXT:    s_andn2_b64 vcc, exec, s[6:7]
@@ -15610,8 +15610,8 @@ define void @flat_atomic_xor_i64_saddr_ret_av_av(ptr inreg %ptr) #0 {
 ; GFX950:       ; %bb.0:
 ; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX950-NEXT:    s_add_u32 s0, s0, 0x50
-; GFX950-NEXT:    s_addc_u32 s1, s1, 0
 ; GFX950-NEXT:    s_mov_b64 s[2:3], src_private_base
+; GFX950-NEXT:    s_addc_u32 s1, s1, 0
 ; GFX950-NEXT:    s_cmp_eq_u32 s1, s3
 ; GFX950-NEXT:    s_cselect_b64 s[2:3], -1, 0
 ; GFX950-NEXT:    s_andn2_b64 vcc, exec, s[2:3]
@@ -15653,8 +15653,8 @@ define void @flat_atomic_max_i64_saddr_ret_a_a(ptr inreg %ptr) #0 {
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    s_add_u32 s4, s16, 0x50
-; GFX90A-NEXT:    s_addc_u32 s5, s17, 0
 ; GFX90A-NEXT:    s_mov_b64 s[6:7], src_private_base
+; GFX90A-NEXT:    s_addc_u32 s5, s17, 0
 ; GFX90A-NEXT:    s_cmp_eq_u32 s5, s7
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def a[0:1]
@@ -15701,8 +15701,8 @@ define void @flat_atomic_max_i64_saddr_ret_a_a(ptr inreg %ptr) #0 {
 ; GFX950:       ; %bb.0:
 ; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX950-NEXT:    s_add_u32 s0, s0, 0x50
-; GFX950-NEXT:    s_addc_u32 s1, s1, 0
 ; GFX950-NEXT:    s_mov_b64 s[2:3], src_private_base
+; GFX950-NEXT:    s_addc_u32 s1, s1, 0
 ; GFX950-NEXT:    s_cmp_eq_u32 s1, s3
 ; GFX950-NEXT:    ;;#ASMSTART
 ; GFX950-NEXT:    ; def a[0:1]
@@ -15753,8 +15753,8 @@ define void @flat_atomic_max_i64_saddr_ret_av_av(ptr inreg %ptr) #0 {
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    s_add_u32 s4, s16, 0x50
-; GFX90A-NEXT:    s_addc_u32 s5, s17, 0
 ; GFX90A-NEXT:    s_mov_b64 s[6:7], src_private_base
+; GFX90A-NEXT:    s_addc_u32 s5, s17, 0
 ; GFX90A-NEXT:    s_cmp_eq_u32 s5, s7
 ; GFX90A-NEXT:    s_cselect_b64 s[6:7], -1, 0
 ; GFX90A-NEXT:    s_andn2_b64 vcc, exec, s[6:7]
@@ -15793,8 +15793,8 @@ define void @flat_atomic_max_i64_saddr_ret_av_av(ptr inreg %ptr) #0 {
 ; GFX950:       ; %bb.0:
 ; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX950-NEXT:    s_add_u32 s0, s0, 0x50
-; GFX950-NEXT:    s_addc_u32 s1, s1, 0
 ; GFX950-NEXT:    s_mov_b64 s[2:3], src_private_base
+; GFX950-NEXT:    s_addc_u32 s1, s1, 0
 ; GFX950-NEXT:    s_cmp_eq_u32 s1, s3
 ; GFX950-NEXT:    s_cselect_b64 s[2:3], -1, 0
 ; GFX950-NEXT:    s_andn2_b64 vcc, exec, s[2:3]
@@ -15838,8 +15838,8 @@ define void @flat_atomic_min_i64_saddr_ret_a_a(ptr inreg %ptr) #0 {
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    s_add_u32 s4, s16, 0x50
-; GFX90A-NEXT:    s_addc_u32 s5, s17, 0
 ; GFX90A-NEXT:    s_mov_b64 s[6:7], src_private_base
+; GFX90A-NEXT:    s_addc_u32 s5, s17, 0
 ; GFX90A-NEXT:    s_cmp_eq_u32 s5, s7
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def a[0:1]
@@ -15886,8 +15886,8 @@ define void @flat_atomic_min_i64_saddr_ret_a_a(ptr inreg %ptr) #0 {
 ; GFX950:       ; %bb.0:
 ; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX950-NEXT:    s_add_u32 s0, s0, 0x50
-; GFX950-NEXT:    s_addc_u32 s1, s1, 0
 ; GFX950-NEXT:    s_mov_b64 s[2:3], src_private_base
+; GFX950-NEXT:    s_addc_u32 s1, s1, 0
 ; GFX950-NEXT:    s_cmp_eq_u32 s1, s3
 ; GFX950-NEXT:    ;;#ASMSTART
 ; GFX950-NEXT:    ; def a[0:1]
@@ -15938,8 +15938,8 @@ define void @flat_atomic_min_i64_saddr_ret_av_av(ptr inreg %ptr) #0 {
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    s_add_u32 s4, s16, 0x50
-; GFX90A-NEXT:    s_addc_u32 s5, s17, 0
 ; GFX90A-NEXT:    s_mov_b64 s[6:7], src_private_base
+; GFX90A-NEXT:    s_addc_u32 s5, s17, 0
 ; GFX90A-NEXT:    s_cmp_eq_u32 s5, s7
 ; GFX90A-NEXT:    s_cselect_b64 s[6:7], -1, 0
 ; GFX90A-NEXT:    s_andn2_b64 vcc, exec, s[6:7]
@@ -15978,8 +15978,8 @@ define void @flat_atomic_min_i64_saddr_ret_av_av(ptr inreg %ptr) #0 {
 ; GFX950:       ; %bb.0:
 ; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX950-NEXT:    s_add_u32 s0, s0, 0x50
-; GFX950-NEXT:    s_addc_u32 s1, s1, 0
 ; GFX950-NEXT:    s_mov_b64 s[2:3], src_private_base
+; GFX950-NEXT:    s_addc_u32 s1, s1, 0
 ; GFX950-NEXT:    s_cmp_eq_u32 s1, s3
 ; GFX950-NEXT:    s_cselect_b64 s[2:3], -1, 0
 ; GFX950-NEXT:    s_andn2_b64 vcc, exec, s[2:3]
@@ -16023,8 +16023,8 @@ define void @flat_atomic_umax_i64_saddr_ret_a_a(ptr inreg %ptr) #0 {
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    s_add_u32 s4, s16, 0x50
-; GFX90A-NEXT:    s_addc_u32 s5, s17, 0
 ; GFX90A-NEXT:    s_mov_b64 s[6:7], src_private_base
+; GFX90A-NEXT:    s_addc_u32 s5, s17, 0
 ; GFX90A-NEXT:    s_cmp_eq_u32 s5, s7
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def a[0:1]
@@ -16071,8 +16071,8 @@ define void @flat_atomic_umax_i64_saddr_ret_a_a(ptr inreg %ptr) #0 {
 ; GFX950:       ; %bb.0:
 ; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX950-NEXT:    s_add_u32 s0, s0, 0x50
-; GFX950-NEXT:    s_addc_u32 s1, s1, 0
 ; GFX950-NEXT:    s_mov_b64 s[2:3], src_private_base
+; GFX950-NEXT:    s_addc_u32 s1, s1, 0
 ; GFX950-NEXT:    s_cmp_eq_u32 s1, s3
 ; GFX950-NEXT:    ;;#ASMSTART
 ; GFX950-NEXT:    ; def a[0:1]
@@ -16123,8 +16123,8 @@ define void @flat_atomic_umax_i64_saddr_ret_av_av(ptr inreg %ptr) #0 {
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    s_add_u32 s4, s16, 0x50
-; GFX90A-NEXT:    s_addc_u32 s5, s17, 0
 ; GFX90A-NEXT:    s_mov_b64 s[6:7], src_private_base
+; GFX90A-NEXT:    s_addc_u32 s5, s17, 0
 ; GFX90A-NEXT:    s_cmp_eq_u32 s5, s7
 ; GFX90A-NEXT:    s_cselect_b64 s[6:7], -1, 0
 ; GFX90A-NEXT:    s_andn2_b64 vcc, exec, s[6:7]
@@ -16163,8 +16163,8 @@ define void @flat_atomic_umax_i64_saddr_ret_av_av(ptr inreg %ptr) #0 {
 ; GFX950:       ; %bb.0:
 ; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX950-NEXT:    s_add_u32 s0, s0, 0x50
-; GFX950-NEXT:    s_addc_u32 s1, s1, 0
 ; GFX950-NEXT:    s_mov_b64 s[2:3], src_private_base
+; GFX950-NEXT:    s_addc_u32 s1, s1, 0
 ; GFX950-NEXT:    s_cmp_eq_u32 s1, s3
 ; GFX950-NEXT:    s_cselect_b64 s[2:3], -1, 0
 ; GFX950-NEXT:    s_andn2_b64 vcc, exec, s[2:3]
@@ -16208,8 +16208,8 @@ define void @flat_atomic_umin_i64_saddr_ret_a_a(ptr inreg %ptr) #0 {
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    s_add_u32 s4, s16, 0x50
-; GFX90A-NEXT:    s_addc_u32 s5, s17, 0
 ; GFX90A-NEXT:    s_mov_b64 s[6:7], src_private_base
+; GFX90A-NEXT:    s_addc_u32 s5, s17, 0
 ; GFX90A-NEXT:    s_cmp_eq_u32 s5, s7
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def a[0:1]
@@ -16256,8 +16256,8 @@ define void @flat_atomic_umin_i64_saddr_ret_a_a(ptr inreg %ptr) #0 {
 ; GFX950:       ; %bb.0:
 ; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX950-NEXT:    s_add_u32 s0, s0, 0x50
-; GFX950-NEXT:    s_addc_u32 s1, s1, 0
 ; GFX950-NEXT:    s_mov_b64 s[2:3], src_private_base
+; GFX950-NEXT:    s_addc_u32 s1, s1, 0
 ; GFX950-NEXT:    s_cmp_eq_u32 s1, s3
 ; GFX950-NEXT:    ;;#ASMSTART
 ; GFX950-NEXT:    ; def a[0:1]
@@ -16308,8 +16308,8 @@ define void @flat_atomic_umin_i64_saddr_ret_av_av(ptr inreg %ptr) #0 {
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    s_add_u32 s4, s16, 0x50
-; GFX90A-NEXT:    s_addc_u32 s5, s17, 0
 ; GFX90A-NEXT:    s_mov_b64 s[6:7], src_private_base
+; GFX90A-NEXT:    s_addc_u32 s5, s17, 0
 ; GFX90A-NEXT:    s_cmp_eq_u32 s5, s7
 ; GFX90A-NEXT:    s_cselect_b64 s[6:7], -1, 0
 ; GFX90A-NEXT:    s_andn2_b64 vcc, exec, s[6:7]
@@ -16348,8 +16348,8 @@ define void @flat_atomic_umin_i64_saddr_ret_av_av(ptr inreg %ptr) #0 {
 ; GFX950:       ; %bb.0:
 ; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX950-NEXT:    s_add_u32 s0, s0, 0x50
-; GFX950-NEXT:    s_addc_u32 s1, s1, 0
 ; GFX950-NEXT:    s_mov_b64 s[2:3], src_private_base
+; GFX950-NEXT:    s_addc_u32 s1, s1, 0
 ; GFX950-NEXT:    s_cmp_eq_u32 s1, s3
 ; GFX950-NEXT:    s_cselect_b64 s[2:3], -1, 0
 ; GFX950-NEXT:    s_andn2_b64 vcc, exec, s[2:3]
@@ -16393,8 +16393,8 @@ define void @flat_atomic_uinc_wrap_i64_saddr_ret_a_a(ptr inreg %ptr) #0 {
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    s_add_u32 s4, s16, 0x50
-; GFX90A-NEXT:    s_addc_u32 s5, s17, 0
 ; GFX90A-NEXT:    s_mov_b64 s[6:7], src_private_base
+; GFX90A-NEXT:    s_addc_u32 s5, s17, 0
 ; GFX90A-NEXT:    s_cmp_eq_u32 s5, s7
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def a[0:1]
@@ -16443,8 +16443,8 @@ define void @flat_atomic_uinc_wrap_i64_saddr_ret_a_a(ptr inreg %ptr) #0 {
 ; GFX950:       ; %bb.0:
 ; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX950-NEXT:    s_add_u32 s0, s0, 0x50
-; GFX950-NEXT:    s_addc_u32 s1, s1, 0
 ; GFX950-NEXT:    s_mov_b64 s[2:3], src_private_base
+; GFX950-NEXT:    s_addc_u32 s1, s1, 0
 ; GFX950-NEXT:    s_cmp_eq_u32 s1, s3
 ; GFX950-NEXT:    ;;#ASMSTART
 ; GFX950-NEXT:    ; def a[0:1]
@@ -16495,8 +16495,8 @@ define void @flat_atomic_uinc_wrap_i64_saddr_ret_av_av(ptr inreg %ptr) #0 {
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    s_add_u32 s4, s16, 0x50
-; GFX90A-NEXT:    s_addc_u32 s5, s17, 0
 ; GFX90A-NEXT:    s_mov_b64 s[6:7], src_private_base
+; GFX90A-NEXT:    s_addc_u32 s5, s17, 0
 ; GFX90A-NEXT:    s_cmp_eq_u32 s5, s7
 ; GFX90A-NEXT:    s_cselect_b64 s[6:7], -1, 0
 ; GFX90A-NEXT:    s_andn2_b64 vcc, exec, s[6:7]
@@ -16537,8 +16537,8 @@ define void @flat_atomic_uinc_wrap_i64_saddr_ret_av_av(ptr inreg %ptr) #0 {
 ; GFX950:       ; %bb.0:
 ; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX950-NEXT:    s_add_u32 s0, s0, 0x50
-; GFX950-NEXT:    s_addc_u32 s1, s1, 0
 ; GFX950-NEXT:    s_mov_b64 s[2:3], src_private_base
+; GFX950-NEXT:    s_addc_u32 s1, s1, 0
 ; GFX950-NEXT:    s_cmp_eq_u32 s1, s3
 ; GFX950-NEXT:    s_cselect_b64 s[2:3], -1, 0
 ; GFX950-NEXT:    s_andn2_b64 vcc, exec, s[2:3]
@@ -16583,8 +16583,8 @@ define void @flat_atomic_udec_wrap_i64_saddr_ret_a_a(ptr inreg %ptr) #0 {
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    s_add_u32 s4, s16, 0x50
-; GFX90A-NEXT:    s_addc_u32 s5, s17, 0
 ; GFX90A-NEXT:    s_mov_b64 s[6:7], src_private_base
+; GFX90A-NEXT:    s_addc_u32 s5, s17, 0
 ; GFX90A-NEXT:    s_cmp_eq_u32 s5, s7
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def a[0:1]
@@ -16635,8 +16635,8 @@ define void @flat_atomic_udec_wrap_i64_saddr_ret_a_a(ptr inreg %ptr) #0 {
 ; GFX950:       ; %bb.0:
 ; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX950-NEXT:    s_add_u32 s0, s0, 0x50
-; GFX950-NEXT:    s_addc_u32 s1, s1, 0
 ; GFX950-NEXT:    s_mov_b64 s[2:3], src_private_base
+; GFX950-NEXT:    s_addc_u32 s1, s1, 0
 ; GFX950-NEXT:    s_cmp_eq_u32 s1, s3
 ; GFX950-NEXT:    ;;#ASMSTART
 ; GFX950-NEXT:    ; def a[0:1]
@@ -16689,8 +16689,8 @@ define void @flat_atomic_udec_wrap_i64_saddr_ret_av_av(ptr inreg %ptr) #0 {
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    s_add_u32 s4, s16, 0x50
-; GFX90A-NEXT:    s_addc_u32 s5, s17, 0
 ; GFX90A-NEXT:    s_mov_b64 s[6:7], src_private_base
+; GFX90A-NEXT:    s_addc_u32 s5, s17, 0
 ; GFX90A-NEXT:    s_cmp_eq_u32 s5, s7
 ; GFX90A-NEXT:    s_cselect_b64 s[6:7], -1, 0
 ; GFX90A-NEXT:    s_andn2_b64 vcc, exec, s[6:7]
@@ -16733,8 +16733,8 @@ define void @flat_atomic_udec_wrap_i64_saddr_ret_av_av(ptr inreg %ptr) #0 {
 ; GFX950:       ; %bb.0:
 ; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX950-NEXT:    s_add_u32 s0, s0, 0x50
-; GFX950-NEXT:    s_addc_u32 s1, s1, 0
 ; GFX950-NEXT:    s_mov_b64 s[2:3], src_private_base
+; GFX950-NEXT:    s_addc_u32 s1, s1, 0
 ; GFX950-NEXT:    s_cmp_eq_u32 s1, s3
 ; GFX950-NEXT:    s_cselect_b64 s[2:3], -1, 0
 ; GFX950-NEXT:    s_andn2_b64 vcc, exec, s[2:3]
@@ -16780,8 +16780,8 @@ define void @flat_atomic_usub_cond_i64_saddr_ret_a_a(ptr inreg %ptr) #0 {
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    s_add_u32 s4, s16, 0x50
-; GFX90A-NEXT:    s_addc_u32 s5, s17, 0
 ; GFX90A-NEXT:    s_mov_b64 s[6:7], src_private_base
+; GFX90A-NEXT:    s_addc_u32 s5, s17, 0
 ; GFX90A-NEXT:    s_cmp_eq_u32 s5, s7
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def a[0:1]
@@ -16846,8 +16846,8 @@ define void @flat_atomic_usub_cond_i64_saddr_ret_a_a(ptr inreg %ptr) #0 {
 ; GFX950:       ; %bb.0:
 ; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX950-NEXT:    s_add_u32 s0, s0, 0x50
-; GFX950-NEXT:    s_addc_u32 s1, s1, 0
 ; GFX950-NEXT:    s_mov_b64 s[2:3], src_private_base
+; GFX950-NEXT:    s_addc_u32 s1, s1, 0
 ; GFX950-NEXT:    s_cmp_eq_u32 s1, s3
 ; GFX950-NEXT:    ;;#ASMSTART
 ; GFX950-NEXT:    ; def a[0:1]
@@ -16918,8 +16918,8 @@ define void @flat_atomic_usub_cond_i64_saddr_ret_av_av(ptr inreg %ptr) #0 {
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    s_add_u32 s4, s16, 0x50
-; GFX90A-NEXT:    s_addc_u32 s5, s17, 0
 ; GFX90A-NEXT:    s_mov_b64 s[6:7], src_private_base
+; GFX90A-NEXT:    s_addc_u32 s5, s17, 0
 ; GFX90A-NEXT:    s_cmp_eq_u32 s5, s7
 ; GFX90A-NEXT:    s_cselect_b64 s[6:7], -1, 0
 ; GFX90A-NEXT:    s_andn2_b64 vcc, exec, s[6:7]
@@ -16978,8 +16978,8 @@ define void @flat_atomic_usub_cond_i64_saddr_ret_av_av(ptr inreg %ptr) #0 {
 ; GFX950:       ; %bb.0:
 ; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX950-NEXT:    s_add_u32 s0, s0, 0x50
-; GFX950-NEXT:    s_addc_u32 s1, s1, 0
 ; GFX950-NEXT:    s_mov_b64 s[2:3], src_private_base
+; GFX950-NEXT:    s_addc_u32 s1, s1, 0
 ; GFX950-NEXT:    s_cmp_eq_u32 s1, s3
 ; GFX950-NEXT:    s_cselect_b64 s[2:3], -1, 0
 ; GFX950-NEXT:    s_andn2_b64 vcc, exec, s[2:3]
@@ -17045,8 +17045,8 @@ define void @flat_atomic_usub_sat_i64_saddr_ret_a_a(ptr inreg %ptr) #0 {
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    s_add_u32 s4, s16, 0x50
-; GFX90A-NEXT:    s_addc_u32 s5, s17, 0
 ; GFX90A-NEXT:    s_mov_b64 s[6:7], src_private_base
+; GFX90A-NEXT:    s_addc_u32 s5, s17, 0
 ; GFX90A-NEXT:    s_cmp_eq_u32 s5, s7
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def a[0:1]
@@ -17111,8 +17111,8 @@ define void @flat_atomic_usub_sat_i64_saddr_ret_a_a(ptr inreg %ptr) #0 {
 ; GFX950:       ; %bb.0:
 ; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX950-NEXT:    s_add_u32 s0, s0, 0x50
-; GFX950-NEXT:    s_addc_u32 s1, s1, 0
 ; GFX950-NEXT:    s_mov_b64 s[2:3], src_private_base
+; GFX950-NEXT:    s_addc_u32 s1, s1, 0
 ; GFX950-NEXT:    s_cmp_eq_u32 s1, s3
 ; GFX950-NEXT:    ;;#ASMSTART
 ; GFX950-NEXT:    ; def a[0:1]
@@ -17183,8 +17183,8 @@ define void @flat_atomic_usub_sat_i64_saddr_ret_av_av(ptr inreg %ptr) #0 {
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    s_add_u32 s4, s16, 0x50
-; GFX90A-NEXT:    s_addc_u32 s5, s17, 0
 ; GFX90A-NEXT:    s_mov_b64 s[6:7], src_private_base
+; GFX90A-NEXT:    s_addc_u32 s5, s17, 0
 ; GFX90A-NEXT:    s_cmp_eq_u32 s5, s7
 ; GFX90A-NEXT:    s_cselect_b64 s[6:7], -1, 0
 ; GFX90A-NEXT:    s_andn2_b64 vcc, exec, s[6:7]
@@ -17243,8 +17243,8 @@ define void @flat_atomic_usub_sat_i64_saddr_ret_av_av(ptr inreg %ptr) #0 {
 ; GFX950:       ; %bb.0:
 ; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX950-NEXT:    s_add_u32 s0, s0, 0x50
-; GFX950-NEXT:    s_addc_u32 s1, s1, 0
 ; GFX950-NEXT:    s_mov_b64 s[2:3], src_private_base
+; GFX950-NEXT:    s_addc_u32 s1, s1, 0
 ; GFX950-NEXT:    s_cmp_eq_u32 s1, s3
 ; GFX950-NEXT:    s_cselect_b64 s[2:3], -1, 0
 ; GFX950-NEXT:    s_andn2_b64 vcc, exec, s[2:3]
@@ -17314,8 +17314,8 @@ define void @flat_atomic_fadd_f32_saddr_ret_a_a(ptr inreg %ptr) #0 {
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    s_add_u32 s4, s16, 40
-; GFX90A-NEXT:    s_addc_u32 s5, s17, 0
 ; GFX90A-NEXT:    s_mov_b64 s[6:7], src_shared_base
+; GFX90A-NEXT:    s_addc_u32 s5, s17, 0
 ; GFX90A-NEXT:    s_cmp_eq_u32 s5, s7
 ; GFX90A-NEXT:    s_cselect_b64 s[6:7], -1, 0
 ; GFX90A-NEXT:    ;;#ASMSTART
@@ -17396,8 +17396,8 @@ define void @flat_atomic_fadd_f32_saddr_ret_av_av(ptr inreg %ptr) #0 {
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    s_add_u32 s4, s16, 40
-; GFX90A-NEXT:    s_addc_u32 s5, s17, 0
 ; GFX90A-NEXT:    s_mov_b64 s[6:7], src_shared_base
+; GFX90A-NEXT:    s_addc_u32 s5, s17, 0
 ; GFX90A-NEXT:    s_cmp_eq_u32 s5, s7
 ; GFX90A-NEXT:    s_cselect_b64 s[6:7], -1, 0
 ; GFX90A-NEXT:    s_andn2_b64 vcc, exec, s[6:7]
@@ -18141,8 +18141,8 @@ define void @flat_atomic_fadd_f64_saddr_ret_a_a(ptr inreg %ptr) #0 {
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    s_add_u32 s4, s16, 0x50
-; GFX90A-NEXT:    s_addc_u32 s5, s17, 0
 ; GFX90A-NEXT:    s_mov_b64 s[6:7], src_shared_base
+; GFX90A-NEXT:    s_addc_u32 s5, s17, 0
 ; GFX90A-NEXT:    s_cmp_eq_u32 s5, s7
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def a[0:1]
@@ -18205,8 +18205,8 @@ define void @flat_atomic_fadd_f64_saddr_ret_a_a(ptr inreg %ptr) #0 {
 ; GFX950:       ; %bb.0:
 ; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX950-NEXT:    s_add_u32 s0, s0, 0x50
-; GFX950-NEXT:    s_addc_u32 s1, s1, 0
 ; GFX950-NEXT:    s_mov_b64 s[2:3], src_shared_base
+; GFX950-NEXT:    s_addc_u32 s1, s1, 0
 ; GFX950-NEXT:    s_cmp_eq_u32 s1, s3
 ; GFX950-NEXT:    ;;#ASMSTART
 ; GFX950-NEXT:    ; def a[0:1]
@@ -18272,8 +18272,8 @@ define void @flat_atomic_fadd_f64_saddr_ret_av_av(ptr inreg %ptr) #0 {
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    s_add_u32 s4, s16, 0x50
-; GFX90A-NEXT:    s_addc_u32 s5, s17, 0
 ; GFX90A-NEXT:    s_mov_b64 s[6:7], src_shared_base
+; GFX90A-NEXT:    s_addc_u32 s5, s17, 0
 ; GFX90A-NEXT:    s_cmp_eq_u32 s5, s7
 ; GFX90A-NEXT:    s_cselect_b64 s[6:7], -1, 0
 ; GFX90A-NEXT:    s_andn2_b64 vcc, exec, s[6:7]
@@ -18327,8 +18327,8 @@ define void @flat_atomic_fadd_f64_saddr_ret_av_av(ptr inreg %ptr) #0 {
 ; GFX950:       ; %bb.0:
 ; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX950-NEXT:    s_add_u32 s0, s0, 0x50
-; GFX950-NEXT:    s_addc_u32 s1, s1, 0
 ; GFX950-NEXT:    s_mov_b64 s[2:3], src_shared_base
+; GFX950-NEXT:    s_addc_u32 s1, s1, 0
 ; GFX950-NEXT:    s_cmp_eq_u32 s1, s3
 ; GFX950-NEXT:    s_cselect_b64 s[2:3], -1, 0
 ; GFX950-NEXT:    s_andn2_b64 vcc, exec, s[2:3]
@@ -18386,8 +18386,8 @@ define void @flat_atomic_fsub_f64_saddr_ret_a_a(ptr inreg %ptr) #0 {
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    s_add_u32 s4, s16, 0x50
-; GFX90A-NEXT:    s_addc_u32 s5, s17, 0
 ; GFX90A-NEXT:    s_mov_b64 s[6:7], src_private_base
+; GFX90A-NEXT:    s_addc_u32 s5, s17, 0
 ; GFX90A-NEXT:    s_cmp_eq_u32 s5, s7
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def a[0:1]
@@ -18444,8 +18444,8 @@ define void @flat_atomic_fsub_f64_saddr_ret_a_a(ptr inreg %ptr) #0 {
 ; GFX950:       ; %bb.0:
 ; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX950-NEXT:    s_add_u32 s0, s0, 0x50
-; GFX950-NEXT:    s_addc_u32 s1, s1, 0
 ; GFX950-NEXT:    s_mov_b64 s[2:3], src_private_base
+; GFX950-NEXT:    s_addc_u32 s1, s1, 0
 ; GFX950-NEXT:    s_cmp_eq_u32 s1, s3
 ; GFX950-NEXT:    ;;#ASMSTART
 ; GFX950-NEXT:    ; def a[0:1]
@@ -18505,8 +18505,8 @@ define void @flat_atomic_fsub_f64_saddr_ret_av_av(ptr inreg %ptr) #0 {
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    s_add_u32 s4, s16, 0x50
-; GFX90A-NEXT:    s_addc_u32 s5, s17, 0
 ; GFX90A-NEXT:    s_mov_b64 s[6:7], src_private_base
+; GFX90A-NEXT:    s_addc_u32 s5, s17, 0
 ; GFX90A-NEXT:    s_cmp_eq_u32 s5, s7
 ; GFX90A-NEXT:    s_cselect_b64 s[6:7], -1, 0
 ; GFX90A-NEXT:    s_andn2_b64 vcc, exec, s[6:7]
@@ -18556,8 +18556,8 @@ define void @flat_atomic_fsub_f64_saddr_ret_av_av(ptr inreg %ptr) #0 {
 ; GFX950:       ; %bb.0:
 ; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX950-NEXT:    s_add_u32 s0, s0, 0x50
-; GFX950-NEXT:    s_addc_u32 s1, s1, 0
 ; GFX950-NEXT:    s_mov_b64 s[2:3], src_private_base
+; GFX950-NEXT:    s_addc_u32 s1, s1, 0
 ; GFX950-NEXT:    s_cmp_eq_u32 s1, s3
 ; GFX950-NEXT:    s_cselect_b64 s[2:3], -1, 0
 ; GFX950-NEXT:    s_andn2_b64 vcc, exec, s[2:3]
@@ -18611,8 +18611,8 @@ define void @flat_atomic_fmax_f64_saddr_ret_a_a(ptr inreg %ptr) #0 {
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    s_add_u32 s4, s16, 0x50
-; GFX90A-NEXT:    s_addc_u32 s5, s17, 0
 ; GFX90A-NEXT:    s_mov_b64 s[6:7], src_private_base
+; GFX90A-NEXT:    s_addc_u32 s5, s17, 0
 ; GFX90A-NEXT:    s_cmp_eq_u32 s5, s7
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def a[0:1]
@@ -18659,8 +18659,8 @@ define void @flat_atomic_fmax_f64_saddr_ret_a_a(ptr inreg %ptr) #0 {
 ; GFX950:       ; %bb.0:
 ; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX950-NEXT:    s_add_u32 s0, s0, 0x50
-; GFX950-NEXT:    s_addc_u32 s1, s1, 0
 ; GFX950-NEXT:    s_mov_b64 s[2:3], src_private_base
+; GFX950-NEXT:    s_addc_u32 s1, s1, 0
 ; GFX950-NEXT:    s_cmp_eq_u32 s1, s3
 ; GFX950-NEXT:    ;;#ASMSTART
 ; GFX950-NEXT:    ; def a[0:1]
@@ -18710,8 +18710,8 @@ define void @flat_atomic_fmax_f64_saddr_ret_av_av(ptr inreg %ptr) #0 {
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    s_add_u32 s4, s16, 0x50
-; GFX90A-NEXT:    s_addc_u32 s5, s17, 0
 ; GFX90A-NEXT:    s_mov_b64 s[6:7], src_private_base
+; GFX90A-NEXT:    s_addc_u32 s5, s17, 0
 ; GFX90A-NEXT:    s_cmp_eq_u32 s5, s7
 ; GFX90A-NEXT:    s_cselect_b64 s[6:7], -1, 0
 ; GFX90A-NEXT:    s_andn2_b64 vcc, exec, s[6:7]
@@ -18750,8 +18750,8 @@ define void @flat_atomic_fmax_f64_saddr_ret_av_av(ptr inreg %ptr) #0 {
 ; GFX950:       ; %bb.0:
 ; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX950-NEXT:    s_add_u32 s0, s0, 0x50
-; GFX950-NEXT:    s_addc_u32 s1, s1, 0
 ; GFX950-NEXT:    s_mov_b64 s[2:3], src_private_base
+; GFX950-NEXT:    s_addc_u32 s1, s1, 0
 ; GFX950-NEXT:    s_cmp_eq_u32 s1, s3
 ; GFX950-NEXT:    s_cselect_b64 s[2:3], -1, 0
 ; GFX950-NEXT:    s_andn2_b64 vcc, exec, s[2:3]
@@ -18794,8 +18794,8 @@ define void @flat_atomic_fmin_f64_saddr_ret_a_a(ptr inreg %ptr) #0 {
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    s_add_u32 s4, s16, 0x50
-; GFX90A-NEXT:    s_addc_u32 s5, s17, 0
 ; GFX90A-NEXT:    s_mov_b64 s[6:7], src_private_base
+; GFX90A-NEXT:    s_addc_u32 s5, s17, 0
 ; GFX90A-NEXT:    s_cmp_eq_u32 s5, s7
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def a[0:1]
@@ -18842,8 +18842,8 @@ define void @flat_atomic_fmin_f64_saddr_ret_a_a(ptr inreg %ptr) #0 {
 ; GFX950:       ; %bb.0:
 ; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX950-NEXT:    s_add_u32 s0, s0, 0x50
-; GFX950-NEXT:    s_addc_u32 s1, s1, 0
 ; GFX950-NEXT:    s_mov_b64 s[2:3], src_private_base
+; GFX950-NEXT:    s_addc_u32 s1, s1, 0
 ; GFX950-NEXT:    s_cmp_eq_u32 s1, s3
 ; GFX950-NEXT:    ;;#ASMSTART
 ; GFX950-NEXT:    ; def a[0:1]
@@ -18893,8 +18893,8 @@ define void @flat_atomic_fmin_f64_saddr_ret_av_av(ptr inreg %ptr) #0 {
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    s_add_u32 s4, s16, 0x50
-; GFX90A-NEXT:    s_addc_u32 s5, s17, 0
 ; GFX90A-NEXT:    s_mov_b64 s[6:7], src_private_base
+; GFX90A-NEXT:    s_addc_u32 s5, s17, 0
 ; GFX90A-NEXT:    s_cmp_eq_u32 s5, s7
 ; GFX90A-NEXT:    s_cselect_b64 s[6:7], -1, 0
 ; GFX90A-NEXT:    s_andn2_b64 vcc, exec, s[6:7]
@@ -18933,8 +18933,8 @@ define void @flat_atomic_fmin_f64_saddr_ret_av_av(ptr inreg %ptr) #0 {
 ; GFX950:       ; %bb.0:
 ; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX950-NEXT:    s_add_u32 s0, s0, 0x50
-; GFX950-NEXT:    s_addc_u32 s1, s1, 0
 ; GFX950-NEXT:    s_mov_b64 s[2:3], src_private_base
+; GFX950-NEXT:    s_addc_u32 s1, s1, 0
 ; GFX950-NEXT:    s_cmp_eq_u32 s1, s3
 ; GFX950-NEXT:    s_cselect_b64 s[2:3], -1, 0
 ; GFX950-NEXT:    s_andn2_b64 vcc, exec, s[2:3]
@@ -18977,8 +18977,8 @@ define void @flat_atomic_fmaximum_f64_saddr_ret_a_a(ptr inreg %ptr) #0 {
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    s_add_u32 s4, s16, 0x50
-; GFX90A-NEXT:    s_addc_u32 s5, s17, 0
 ; GFX90A-NEXT:    s_mov_b64 s[6:7], src_private_base
+; GFX90A-NEXT:    s_addc_u32 s5, s17, 0
 ; GFX90A-NEXT:    s_cmp_eq_u32 s5, s7
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def a[0:1]
@@ -19043,8 +19043,8 @@ define void @flat_atomic_fmaximum_f64_saddr_ret_a_a(ptr inreg %ptr) #0 {
 ; GFX950:       ; %bb.0:
 ; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX950-NEXT:    s_add_u32 s0, s0, 0x50
-; GFX950-NEXT:    s_addc_u32 s1, s1, 0
 ; GFX950-NEXT:    s_mov_b64 s[2:3], src_private_base
+; GFX950-NEXT:    s_addc_u32 s1, s1, 0
 ; GFX950-NEXT:    s_cmp_eq_u32 s1, s3
 ; GFX950-NEXT:    ;;#ASMSTART
 ; GFX950-NEXT:    ; def a[0:1]
@@ -19113,8 +19113,8 @@ define void @flat_atomic_fmaximum_f64_saddr_ret_av_av(ptr inreg %ptr) #0 {
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    s_add_u32 s4, s16, 0x50
-; GFX90A-NEXT:    s_addc_u32 s5, s17, 0
 ; GFX90A-NEXT:    s_mov_b64 s[6:7], src_private_base
+; GFX90A-NEXT:    s_addc_u32 s5, s17, 0
 ; GFX90A-NEXT:    s_cmp_eq_u32 s5, s7
 ; GFX90A-NEXT:    s_cselect_b64 s[6:7], -1, 0
 ; GFX90A-NEXT:    s_andn2_b64 vcc, exec, s[6:7]
@@ -19172,8 +19172,8 @@ define void @flat_atomic_fmaximum_f64_saddr_ret_av_av(ptr inreg %ptr) #0 {
 ; GFX950:       ; %bb.0:
 ; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX950-NEXT:    s_add_u32 s0, s0, 0x50
-; GFX950-NEXT:    s_addc_u32 s1, s1, 0
 ; GFX950-NEXT:    s_mov_b64 s[2:3], src_private_base
+; GFX950-NEXT:    s_addc_u32 s1, s1, 0
 ; GFX950-NEXT:    s_cmp_eq_u32 s1, s3
 ; GFX950-NEXT:    s_cselect_b64 s[2:3], -1, 0
 ; GFX950-NEXT:    s_andn2_b64 vcc, exec, s[2:3]
@@ -19237,8 +19237,8 @@ define void @flat_atomic_fminimum_f64_saddr_ret_a_a(ptr inreg %ptr) #0 {
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    s_add_u32 s4, s16, 0x50
-; GFX90A-NEXT:    s_addc_u32 s5, s17, 0
 ; GFX90A-NEXT:    s_mov_b64 s[6:7], src_private_base
+; GFX90A-NEXT:    s_addc_u32 s5, s17, 0
 ; GFX90A-NEXT:    s_cmp_eq_u32 s5, s7
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def a[0:1]
@@ -19303,8 +19303,8 @@ define void @flat_atomic_fminimum_f64_saddr_ret_a_a(ptr inreg %ptr) #0 {
 ; GFX950:       ; %bb.0:
 ; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX950-NEXT:    s_add_u32 s0, s0, 0x50
-; GFX950-NEXT:    s_addc_u32 s1, s1, 0
 ; GFX950-NEXT:    s_mov_b64 s[2:3], src_private_base
+; GFX950-NEXT:    s_addc_u32 s1, s1, 0
 ; GFX950-NEXT:    s_cmp_eq_u32 s1, s3
 ; GFX950-NEXT:    ;;#ASMSTART
 ; GFX950-NEXT:    ; def a[0:1]
@@ -19373,8 +19373,8 @@ define void @flat_atomic_fminimum_f64_saddr_ret_av_av(ptr inreg %ptr) #0 {
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    s_add_u32 s4, s16, 0x50
-; GFX90A-NEXT:    s_addc_u32 s5, s17, 0
 ; GFX90A-NEXT:    s_mov_b64 s[6:7], src_private_base
+; GFX90A-NEXT:    s_addc_u32 s5, s17, 0
 ; GFX90A-NEXT:    s_cmp_eq_u32 s5, s7
 ; GFX90A-NEXT:    s_cselect_b64 s[6:7], -1, 0
 ; GFX90A-NEXT:    s_andn2_b64 vcc, exec, s[6:7]
@@ -19432,8 +19432,8 @@ define void @flat_atomic_fminimum_f64_saddr_ret_av_av(ptr inreg %ptr) #0 {
 ; GFX950:       ; %bb.0:
 ; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX950-NEXT:    s_add_u32 s0, s0, 0x50
-; GFX950-NEXT:    s_addc_u32 s1, s1, 0
 ; GFX950-NEXT:    s_mov_b64 s[2:3], src_private_base
+; GFX950-NEXT:    s_addc_u32 s1, s1, 0
 ; GFX950-NEXT:    s_cmp_eq_u32 s1, s3
 ; GFX950-NEXT:    s_cselect_b64 s[2:3], -1, 0
 ; GFX950-NEXT:    s_andn2_b64 vcc, exec, s[2:3]

diff --git a/llvm/test/CodeGen/AMDGPU/addrspacecast.ll b/llvm/test/CodeGen/AMDGPU/addrspacecast.ll
index a6a0a9a3c9015..4df82946343b5 100644
--- a/llvm/test/CodeGen/AMDGPU/addrspacecast.ll
+++ b/llvm/test/CodeGen/AMDGPU/addrspacecast.ll
@@ -8,8 +8,8 @@ target triple = "amdgcn-amd-amdhsa"
 ; CI-DAG: s_load_dword [[PTR:s[0-9]+]], s[6:7], 0x0{{$}}
 ; CI-DAG: s_load_dword [[APERTURE:s[0-9]+]], s[4:5], 0x10{{$}}
 ; CI-DAG: s_cmp_lg_u32 [[PTR]], -1
-; CI-DAG: s_cselect_b32 s[[HI:[0-9]+]], [[APERTURE]], 0
-; CI-DAG: s_cselect_b32 s[[LO:[0-9]+]], [[PTR]], 0
+; CI-DAG: s_cselect_b32 s[[SHI:[0-9]+]], [[APERTURE]], 0
+; CI-DAG: s_cselect_b32 s[[SLO:[0-9]+]], [[PTR]], 0
 
 ; GFX9-DAG: s_mov_b64 s[{{[0-9]+}}:[[HIBASE:[0-9]+]]], src_shared_base
 
@@ -17,10 +17,13 @@ target triple = "amdgcn-amd-amdhsa"
 ; GFX9-DAG: s_load_dword [[PTR:s[0-9]+]], s[4:5], 0x0{{$}}
 
 ; GFX9: s_cmp_lg_u32 [[PTR]], -1
-; GFX9-DAG: s_cselect_b32 s[[LO:[0-9]+]], s[[HIBASE]], 0
-; GFX9-DAG: s_cselect_b32 s[[HI:[0-9]+]], [[PTR]], 0
+; GFX9-DAG: s_cselect_b32 s[[SHI:[0-9]+]], s[[HIBASE]], 0
+; GFX9-DAG: s_cselect_b32 s[[SLO:[0-9]+]], [[PTR]], 0
 
-; HSA: flat_store_dword v[[[LO]]:[[HI]]], [[K]]
+; HSA-DAG: v_mov_b32_e32 v[[VLO:[0-9]+]], s[[SLO]]
+; HSA-DAG: v_mov_b32_e32 v[[VHI:[0-9]+]], s[[SHI]]
+
+; HSA: flat_store_dword v[[[VLO]]:[[VHI]]], [[K]]
 
 ; HSA:  .amdhsa_user_sgpr_private_segment_buffer 1
 ; HSA:  .amdhsa_user_sgpr_dispatch_ptr 0
@@ -68,18 +71,21 @@ define void @use_group_to_flat_addrspacecast_func(ptr addrspace(3) %ptr) #0 {
 
 ; CI-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 7
 ; CI-DAG: s_cmp_lg_u32 [[PTR]], -1
-; CI-DAG: s_cselect_b32 s[[HI:[0-9]+]], [[APERTURE]], 0
-; CI-DAG: s_cselect_b32 s[[LO:[0-9]+]], [[PTR]], 0
+; CI-DAG: s_cselect_b32 s[[SHI:[0-9]+]], [[APERTURE]], 0
+; CI-DAG: s_cselect_b32 s[[SLO:[0-9]+]], [[PTR]], 0
 
 ; GFX9-DAG: s_load_dword [[PTR:s[0-9]+]], s[4:5], 0x0{{$}}
 ; GFX9-DAG: s_mov_b64 s[{{[0-9]+}}:[[HIBASE:[0-9]+]]], src_private_base
 
 ; GFX9-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 7
 ; GFX9: s_cmp_lg_u32 [[PTR]], -1
-; GFX9: s_cselect_b32 s[[LO:[0-9]+]], s[[HIBASE]], 0
-; GFX9: s_cselect_b32 s[[HI:[0-9]+]], [[PTR]], 0
+; GFX9: s_cselect_b32 s[[SHI:[0-9]+]], s[[HIBASE]], 0
+; GFX9: s_cselect_b32 s[[SLO:[0-9]+]], [[PTR]], 0
 
-; HSA: flat_store_dword v[[[LO]]:[[HI]]], [[K]]
+; HSA-DAG: v_mov_b32_e32 v[[VHI:[0-9]+]], s[[SHI]]
+; HSA-DAG: v_mov_b32_e32 v[[VLO:[0-9]+]], s[[SLO]]
+
+; HSA: flat_store_dword v[[[VLO]]:[[VHI]]], [[K]]
 
 ; HSA:  .amdhsa_user_sgpr_private_segment_buffer 1
 ; HSA:  .amdhsa_user_sgpr_dispatch_ptr 0

diff --git a/llvm/test/CodeGen/AMDGPU/atomicrmw-expand.ll b/llvm/test/CodeGen/AMDGPU/atomicrmw-expand.ll
index e4323999d19c3..3c316f4acedb7 100644
--- a/llvm/test/CodeGen/AMDGPU/atomicrmw-expand.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomicrmw-expand.ll
@@ -456,12 +456,10 @@ define i64 @optnone_atomicrmw_add_i64_expand(i64 %val) #1 {
 ; GFX908:       ; %bb.0:
 ; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX908-NEXT:    s_mov_b64 s[4:5], src_private_base
-; GFX908-NEXT:    s_mov_b32 s6, 32
-; GFX908-NEXT:    s_lshr_b64 s[4:5], s[4:5], s6
 ; GFX908-NEXT:    s_getpc_b64 s[6:7]
 ; GFX908-NEXT:    s_add_u32 s6, s6, global@rel32@lo+4
 ; GFX908-NEXT:    s_addc_u32 s7, s7, global@rel32@hi+12
-; GFX908-NEXT:    s_cmp_eq_u32 s7, s4
+; GFX908-NEXT:    s_cmp_eq_u32 s7, s5
 ; GFX908-NEXT:    s_cselect_b64 s[4:5], -1, 0
 ; GFX908-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s[4:5]
 ; GFX908-NEXT:    s_mov_b64 s[4:5], -1
@@ -507,12 +505,10 @@ define i64 @optnone_atomicrmw_add_i64_expand(i64 %val) #1 {
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    s_mov_b64 s[4:5], src_private_base
-; GFX90A-NEXT:    s_mov_b32 s6, 32
-; GFX90A-NEXT:    s_lshr_b64 s[4:5], s[4:5], s6
 ; GFX90A-NEXT:    s_getpc_b64 s[6:7]
 ; GFX90A-NEXT:    s_add_u32 s6, s6, global@rel32@lo+4
 ; GFX90A-NEXT:    s_addc_u32 s7, s7, global@rel32@hi+12
-; GFX90A-NEXT:    s_cmp_eq_u32 s7, s4
+; GFX90A-NEXT:    s_cmp_eq_u32 s7, s5
 ; GFX90A-NEXT:    s_cselect_b64 s[4:5], -1, 0
 ; GFX90A-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s[4:5]
 ; GFX90A-NEXT:    s_mov_b64 s[4:5], -1
@@ -558,12 +554,10 @@ define i64 @optnone_atomicrmw_add_i64_expand(i64 %val) #1 {
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    s_mov_b64 s[0:1], src_private_base
-; GFX942-NEXT:    s_mov_b32 s2, 32
-; GFX942-NEXT:    s_lshr_b64 s[0:1], s[0:1], s2
 ; GFX942-NEXT:    s_getpc_b64 s[2:3]
 ; GFX942-NEXT:    s_add_u32 s2, s2, global@rel32@lo+4
 ; GFX942-NEXT:    s_addc_u32 s3, s3, global@rel32@hi+12
-; GFX942-NEXT:    s_cmp_eq_u32 s3, s0
+; GFX942-NEXT:    s_cmp_eq_u32 s3, s1
 ; GFX942-NEXT:    s_cselect_b64 s[0:1], -1, 0
 ; GFX942-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s[0:1]
 ; GFX942-NEXT:    s_mov_b64 s[0:1], -1
@@ -607,12 +601,10 @@ define i64 @optnone_atomicrmw_add_i64_expand(i64 %val) #1 {
 ; GFX1100:       ; %bb.0:
 ; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-NEXT:    s_mov_b64 s[0:1], src_private_base
-; GFX1100-NEXT:    s_mov_b32 s2, 32
-; GFX1100-NEXT:    s_lshr_b64 s[0:1], s[0:1], s2
 ; GFX1100-NEXT:    s_getpc_b64 s[2:3]
 ; GFX1100-NEXT:    s_add_u32 s2, s2, global@rel32@lo+4
 ; GFX1100-NEXT:    s_addc_u32 s3, s3, global@rel32@hi+12
-; GFX1100-NEXT:    s_cmp_eq_u32 s3, s0
+; GFX1100-NEXT:    s_cmp_eq_u32 s3, s1
 ; GFX1100-NEXT:    s_cselect_b32 s0, -1, 0
 ; GFX1100-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s0
 ; GFX1100-NEXT:    s_mov_b32 s0, -1
@@ -660,9 +652,6 @@ define i64 @optnone_atomicrmw_add_i64_expand(i64 %val) #1 {
 ; GFX1200-NEXT:    s_wait_bvhcnt 0x0
 ; GFX1200-NEXT:    s_wait_kmcnt 0x0
 ; GFX1200-NEXT:    s_mov_b64 s[0:1], src_private_base
-; GFX1200-NEXT:    s_mov_b32 s2, 32
-; GFX1200-NEXT:    s_wait_alu 0xfffe
-; GFX1200-NEXT:    s_lshr_b64 s[0:1], s[0:1], s2
 ; GFX1200-NEXT:    s_getpc_b64 s[2:3]
 ; GFX1200-NEXT:    s_wait_alu 0xfffe
 ; GFX1200-NEXT:    s_sext_i32_i16 s3, s3
@@ -670,7 +659,7 @@ define i64 @optnone_atomicrmw_add_i64_expand(i64 %val) #1 {
 ; GFX1200-NEXT:    s_wait_alu 0xfffe
 ; GFX1200-NEXT:    s_add_co_ci_u32 s3, s3, global@rel32@hi+24
 ; GFX1200-NEXT:    s_wait_alu 0xfffe
-; GFX1200-NEXT:    s_cmp_eq_u32 s3, s0
+; GFX1200-NEXT:    s_cmp_eq_u32 s3, s1
 ; GFX1200-NEXT:    s_cselect_b32 s0, -1, 0
 ; GFX1200-NEXT:    s_wait_alu 0xfffe
 ; GFX1200-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s0
@@ -731,12 +720,10 @@ define double @optnone_atomicrmw_fadd_f64_expand(double %val) #1 {
 ; GFX908:       ; %bb.0:
 ; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX908-NEXT:    s_mov_b64 s[4:5], src_private_base
-; GFX908-NEXT:    s_mov_b32 s6, 32
-; GFX908-NEXT:    s_lshr_b64 s[4:5], s[4:5], s6
 ; GFX908-NEXT:    s_getpc_b64 s[6:7]
 ; GFX908-NEXT:    s_add_u32 s6, s6, global@rel32@lo+4
 ; GFX908-NEXT:    s_addc_u32 s7, s7, global@rel32@hi+12
-; GFX908-NEXT:    s_cmp_eq_u32 s7, s4
+; GFX908-NEXT:    s_cmp_eq_u32 s7, s5
 ; GFX908-NEXT:    s_cselect_b64 s[4:5], -1, 0
 ; GFX908-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s[4:5]
 ; GFX908-NEXT:    s_mov_b64 s[4:5], -1
@@ -800,12 +787,10 @@ define double @optnone_atomicrmw_fadd_f64_expand(double %val) #1 {
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    s_mov_b64 s[4:5], src_shared_base
-; GFX90A-NEXT:    s_mov_b32 s6, 32
-; GFX90A-NEXT:    s_lshr_b64 s[4:5], s[4:5], s6
 ; GFX90A-NEXT:    s_getpc_b64 s[6:7]
 ; GFX90A-NEXT:    s_add_u32 s6, s6, global@rel32@lo+4
 ; GFX90A-NEXT:    s_addc_u32 s7, s7, global@rel32@hi+12
-; GFX90A-NEXT:    s_cmp_eq_u32 s7, s4
+; GFX90A-NEXT:    s_cmp_eq_u32 s7, s5
 ; GFX90A-NEXT:    s_cselect_b64 s[4:5], -1, 0
 ; GFX90A-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s[4:5]
 ; GFX90A-NEXT:    s_mov_b64 s[4:5], -1
@@ -825,12 +810,10 @@ define double @optnone_atomicrmw_fadd_f64_expand(double %val) #1 {
 ; GFX90A-NEXT:    s_branch .LBB5_10
 ; GFX90A-NEXT:  .LBB5_3: ; %atomicrmw.check.private
 ; GFX90A-NEXT:    s_mov_b64 s[4:5], src_private_base
-; GFX90A-NEXT:    s_mov_b32 s6, 32
-; GFX90A-NEXT:    s_lshr_b64 s[4:5], s[4:5], s6
 ; GFX90A-NEXT:    s_getpc_b64 s[6:7]
 ; GFX90A-NEXT:    s_add_u32 s6, s6, global@rel32@lo+4
 ; GFX90A-NEXT:    s_addc_u32 s7, s7, global@rel32@hi+12
-; GFX90A-NEXT:    s_cmp_eq_u32 s7, s4
+; GFX90A-NEXT:    s_cmp_eq_u32 s7, s5
 ; GFX90A-NEXT:    s_cselect_b64 s[4:5], -1, 0
 ; GFX90A-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s[4:5]
 ; GFX90A-NEXT:    s_mov_b64 s[4:5], -1
@@ -896,12 +879,10 @@ define double @optnone_atomicrmw_fadd_f64_expand(double %val) #1 {
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    s_mov_b64 s[0:1], src_shared_base
-; GFX942-NEXT:    s_mov_b32 s2, 32
-; GFX942-NEXT:    s_lshr_b64 s[0:1], s[0:1], s2
 ; GFX942-NEXT:    s_getpc_b64 s[2:3]
 ; GFX942-NEXT:    s_add_u32 s2, s2, global@rel32@lo+4
 ; GFX942-NEXT:    s_addc_u32 s3, s3, global@rel32@hi+12
-; GFX942-NEXT:    s_cmp_eq_u32 s3, s0
+; GFX942-NEXT:    s_cmp_eq_u32 s3, s1
 ; GFX942-NEXT:    s_cselect_b64 s[0:1], -1, 0
 ; GFX942-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s[0:1]
 ; GFX942-NEXT:    s_mov_b64 s[0:1], -1
@@ -921,12 +902,10 @@ define double @optnone_atomicrmw_fadd_f64_expand(double %val) #1 {
 ; GFX942-NEXT:    s_branch .LBB5_10
 ; GFX942-NEXT:  .LBB5_3: ; %atomicrmw.check.private
 ; GFX942-NEXT:    s_mov_b64 s[0:1], src_private_base
-; GFX942-NEXT:    s_mov_b32 s2, 32
-; GFX942-NEXT:    s_lshr_b64 s[0:1], s[0:1], s2
 ; GFX942-NEXT:    s_getpc_b64 s[2:3]
 ; GFX942-NEXT:    s_add_u32 s2, s2, global@rel32@lo+4
 ; GFX942-NEXT:    s_addc_u32 s3, s3, global@rel32@hi+12
-; GFX942-NEXT:    s_cmp_eq_u32 s3, s0
+; GFX942-NEXT:    s_cmp_eq_u32 s3, s1
 ; GFX942-NEXT:    s_cselect_b64 s[0:1], -1, 0
 ; GFX942-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s[0:1]
 ; GFX942-NEXT:    s_mov_b64 s[0:1], -1
@@ -990,12 +969,10 @@ define double @optnone_atomicrmw_fadd_f64_expand(double %val) #1 {
 ; GFX1100:       ; %bb.0:
 ; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-NEXT:    s_mov_b64 s[0:1], src_private_base
-; GFX1100-NEXT:    s_mov_b32 s2, 32
-; GFX1100-NEXT:    s_lshr_b64 s[0:1], s[0:1], s2
 ; GFX1100-NEXT:    s_getpc_b64 s[2:3]
 ; GFX1100-NEXT:    s_add_u32 s2, s2, global@rel32@lo+4
 ; GFX1100-NEXT:    s_addc_u32 s3, s3, global@rel32@hi+12
-; GFX1100-NEXT:    s_cmp_eq_u32 s3, s0
+; GFX1100-NEXT:    s_cmp_eq_u32 s3, s1
 ; GFX1100-NEXT:    s_cselect_b32 s0, -1, 0
 ; GFX1100-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s0
 ; GFX1100-NEXT:    s_mov_b32 s0, -1
@@ -1060,9 +1037,6 @@ define double @optnone_atomicrmw_fadd_f64_expand(double %val) #1 {
 ; GFX1200-NEXT:    s_wait_bvhcnt 0x0
 ; GFX1200-NEXT:    s_wait_kmcnt 0x0
 ; GFX1200-NEXT:    s_mov_b64 s[0:1], src_private_base
-; GFX1200-NEXT:    s_mov_b32 s2, 32
-; GFX1200-NEXT:    s_wait_alu 0xfffe
-; GFX1200-NEXT:    s_lshr_b64 s[0:1], s[0:1], s2
 ; GFX1200-NEXT:    s_getpc_b64 s[2:3]
 ; GFX1200-NEXT:    s_wait_alu 0xfffe
 ; GFX1200-NEXT:    s_sext_i32_i16 s3, s3
@@ -1070,7 +1044,7 @@ define double @optnone_atomicrmw_fadd_f64_expand(double %val) #1 {
 ; GFX1200-NEXT:    s_wait_alu 0xfffe
 ; GFX1200-NEXT:    s_add_co_ci_u32 s3, s3, global@rel32@hi+24
 ; GFX1200-NEXT:    s_wait_alu 0xfffe
-; GFX1200-NEXT:    s_cmp_eq_u32 s3, s0
+; GFX1200-NEXT:    s_cmp_eq_u32 s3, s1
 ; GFX1200-NEXT:    s_cselect_b32 s0, -1, 0
 ; GFX1200-NEXT:    s_wait_alu 0xfffe
 ; GFX1200-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s0

diff --git a/llvm/test/CodeGen/AMDGPU/attributor-flatscratchinit-undefined-behavior2.ll b/llvm/test/CodeGen/AMDGPU/attributor-flatscratchinit-undefined-behavior2.ll
index 51caa84450ff3..583b6fe0a81ca 100644
--- a/llvm/test/CodeGen/AMDGPU/attributor-flatscratchinit-undefined-behavior2.ll
+++ b/llvm/test/CodeGen/AMDGPU/attributor-flatscratchinit-undefined-behavior2.ll
@@ -134,57 +134,57 @@ define amdgpu_kernel void @with_private_to_flat_addrspacecast_cc_kernel(ptr addr
 ;
 ; GFX9-LABEL: with_private_to_flat_addrspacecast_cc_kernel:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dword s2, s[8:9], 0x0
 ; GFX9-NEXT:    s_mov_b64 s[0:1], src_private_base
+; GFX9-NEXT:    s_load_dword s0, s[8:9], 0x0
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_cmp_lg_u32 s2, -1
-; GFX9-NEXT:    s_cselect_b32 s0, s1, 0
-; GFX9-NEXT:    s_cselect_b32 s1, s2, 0
-; GFX9-NEXT:    v_mov_b32_e32 v0, s1
-; GFX9-NEXT:    v_mov_b32_e32 v1, s0
+; GFX9-NEXT:    s_cmp_lg_u32 s0, -1
+; GFX9-NEXT:    s_cselect_b32 s1, s1, 0
+; GFX9-NEXT:    s_cselect_b32 s0, s0, 0
+; GFX9-NEXT:    v_mov_b32_e32 v0, s0
+; GFX9-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX9-NEXT:    flat_store_dword v[0:1], v2
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX9-ARCH-FLAT-LABEL: with_private_to_flat_addrspacecast_cc_kernel:
 ; GFX9-ARCH-FLAT:       ; %bb.0:
-; GFX9-ARCH-FLAT-NEXT:    s_load_dword s2, s[4:5], 0x0
 ; GFX9-ARCH-FLAT-NEXT:    s_mov_b64 s[0:1], src_private_base
+; GFX9-ARCH-FLAT-NEXT:    s_load_dword s0, s[4:5], 0x0
 ; GFX9-ARCH-FLAT-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-ARCH-FLAT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-ARCH-FLAT-NEXT:    s_cmp_lg_u32 s2, -1
-; GFX9-ARCH-FLAT-NEXT:    s_cselect_b32 s0, s1, 0
-; GFX9-ARCH-FLAT-NEXT:    s_cselect_b32 s1, s2, 0
-; GFX9-ARCH-FLAT-NEXT:    v_mov_b32_e32 v0, s1
-; GFX9-ARCH-FLAT-NEXT:    v_mov_b32_e32 v1, s0
+; GFX9-ARCH-FLAT-NEXT:    s_cmp_lg_u32 s0, -1
+; GFX9-ARCH-FLAT-NEXT:    s_cselect_b32 s1, s1, 0
+; GFX9-ARCH-FLAT-NEXT:    s_cselect_b32 s0, s0, 0
+; GFX9-ARCH-FLAT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX9-ARCH-FLAT-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX9-ARCH-FLAT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX9-ARCH-FLAT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-ARCH-FLAT-NEXT:    s_endpgm
 ;
 ; GFX942-ARCH-FLAT-LABEL: with_private_to_flat_addrspacecast_cc_kernel:
 ; GFX942-ARCH-FLAT:       ; %bb.0:
-; GFX942-ARCH-FLAT-NEXT:    s_load_dword s2, s[4:5], 0x0
 ; GFX942-ARCH-FLAT-NEXT:    s_mov_b64 s[0:1], src_private_base
+; GFX942-ARCH-FLAT-NEXT:    s_load_dword s0, s[4:5], 0x0
 ; GFX942-ARCH-FLAT-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX942-ARCH-FLAT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX942-ARCH-FLAT-NEXT:    s_cmp_lg_u32 s2, -1
-; GFX942-ARCH-FLAT-NEXT:    s_cselect_b32 s0, s1, 0
-; GFX942-ARCH-FLAT-NEXT:    s_cselect_b32 s1, s2, 0
-; GFX942-ARCH-FLAT-NEXT:    v_mov_b32_e32 v0, s1
-; GFX942-ARCH-FLAT-NEXT:    v_mov_b32_e32 v1, s0
+; GFX942-ARCH-FLAT-NEXT:    s_cmp_lg_u32 s0, -1
+; GFX942-ARCH-FLAT-NEXT:    s_cselect_b32 s1, s1, 0
+; GFX942-ARCH-FLAT-NEXT:    s_cselect_b32 s0, s0, 0
+; GFX942-ARCH-FLAT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX942-ARCH-FLAT-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX942-ARCH-FLAT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX942-ARCH-FLAT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-ARCH-FLAT-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: with_private_to_flat_addrspacecast_cc_kernel:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dword s2, s[8:9], 0x0
 ; GFX10-NEXT:    s_mov_b64 s[0:1], src_private_base
+; GFX10-NEXT:    s_load_dword s0, s[8:9], 0x0
 ; GFX10-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    s_cmp_lg_u32 s2, -1
-; GFX10-NEXT:    s_cselect_b32 s0, s2, 0
+; GFX10-NEXT:    s_cmp_lg_u32 s0, -1
+; GFX10-NEXT:    s_cselect_b32 s0, s0, 0
 ; GFX10-NEXT:    s_cselect_b32 s1, s1, 0
 ; GFX10-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX10-NEXT:    v_mov_b32_e32 v1, s1
@@ -533,49 +533,49 @@ define amdgpu_kernel void @private_constant_expression_use(ptr addrspace(1) noca
 ;
 ; GFX9-LABEL: private_constant_expression_use:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
-; GFX9-NEXT:    s_mov_b64 s[2:3], src_private_base
+; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[8:9], 0x0
+; GFX9-NEXT:    s_mov_b64 s[0:1], src_private_base
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0x7b
-; GFX9-NEXT:    v_mov_b32_e32 v1, s3
+; GFX9-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[2:3]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX9-ARCH-FLAT-LABEL: private_constant_expression_use:
 ; GFX9-ARCH-FLAT:       ; %bb.0:
-; GFX9-ARCH-FLAT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX9-ARCH-FLAT-NEXT:    s_mov_b64 s[2:3], src_private_base
+; GFX9-ARCH-FLAT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
+; GFX9-ARCH-FLAT-NEXT:    s_mov_b64 s[0:1], src_private_base
 ; GFX9-ARCH-FLAT-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-ARCH-FLAT-NEXT:    v_mov_b32_e32 v0, 0x7b
-; GFX9-ARCH-FLAT-NEXT:    v_mov_b32_e32 v1, s3
+; GFX9-ARCH-FLAT-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX9-ARCH-FLAT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-ARCH-FLAT-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9-ARCH-FLAT-NEXT:    global_store_dwordx2 v2, v[0:1], s[2:3]
 ; GFX9-ARCH-FLAT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-ARCH-FLAT-NEXT:    s_endpgm
 ;
 ; GFX942-ARCH-FLAT-LABEL: private_constant_expression_use:
 ; GFX942-ARCH-FLAT:       ; %bb.0:
-; GFX942-ARCH-FLAT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX942-ARCH-FLAT-NEXT:    s_mov_b64 s[2:3], src_private_base
+; GFX942-ARCH-FLAT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
+; GFX942-ARCH-FLAT-NEXT:    s_mov_b64 s[0:1], src_private_base
 ; GFX942-ARCH-FLAT-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX942-ARCH-FLAT-NEXT:    v_mov_b32_e32 v0, 0x7b
-; GFX942-ARCH-FLAT-NEXT:    v_mov_b32_e32 v1, s3
+; GFX942-ARCH-FLAT-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX942-ARCH-FLAT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX942-ARCH-FLAT-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1
+; GFX942-ARCH-FLAT-NEXT:    global_store_dwordx2 v2, v[0:1], s[2:3] sc0 sc1
 ; GFX942-ARCH-FLAT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-ARCH-FLAT-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: private_constant_expression_use:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
-; GFX10-NEXT:    s_mov_b64 s[2:3], src_private_base
+; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[8:9], 0x0
+; GFX10-NEXT:    s_mov_b64 s[0:1], src_private_base
 ; GFX10-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX10-NEXT:    v_mov_b32_e32 v0, 0x7b
-; GFX10-NEXT:    v_mov_b32_e32 v1, s3
+; GFX10-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX10-NEXT:    global_store_dwordx2 v2, v[0:1], s[2:3]
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-NEXT:    s_endpgm
   store volatile ptr addrspacecast (ptr addrspace(5) inttoptr (i32 123 to ptr addrspace(5)) to ptr), ptr addrspace(1) %out, align 8
@@ -611,48 +611,48 @@ define amdgpu_kernel void @calls_intrin_ascast_cc_kernel(ptr addrspace(3) %ptr)
 ;
 ; GFX9-LABEL: calls_intrin_ascast_cc_kernel:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dword s2, s[8:9], 0x0
 ; GFX9-NEXT:    s_mov_b64 s[0:1], src_shared_base
+; GFX9-NEXT:    s_load_dword s0, s[8:9], 0x0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 7
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v0, s2
+; GFX9-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX9-NEXT:    flat_store_dword v[0:1], v2
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX9-ARCH-FLAT-LABEL: calls_intrin_ascast_cc_kernel:
 ; GFX9-ARCH-FLAT:       ; %bb.0:
-; GFX9-ARCH-FLAT-NEXT:    s_load_dword s2, s[4:5], 0x0
 ; GFX9-ARCH-FLAT-NEXT:    s_mov_b64 s[0:1], src_shared_base
+; GFX9-ARCH-FLAT-NEXT:    s_load_dword s0, s[4:5], 0x0
 ; GFX9-ARCH-FLAT-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX9-ARCH-FLAT-NEXT:    v_mov_b32_e32 v2, 7
 ; GFX9-ARCH-FLAT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-ARCH-FLAT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX9-ARCH-FLAT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX9-ARCH-FLAT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX9-ARCH-FLAT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-ARCH-FLAT-NEXT:    s_endpgm
 ;
 ; GFX942-ARCH-FLAT-LABEL: calls_intrin_ascast_cc_kernel:
 ; GFX942-ARCH-FLAT:       ; %bb.0:
-; GFX942-ARCH-FLAT-NEXT:    s_load_dword s2, s[4:5], 0x0
 ; GFX942-ARCH-FLAT-NEXT:    s_mov_b64 s[0:1], src_shared_base
+; GFX942-ARCH-FLAT-NEXT:    s_load_dword s0, s[4:5], 0x0
 ; GFX942-ARCH-FLAT-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX942-ARCH-FLAT-NEXT:    v_mov_b32_e32 v2, 7
 ; GFX942-ARCH-FLAT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX942-ARCH-FLAT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX942-ARCH-FLAT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX942-ARCH-FLAT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX942-ARCH-FLAT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-ARCH-FLAT-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: calls_intrin_ascast_cc_kernel:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dword s2, s[8:9], 0x0
 ; GFX10-NEXT:    s_mov_b64 s[0:1], src_shared_base
-; GFX10-NEXT:    v_mov_b32_e32 v2, 7
+; GFX10-NEXT:    s_load_dword s0, s[8:9], 0x0
 ; GFX10-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-NEXT:    v_mov_b32_e32 v2, 7
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    v_mov_b32_e32 v0, s2
+; GFX10-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX10-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-NEXT:    s_endpgm

diff --git a/llvm/test/CodeGen/AMDGPU/codegen-prepare-addrspacecast-non-null.ll b/llvm/test/CodeGen/AMDGPU/codegen-prepare-addrspacecast-non-null.ll
index 31c23b94a8de8..66d99b14e282d 100644
--- a/llvm/test/CodeGen/AMDGPU/codegen-prepare-addrspacecast-non-null.ll
+++ b/llvm/test/CodeGen/AMDGPU/codegen-prepare-addrspacecast-non-null.ll
@@ -226,9 +226,8 @@ define void @private_alloca_to_flat(ptr %ptr) {
 ; GISEL-ASM-LABEL: private_alloca_to_flat:
 ; GISEL-ASM:       ; %bb.0:
 ; GISEL-ASM-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-ASM-NEXT:    s_mov_b64 s[4:5], src_private_base
 ; GISEL-ASM-NEXT:    s_lshr_b32 s4, s32, 6
-; GISEL-ASM-NEXT:    s_mov_b64 s[6:7], src_private_base
-; GISEL-ASM-NEXT:    s_mov_b32 s5, s7
 ; GISEL-ASM-NEXT:    v_mov_b32_e32 v0, s4
 ; GISEL-ASM-NEXT:    v_mov_b32_e32 v2, 7
 ; GISEL-ASM-NEXT:    v_mov_b32_e32 v1, s5
@@ -330,21 +329,21 @@ define void @recursive_phis(i1 %cond, ptr addrspace(5) %ptr) {
 ; DAGISEL-ASM-NEXT:    v_and_b32_e32 v0, 0xffff, v1
 ; DAGISEL-ASM-NEXT:  ; %bb.2: ; %finallyendcf.split
 ; DAGISEL-ASM-NEXT:    s_or_b64 exec, exec, s[4:5]
-; DAGISEL-ASM-NEXT:    s_xor_b64 s[6:7], vcc, -1
-; DAGISEL-ASM-NEXT:    s_mov_b64 s[4:5], 0
-; DAGISEL-ASM-NEXT:    s_mov_b64 s[8:9], src_private_base
+; DAGISEL-ASM-NEXT:    s_mov_b64 s[4:5], src_private_base
+; DAGISEL-ASM-NEXT:    s_xor_b64 s[8:9], vcc, -1
+; DAGISEL-ASM-NEXT:    s_mov_b64 s[6:7], 0
 ; DAGISEL-ASM-NEXT:    v_mov_b32_e32 v2, 7
 ; DAGISEL-ASM-NEXT:  .LBB11_3: ; %finally
 ; DAGISEL-ASM-NEXT:    ; =>This Inner Loop Header: Depth=1
-; DAGISEL-ASM-NEXT:    s_and_b64 s[10:11], exec, s[6:7]
-; DAGISEL-ASM-NEXT:    s_or_b64 s[4:5], s[10:11], s[4:5]
-; DAGISEL-ASM-NEXT:    v_mov_b32_e32 v1, s9
+; DAGISEL-ASM-NEXT:    s_and_b64 s[10:11], exec, s[8:9]
+; DAGISEL-ASM-NEXT:    s_or_b64 s[6:7], s[10:11], s[6:7]
+; DAGISEL-ASM-NEXT:    v_mov_b32_e32 v1, s5
 ; DAGISEL-ASM-NEXT:    flat_store_dword v[0:1], v2
 ; DAGISEL-ASM-NEXT:    s_waitcnt vmcnt(0)
-; DAGISEL-ASM-NEXT:    s_andn2_b64 exec, exec, s[4:5]
+; DAGISEL-ASM-NEXT:    s_andn2_b64 exec, exec, s[6:7]
 ; DAGISEL-ASM-NEXT:    s_cbranch_execnz .LBB11_3
 ; DAGISEL-ASM-NEXT:  ; %bb.4: ; %end
-; DAGISEL-ASM-NEXT:    s_or_b64 exec, exec, s[4:5]
+; DAGISEL-ASM-NEXT:    s_or_b64 exec, exec, s[6:7]
 ; DAGISEL-ASM-NEXT:    s_waitcnt lgkmcnt(0)
 ; DAGISEL-ASM-NEXT:    s_setpc_b64 s[30:31]
 ;

diff --git a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fadd.ll b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fadd.ll
index ee87c65c00def..1a4a54b81c78f 100644
--- a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fadd.ll
+++ b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fadd.ll
@@ -254,8 +254,8 @@ define float @flat_agent_atomic_fadd_ret_f32__offset12b_pos__amdgpu_no_fine_grai
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    v_add_co_u32_e32 v4, vcc, 0x7fc, v0
-; GFX90A-NEXT:    v_addc_co_u32_e32 v5, vcc, 0, v1, vcc
 ; GFX90A-NEXT:    s_mov_b64 s[4:5], src_shared_base
+; GFX90A-NEXT:    v_addc_co_u32_e32 v5, vcc, 0, v1, vcc
 ; GFX90A-NEXT:    v_cmp_ne_u32_e32 vcc, s5, v5
 ; GFX90A-NEXT:    ; implicit-def: $vgpr0
 ; GFX90A-NEXT:    s_and_saveexec_b64 s[4:5], vcc
@@ -447,8 +447,8 @@ define float @flat_agent_atomic_fadd_ret_f32__offset12b_neg__amdgpu_no_fine_grai
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    v_add_co_u32_e32 v4, vcc, 0xfffff800, v0
-; GFX90A-NEXT:    v_addc_co_u32_e32 v5, vcc, -1, v1, vcc
 ; GFX90A-NEXT:    s_mov_b64 s[4:5], src_shared_base
+; GFX90A-NEXT:    v_addc_co_u32_e32 v5, vcc, -1, v1, vcc
 ; GFX90A-NEXT:    v_cmp_ne_u32_e32 vcc, s5, v5
 ; GFX90A-NEXT:    ; implicit-def: $vgpr0
 ; GFX90A-NEXT:    s_and_saveexec_b64 s[4:5], vcc
@@ -846,8 +846,8 @@ define void @flat_agent_atomic_fadd_noret_f32__offset12b_pos__amdgpu_no_fine_gra
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    v_add_co_u32_e32 v0, vcc, 0x7fc, v0
-; GFX90A-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
 ; GFX90A-NEXT:    s_mov_b64 s[4:5], src_shared_base
+; GFX90A-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
 ; GFX90A-NEXT:    v_cmp_ne_u32_e32 vcc, s5, v1
 ; GFX90A-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; GFX90A-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
@@ -900,8 +900,8 @@ define void @flat_agent_atomic_fadd_noret_f32__offset12b_pos__amdgpu_no_fine_gra
 ; GFX908:       ; %bb.0:
 ; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX908-NEXT:    v_add_co_u32_e32 v0, vcc, 0x7fc, v0
-; GFX908-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
 ; GFX908-NEXT:    s_mov_b64 s[4:5], src_shared_base
+; GFX908-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
 ; GFX908-NEXT:    v_cmp_ne_u32_e32 vcc, s5, v1
 ; GFX908-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; GFX908-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
@@ -1070,8 +1070,8 @@ define void @flat_agent_atomic_fadd_noret_f32__offset12b_neg__amdgpu_no_fine_gra
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    v_add_co_u32_e32 v0, vcc, 0xfffff800, v0
-; GFX90A-NEXT:    v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
 ; GFX90A-NEXT:    s_mov_b64 s[4:5], src_shared_base
+; GFX90A-NEXT:    v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
 ; GFX90A-NEXT:    v_cmp_ne_u32_e32 vcc, s5, v1
 ; GFX90A-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; GFX90A-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
@@ -1124,8 +1124,8 @@ define void @flat_agent_atomic_fadd_noret_f32__offset12b_neg__amdgpu_no_fine_gra
 ; GFX908:       ; %bb.0:
 ; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX908-NEXT:    v_add_co_u32_e32 v0, vcc, 0xfffff800, v0
-; GFX908-NEXT:    v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
 ; GFX908-NEXT:    s_mov_b64 s[4:5], src_shared_base
+; GFX908-NEXT:    v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
 ; GFX908-NEXT:    v_cmp_ne_u32_e32 vcc, s5, v1
 ; GFX908-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; GFX908-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
@@ -1288,8 +1288,8 @@ define float @flat_system_atomic_fadd_ret_f32__offset12b_pos__amdgpu_no_fine_gra
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    v_add_co_u32_e32 v4, vcc, 0x7fc, v0
-; GFX90A-NEXT:    v_addc_co_u32_e32 v5, vcc, 0, v1, vcc
 ; GFX90A-NEXT:    s_mov_b64 s[4:5], src_shared_base
+; GFX90A-NEXT:    v_addc_co_u32_e32 v5, vcc, 0, v1, vcc
 ; GFX90A-NEXT:    v_cmp_ne_u32_e32 vcc, s5, v5
 ; GFX90A-NEXT:    ; implicit-def: $vgpr0
 ; GFX90A-NEXT:    s_and_saveexec_b64 s[4:5], vcc
@@ -1479,8 +1479,8 @@ define void @flat_system_atomic_fadd_noret_f32__offset12b_pos__amdgpu_no_fine_gr
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    v_add_co_u32_e32 v0, vcc, 0x7fc, v0
-; GFX90A-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
 ; GFX90A-NEXT:    s_mov_b64 s[4:5], src_shared_base
+; GFX90A-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
 ; GFX90A-NEXT:    v_cmp_ne_u32_e32 vcc, s5, v1
 ; GFX90A-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; GFX90A-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
@@ -1535,8 +1535,8 @@ define void @flat_system_atomic_fadd_noret_f32__offset12b_pos__amdgpu_no_fine_gr
 ; GFX908:       ; %bb.0:
 ; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX908-NEXT:    v_add_co_u32_e32 v0, vcc, 0x7fc, v0
-; GFX908-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
 ; GFX908-NEXT:    s_mov_b64 s[4:5], src_shared_base
+; GFX908-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
 ; GFX908-NEXT:    v_cmp_ne_u32_e32 vcc, s5, v1
 ; GFX908-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; GFX908-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
@@ -2016,8 +2016,8 @@ define void @flat_agent_atomic_fadd_noret_f32___amdgpu_no_fine_grained_memory__a
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    v_add_co_u32_e32 v0, vcc, 0x7fc, v0
-; GFX90A-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
 ; GFX90A-NEXT:    s_mov_b64 s[4:5], src_shared_base
+; GFX90A-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
 ; GFX90A-NEXT:    v_cmp_ne_u32_e32 vcc, s5, v1
 ; GFX90A-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; GFX90A-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
@@ -2070,8 +2070,8 @@ define void @flat_agent_atomic_fadd_noret_f32___amdgpu_no_fine_grained_memory__a
 ; GFX908:       ; %bb.0:
 ; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX908-NEXT:    v_add_co_u32_e32 v0, vcc, 0x7fc, v0
-; GFX908-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
 ; GFX908-NEXT:    s_mov_b64 s[4:5], src_shared_base
+; GFX908-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
 ; GFX908-NEXT:    v_cmp_ne_u32_e32 vcc, s5, v1
 ; GFX908-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; GFX908-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
@@ -2579,8 +2579,8 @@ define float @flat_agent_atomic_fadd_ret_f32__offset12b_pos__ftz__amdgpu_no_fine
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    v_add_co_u32_e32 v4, vcc, 0x7fc, v0
-; GFX90A-NEXT:    v_addc_co_u32_e32 v5, vcc, 0, v1, vcc
 ; GFX90A-NEXT:    s_mov_b64 s[4:5], src_shared_base
+; GFX90A-NEXT:    v_addc_co_u32_e32 v5, vcc, 0, v1, vcc
 ; GFX90A-NEXT:    v_cmp_ne_u32_e32 vcc, s5, v5
 ; GFX90A-NEXT:    ; implicit-def: $vgpr0
 ; GFX90A-NEXT:    s_and_saveexec_b64 s[4:5], vcc
@@ -2772,8 +2772,8 @@ define float @flat_agent_atomic_fadd_ret_f32__offset12b_neg__ftz__amdgpu_no_fine
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    v_add_co_u32_e32 v4, vcc, 0xfffff800, v0
-; GFX90A-NEXT:    v_addc_co_u32_e32 v5, vcc, -1, v1, vcc
 ; GFX90A-NEXT:    s_mov_b64 s[4:5], src_shared_base
+; GFX90A-NEXT:    v_addc_co_u32_e32 v5, vcc, -1, v1, vcc
 ; GFX90A-NEXT:    v_cmp_ne_u32_e32 vcc, s5, v5
 ; GFX90A-NEXT:    ; implicit-def: $vgpr0
 ; GFX90A-NEXT:    s_and_saveexec_b64 s[4:5], vcc
@@ -3171,8 +3171,8 @@ define void @flat_agent_atomic_fadd_noret_f32__offset12b_pos__ftz__amdgpu_no_fin
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    v_add_co_u32_e32 v0, vcc, 0x7fc, v0
-; GFX90A-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
 ; GFX90A-NEXT:    s_mov_b64 s[4:5], src_shared_base
+; GFX90A-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
 ; GFX90A-NEXT:    v_cmp_ne_u32_e32 vcc, s5, v1
 ; GFX90A-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; GFX90A-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
@@ -3225,8 +3225,8 @@ define void @flat_agent_atomic_fadd_noret_f32__offset12b_pos__ftz__amdgpu_no_fin
 ; GFX908:       ; %bb.0:
 ; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX908-NEXT:    v_add_co_u32_e32 v0, vcc, 0x7fc, v0
-; GFX908-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
 ; GFX908-NEXT:    s_mov_b64 s[4:5], src_shared_base
+; GFX908-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
 ; GFX908-NEXT:    v_cmp_ne_u32_e32 vcc, s5, v1
 ; GFX908-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; GFX908-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
@@ -3395,8 +3395,8 @@ define void @flat_agent_atomic_fadd_noret_f32__offset12b_neg__ftz__amdgpu_no_fin
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    v_add_co_u32_e32 v0, vcc, 0xfffff800, v0
-; GFX90A-NEXT:    v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
 ; GFX90A-NEXT:    s_mov_b64 s[4:5], src_shared_base
+; GFX90A-NEXT:    v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
 ; GFX90A-NEXT:    v_cmp_ne_u32_e32 vcc, s5, v1
 ; GFX90A-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; GFX90A-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
@@ -3449,8 +3449,8 @@ define void @flat_agent_atomic_fadd_noret_f32__offset12b_neg__ftz__amdgpu_no_fin
 ; GFX908:       ; %bb.0:
 ; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX908-NEXT:    v_add_co_u32_e32 v0, vcc, 0xfffff800, v0
-; GFX908-NEXT:    v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
 ; GFX908-NEXT:    s_mov_b64 s[4:5], src_shared_base
+; GFX908-NEXT:    v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
 ; GFX908-NEXT:    v_cmp_ne_u32_e32 vcc, s5, v1
 ; GFX908-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; GFX908-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
@@ -3613,8 +3613,8 @@ define float @flat_system_atomic_fadd_ret_f32__offset12b_pos__ftz__amdgpu_no_fin
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    v_add_co_u32_e32 v4, vcc, 0x7fc, v0
-; GFX90A-NEXT:    v_addc_co_u32_e32 v5, vcc, 0, v1, vcc
 ; GFX90A-NEXT:    s_mov_b64 s[4:5], src_shared_base
+; GFX90A-NEXT:    v_addc_co_u32_e32 v5, vcc, 0, v1, vcc
 ; GFX90A-NEXT:    v_cmp_ne_u32_e32 vcc, s5, v5
 ; GFX90A-NEXT:    ; implicit-def: $vgpr0
 ; GFX90A-NEXT:    s_and_saveexec_b64 s[4:5], vcc
@@ -3804,8 +3804,8 @@ define void @flat_system_atomic_fadd_noret_f32__offset12b_pos__ftz__amdgpu_no_fi
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    v_add_co_u32_e32 v0, vcc, 0x7fc, v0
-; GFX90A-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
 ; GFX90A-NEXT:    s_mov_b64 s[4:5], src_shared_base
+; GFX90A-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
 ; GFX90A-NEXT:    v_cmp_ne_u32_e32 vcc, s5, v1
 ; GFX90A-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; GFX90A-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
@@ -3860,8 +3860,8 @@ define void @flat_system_atomic_fadd_noret_f32__offset12b_pos__ftz__amdgpu_no_fi
 ; GFX908:       ; %bb.0:
 ; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX908-NEXT:    v_add_co_u32_e32 v0, vcc, 0x7fc, v0
-; GFX908-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
 ; GFX908-NEXT:    s_mov_b64 s[4:5], src_shared_base
+; GFX908-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
 ; GFX908-NEXT:    v_cmp_ne_u32_e32 vcc, s5, v1
 ; GFX908-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; GFX908-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
@@ -4024,8 +4024,8 @@ define float @flat_agent_atomic_fadd_ret_f32__ieee__amdgpu_no_fine_grained_memor
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    v_add_co_u32_e32 v4, vcc, 0x7fc, v0
-; GFX90A-NEXT:    v_addc_co_u32_e32 v5, vcc, 0, v1, vcc
 ; GFX90A-NEXT:    s_mov_b64 s[4:5], src_shared_base
+; GFX90A-NEXT:    v_addc_co_u32_e32 v5, vcc, 0, v1, vcc
 ; GFX90A-NEXT:    v_cmp_ne_u32_e32 vcc, s5, v5
 ; GFX90A-NEXT:    ; implicit-def: $vgpr0
 ; GFX90A-NEXT:    s_and_saveexec_b64 s[4:5], vcc
@@ -4215,8 +4215,8 @@ define void @flat_agent_atomic_fadd_noret_f32__ieee__amdgpu_no_fine_grained_memo
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    v_add_co_u32_e32 v0, vcc, 0x7fc, v0
-; GFX90A-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
 ; GFX90A-NEXT:    s_mov_b64 s[4:5], src_shared_base
+; GFX90A-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
 ; GFX90A-NEXT:    v_cmp_ne_u32_e32 vcc, s5, v1
 ; GFX90A-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; GFX90A-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
@@ -4271,8 +4271,8 @@ define void @flat_agent_atomic_fadd_noret_f32__ieee__amdgpu_no_fine_grained_memo
 ; GFX908:       ; %bb.0:
 ; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX908-NEXT:    v_add_co_u32_e32 v0, vcc, 0x7fc, v0
-; GFX908-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
 ; GFX908-NEXT:    s_mov_b64 s[4:5], src_shared_base
+; GFX908-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
 ; GFX908-NEXT:    v_cmp_ne_u32_e32 vcc, s5, v1
 ; GFX908-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; GFX908-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
@@ -6201,9 +6201,9 @@ define double @flat_agent_atomic_fadd_ret_f64__offset12b_pos__amdgpu_no_fine_gra
 ; GFX942-LABEL: flat_agent_atomic_fadd_ret_f64__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT:    s_mov_b64 s[0:1], 0x7f8
-; GFX942-NEXT:    v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[2:3], 0x7f8
 ; GFX942-NEXT:    s_mov_b64 s[0:1], src_shared_base
+; GFX942-NEXT:    v_lshl_add_u64 v[4:5], v[0:1], 0, s[2:3]
 ; GFX942-NEXT:    v_cmp_ne_u32_e32 vcc, s1, v5
 ; GFX942-NEXT:    ; implicit-def: $vgpr0_vgpr1
 ; GFX942-NEXT:    s_and_saveexec_b64 s[0:1], vcc
@@ -6369,8 +6369,8 @@ define double @flat_agent_atomic_fadd_ret_f64__offset12b_pos__amdgpu_no_fine_gra
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    v_add_co_u32_e32 v4, vcc, 0x7f8, v0
-; GFX90A-NEXT:    v_addc_co_u32_e32 v5, vcc, 0, v1, vcc
 ; GFX90A-NEXT:    s_mov_b64 s[4:5], src_shared_base
+; GFX90A-NEXT:    v_addc_co_u32_e32 v5, vcc, 0, v1, vcc
 ; GFX90A-NEXT:    v_cmp_ne_u32_e32 vcc, s5, v5
 ; GFX90A-NEXT:    ; implicit-def: $vgpr0_vgpr1
 ; GFX90A-NEXT:    s_and_saveexec_b64 s[4:5], vcc
@@ -6427,8 +6427,8 @@ define double @flat_agent_atomic_fadd_ret_f64__offset12b_pos__amdgpu_no_fine_gra
 ; GFX908:       ; %bb.0:
 ; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX908-NEXT:    v_add_co_u32_e32 v4, vcc, 0x7f8, v0
-; GFX908-NEXT:    v_addc_co_u32_e32 v5, vcc, 0, v1, vcc
 ; GFX908-NEXT:    s_mov_b64 s[4:5], src_private_base
+; GFX908-NEXT:    v_addc_co_u32_e32 v5, vcc, 0, v1, vcc
 ; GFX908-NEXT:    v_cmp_ne_u32_e32 vcc, s5, v5
 ; GFX908-NEXT:    ; implicit-def: $vgpr0_vgpr1
 ; GFX908-NEXT:    s_and_saveexec_b64 s[4:5], vcc
@@ -6662,10 +6662,10 @@ define double @flat_agent_atomic_fadd_ret_f64__offset12b_neg__amdgpu_no_fine_gra
 ; GFX942-LABEL: flat_agent_atomic_fadd_ret_f64__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT:    s_movk_i32 s0, 0xf800
-; GFX942-NEXT:    s_mov_b32 s1, -1
-; GFX942-NEXT:    v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1]
+; GFX942-NEXT:    s_movk_i32 s2, 0xf800
+; GFX942-NEXT:    s_mov_b32 s3, -1
 ; GFX942-NEXT:    s_mov_b64 s[0:1], src_shared_base
+; GFX942-NEXT:    v_lshl_add_u64 v[4:5], v[0:1], 0, s[2:3]
 ; GFX942-NEXT:    v_cmp_ne_u32_e32 vcc, s1, v5
 ; GFX942-NEXT:    ; implicit-def: $vgpr0_vgpr1
 ; GFX942-NEXT:    s_and_saveexec_b64 s[0:1], vcc
@@ -6831,8 +6831,8 @@ define double @flat_agent_atomic_fadd_ret_f64__offset12b_neg__amdgpu_no_fine_gra
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    v_add_co_u32_e32 v4, vcc, 0xfffff800, v0
-; GFX90A-NEXT:    v_addc_co_u32_e32 v5, vcc, -1, v1, vcc
 ; GFX90A-NEXT:    s_mov_b64 s[4:5], src_shared_base
+; GFX90A-NEXT:    v_addc_co_u32_e32 v5, vcc, -1, v1, vcc
 ; GFX90A-NEXT:    v_cmp_ne_u32_e32 vcc, s5, v5
 ; GFX90A-NEXT:    ; implicit-def: $vgpr0_vgpr1
 ; GFX90A-NEXT:    s_and_saveexec_b64 s[4:5], vcc
@@ -6889,8 +6889,8 @@ define double @flat_agent_atomic_fadd_ret_f64__offset12b_neg__amdgpu_no_fine_gra
 ; GFX908:       ; %bb.0:
 ; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX908-NEXT:    v_add_co_u32_e32 v4, vcc, 0xfffff800, v0
-; GFX908-NEXT:    v_addc_co_u32_e32 v5, vcc, -1, v1, vcc
 ; GFX908-NEXT:    s_mov_b64 s[4:5], src_private_base
+; GFX908-NEXT:    v_addc_co_u32_e32 v5, vcc, -1, v1, vcc
 ; GFX908-NEXT:    v_cmp_ne_u32_e32 vcc, s5, v5
 ; GFX908-NEXT:    ; implicit-def: $vgpr0_vgpr1
 ; GFX908-NEXT:    s_and_saveexec_b64 s[4:5], vcc
@@ -7552,9 +7552,9 @@ define void @flat_agent_atomic_fadd_noret_f64__offset12b_pos__amdgpu_no_fine_gra
 ; GFX942-LABEL: flat_agent_atomic_fadd_noret_f64__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT:    s_mov_b64 s[0:1], 0x7f8
-; GFX942-NEXT:    v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[2:3], 0x7f8
 ; GFX942-NEXT:    s_mov_b64 s[0:1], src_shared_base
+; GFX942-NEXT:    v_lshl_add_u64 v[0:1], v[0:1], 0, s[2:3]
 ; GFX942-NEXT:    v_cmp_ne_u32_e32 vcc, s1, v1
 ; GFX942-NEXT:    s_and_saveexec_b64 s[0:1], vcc
 ; GFX942-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
@@ -7715,8 +7715,8 @@ define void @flat_agent_atomic_fadd_noret_f64__offset12b_pos__amdgpu_no_fine_gra
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    v_add_co_u32_e32 v0, vcc, 0x7f8, v0
-; GFX90A-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
 ; GFX90A-NEXT:    s_mov_b64 s[4:5], src_shared_base
+; GFX90A-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
 ; GFX90A-NEXT:    v_cmp_ne_u32_e32 vcc, s5, v1
 ; GFX90A-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; GFX90A-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
@@ -7771,8 +7771,8 @@ define void @flat_agent_atomic_fadd_noret_f64__offset12b_pos__amdgpu_no_fine_gra
 ; GFX908:       ; %bb.0:
 ; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX908-NEXT:    v_add_co_u32_e32 v0, vcc, 0x7f8, v0
-; GFX908-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
 ; GFX908-NEXT:    s_mov_b64 s[4:5], src_private_base
+; GFX908-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
 ; GFX908-NEXT:    v_cmp_ne_u32_e32 vcc, s5, v1
 ; GFX908-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; GFX908-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
@@ -8001,10 +8001,10 @@ define void @flat_agent_atomic_fadd_noret_f64__offset12b_neg__amdgpu_no_fine_gra
 ; GFX942-LABEL: flat_agent_atomic_fadd_noret_f64__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT:    s_movk_i32 s0, 0xf800
-; GFX942-NEXT:    s_mov_b32 s1, -1
-; GFX942-NEXT:    v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1]
+; GFX942-NEXT:    s_movk_i32 s2, 0xf800
+; GFX942-NEXT:    s_mov_b32 s3, -1
 ; GFX942-NEXT:    s_mov_b64 s[0:1], src_shared_base
+; GFX942-NEXT:    v_lshl_add_u64 v[0:1], v[0:1], 0, s[2:3]
 ; GFX942-NEXT:    v_cmp_ne_u32_e32 vcc, s1, v1
 ; GFX942-NEXT:    s_and_saveexec_b64 s[0:1], vcc
 ; GFX942-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
@@ -8165,8 +8165,8 @@ define void @flat_agent_atomic_fadd_noret_f64__offset12b_neg__amdgpu_no_fine_gra
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    v_add_co_u32_e32 v0, vcc, 0xfffff800, v0
-; GFX90A-NEXT:    v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
 ; GFX90A-NEXT:    s_mov_b64 s[4:5], src_shared_base
+; GFX90A-NEXT:    v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
 ; GFX90A-NEXT:    v_cmp_ne_u32_e32 vcc, s5, v1
 ; GFX90A-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; GFX90A-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
@@ -8221,8 +8221,8 @@ define void @flat_agent_atomic_fadd_noret_f64__offset12b_neg__amdgpu_no_fine_gra
 ; GFX908:       ; %bb.0:
 ; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX908-NEXT:    v_add_co_u32_e32 v0, vcc, 0xfffff800, v0
-; GFX908-NEXT:    v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
 ; GFX908-NEXT:    s_mov_b64 s[4:5], src_private_base
+; GFX908-NEXT:    v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
 ; GFX908-NEXT:    v_cmp_ne_u32_e32 vcc, s5, v1
 ; GFX908-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; GFX908-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]

diff --git a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmax.ll b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmax.ll
index b8a2476dc19b4..59b0537b817d2 100644
--- a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmax.ll
+++ b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmax.ll
@@ -3255,9 +3255,9 @@ define double @flat_agent_atomic_fmax_ret_f64__offset12b_pos__amdgpu_no_fine_gra
 ; GFX942-LABEL: flat_agent_atomic_fmax_ret_f64__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT:    s_mov_b64 s[0:1], 0x7f8
-; GFX942-NEXT:    v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[2:3], 0x7f8
 ; GFX942-NEXT:    s_mov_b64 s[0:1], src_private_base
+; GFX942-NEXT:    v_lshl_add_u64 v[4:5], v[0:1], 0, s[2:3]
 ; GFX942-NEXT:    v_cmp_ne_u32_e32 vcc, s1, v5
 ; GFX942-NEXT:    ; implicit-def: $vgpr0_vgpr1
 ; GFX942-NEXT:    s_and_saveexec_b64 s[0:1], vcc
@@ -3397,8 +3397,8 @@ define double @flat_agent_atomic_fmax_ret_f64__offset12b_pos__amdgpu_no_fine_gra
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    v_add_co_u32_e32 v4, vcc, 0x7f8, v0
-; GFX90A-NEXT:    v_addc_co_u32_e32 v5, vcc, 0, v1, vcc
 ; GFX90A-NEXT:    s_mov_b64 s[4:5], src_private_base
+; GFX90A-NEXT:    v_addc_co_u32_e32 v5, vcc, 0, v1, vcc
 ; GFX90A-NEXT:    v_cmp_ne_u32_e32 vcc, s5, v5
 ; GFX90A-NEXT:    ; implicit-def: $vgpr0_vgpr1
 ; GFX90A-NEXT:    s_and_saveexec_b64 s[4:5], vcc
@@ -3438,8 +3438,8 @@ define double @flat_agent_atomic_fmax_ret_f64__offset12b_pos__amdgpu_no_fine_gra
 ; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX908-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
 ; GFX908-NEXT:    v_add_co_u32_e32 v4, vcc, 0x7f8, v0
-; GFX908-NEXT:    v_addc_co_u32_e32 v5, vcc, 0, v1, vcc
 ; GFX908-NEXT:    s_mov_b64 s[4:5], src_private_base
+; GFX908-NEXT:    v_addc_co_u32_e32 v5, vcc, 0, v1, vcc
 ; GFX908-NEXT:    v_cmp_ne_u32_e32 vcc, s5, v5
 ; GFX908-NEXT:    ; implicit-def: $vgpr0_vgpr1
 ; GFX908-NEXT:    s_and_saveexec_b64 s[4:5], vcc
@@ -3667,10 +3667,10 @@ define double @flat_agent_atomic_fmax_ret_f64__offset12b_neg__amdgpu_no_fine_gra
 ; GFX942-LABEL: flat_agent_atomic_fmax_ret_f64__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT:    s_movk_i32 s0, 0xf800
-; GFX942-NEXT:    s_mov_b32 s1, -1
-; GFX942-NEXT:    v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1]
+; GFX942-NEXT:    s_movk_i32 s2, 0xf800
+; GFX942-NEXT:    s_mov_b32 s3, -1
 ; GFX942-NEXT:    s_mov_b64 s[0:1], src_private_base
+; GFX942-NEXT:    v_lshl_add_u64 v[4:5], v[0:1], 0, s[2:3]
 ; GFX942-NEXT:    v_cmp_ne_u32_e32 vcc, s1, v5
 ; GFX942-NEXT:    ; implicit-def: $vgpr0_vgpr1
 ; GFX942-NEXT:    s_and_saveexec_b64 s[0:1], vcc
@@ -3810,8 +3810,8 @@ define double @flat_agent_atomic_fmax_ret_f64__offset12b_neg__amdgpu_no_fine_gra
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    v_add_co_u32_e32 v4, vcc, 0xfffff800, v0
-; GFX90A-NEXT:    v_addc_co_u32_e32 v5, vcc, -1, v1, vcc
 ; GFX90A-NEXT:    s_mov_b64 s[4:5], src_private_base
+; GFX90A-NEXT:    v_addc_co_u32_e32 v5, vcc, -1, v1, vcc
 ; GFX90A-NEXT:    v_cmp_ne_u32_e32 vcc, s5, v5
 ; GFX90A-NEXT:    ; implicit-def: $vgpr0_vgpr1
 ; GFX90A-NEXT:    s_and_saveexec_b64 s[4:5], vcc
@@ -3851,8 +3851,8 @@ define double @flat_agent_atomic_fmax_ret_f64__offset12b_neg__amdgpu_no_fine_gra
 ; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX908-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
 ; GFX908-NEXT:    v_add_co_u32_e32 v4, vcc, 0xfffff800, v0
-; GFX908-NEXT:    v_addc_co_u32_e32 v5, vcc, -1, v1, vcc
 ; GFX908-NEXT:    s_mov_b64 s[4:5], src_private_base
+; GFX908-NEXT:    v_addc_co_u32_e32 v5, vcc, -1, v1, vcc
 ; GFX908-NEXT:    v_cmp_ne_u32_e32 vcc, s5, v5
 ; GFX908-NEXT:    ; implicit-def: $vgpr0_vgpr1
 ; GFX908-NEXT:    s_and_saveexec_b64 s[4:5], vcc
@@ -4465,9 +4465,9 @@ define void @flat_agent_atomic_fmax_noret_f64__offset12b_pos__amdgpu_no_fine_gra
 ; GFX942-LABEL: flat_agent_atomic_fmax_noret_f64__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT:    s_mov_b64 s[0:1], 0x7f8
-; GFX942-NEXT:    v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[2:3], 0x7f8
 ; GFX942-NEXT:    s_mov_b64 s[0:1], src_private_base
+; GFX942-NEXT:    v_lshl_add_u64 v[0:1], v[0:1], 0, s[2:3]
 ; GFX942-NEXT:    v_cmp_ne_u32_e32 vcc, s1, v1
 ; GFX942-NEXT:    s_and_saveexec_b64 s[0:1], vcc
 ; GFX942-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
@@ -4605,8 +4605,8 @@ define void @flat_agent_atomic_fmax_noret_f64__offset12b_pos__amdgpu_no_fine_gra
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    v_add_co_u32_e32 v0, vcc, 0x7f8, v0
-; GFX90A-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
 ; GFX90A-NEXT:    s_mov_b64 s[4:5], src_private_base
+; GFX90A-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
 ; GFX90A-NEXT:    v_cmp_ne_u32_e32 vcc, s5, v1
 ; GFX90A-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; GFX90A-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
@@ -4645,8 +4645,8 @@ define void @flat_agent_atomic_fmax_noret_f64__offset12b_pos__amdgpu_no_fine_gra
 ; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX908-NEXT:    v_max_f64 v[4:5], v[2:3], v[2:3]
 ; GFX908-NEXT:    v_add_co_u32_e32 v6, vcc, 0x7f8, v0
-; GFX908-NEXT:    v_addc_co_u32_e32 v7, vcc, 0, v1, vcc
 ; GFX908-NEXT:    s_mov_b64 s[4:5], src_private_base
+; GFX908-NEXT:    v_addc_co_u32_e32 v7, vcc, 0, v1, vcc
 ; GFX908-NEXT:    v_cmp_ne_u32_e32 vcc, s5, v7
 ; GFX908-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; GFX908-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
@@ -4870,10 +4870,10 @@ define void @flat_agent_atomic_fmax_noret_f64__offset12b_neg__amdgpu_no_fine_gra
 ; GFX942-LABEL: flat_agent_atomic_fmax_noret_f64__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT:    s_movk_i32 s0, 0xf800
-; GFX942-NEXT:    s_mov_b32 s1, -1
-; GFX942-NEXT:    v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1]
+; GFX942-NEXT:    s_movk_i32 s2, 0xf800
+; GFX942-NEXT:    s_mov_b32 s3, -1
 ; GFX942-NEXT:    s_mov_b64 s[0:1], src_private_base
+; GFX942-NEXT:    v_lshl_add_u64 v[0:1], v[0:1], 0, s[2:3]
 ; GFX942-NEXT:    v_cmp_ne_u32_e32 vcc, s1, v1
 ; GFX942-NEXT:    s_and_saveexec_b64 s[0:1], vcc
 ; GFX942-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
@@ -5011,8 +5011,8 @@ define void @flat_agent_atomic_fmax_noret_f64__offset12b_neg__amdgpu_no_fine_gra
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    v_add_co_u32_e32 v0, vcc, 0xfffff800, v0
-; GFX90A-NEXT:    v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
 ; GFX90A-NEXT:    s_mov_b64 s[4:5], src_private_base
+; GFX90A-NEXT:    v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
 ; GFX90A-NEXT:    v_cmp_ne_u32_e32 vcc, s5, v1
 ; GFX90A-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; GFX90A-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
@@ -5051,8 +5051,8 @@ define void @flat_agent_atomic_fmax_noret_f64__offset12b_neg__amdgpu_no_fine_gra
 ; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX908-NEXT:    v_max_f64 v[4:5], v[2:3], v[2:3]
 ; GFX908-NEXT:    v_add_co_u32_e32 v6, vcc, 0xfffff800, v0
-; GFX908-NEXT:    v_addc_co_u32_e32 v7, vcc, -1, v1, vcc
 ; GFX908-NEXT:    s_mov_b64 s[4:5], src_private_base
+; GFX908-NEXT:    v_addc_co_u32_e32 v7, vcc, -1, v1, vcc
 ; GFX908-NEXT:    v_cmp_ne_u32_e32 vcc, s5, v7
 ; GFX908-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; GFX908-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]

diff --git a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmin.ll b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmin.ll
index 9830e48a86f06..c9c9f332fe391 100644
--- a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmin.ll
+++ b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmin.ll
@@ -3255,9 +3255,9 @@ define double @flat_agent_atomic_fmin_ret_f64__offset12b_pos__amdgpu_no_fine_gra
 ; GFX942-LABEL: flat_agent_atomic_fmin_ret_f64__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT:    s_mov_b64 s[0:1], 0x7f8
-; GFX942-NEXT:    v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[2:3], 0x7f8
 ; GFX942-NEXT:    s_mov_b64 s[0:1], src_private_base
+; GFX942-NEXT:    v_lshl_add_u64 v[4:5], v[0:1], 0, s[2:3]
 ; GFX942-NEXT:    v_cmp_ne_u32_e32 vcc, s1, v5
 ; GFX942-NEXT:    ; implicit-def: $vgpr0_vgpr1
 ; GFX942-NEXT:    s_and_saveexec_b64 s[0:1], vcc
@@ -3397,8 +3397,8 @@ define double @flat_agent_atomic_fmin_ret_f64__offset12b_pos__amdgpu_no_fine_gra
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    v_add_co_u32_e32 v4, vcc, 0x7f8, v0
-; GFX90A-NEXT:    v_addc_co_u32_e32 v5, vcc, 0, v1, vcc
 ; GFX90A-NEXT:    s_mov_b64 s[4:5], src_private_base
+; GFX90A-NEXT:    v_addc_co_u32_e32 v5, vcc, 0, v1, vcc
 ; GFX90A-NEXT:    v_cmp_ne_u32_e32 vcc, s5, v5
 ; GFX90A-NEXT:    ; implicit-def: $vgpr0_vgpr1
 ; GFX90A-NEXT:    s_and_saveexec_b64 s[4:5], vcc
@@ -3438,8 +3438,8 @@ define double @flat_agent_atomic_fmin_ret_f64__offset12b_pos__amdgpu_no_fine_gra
 ; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX908-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
 ; GFX908-NEXT:    v_add_co_u32_e32 v4, vcc, 0x7f8, v0
-; GFX908-NEXT:    v_addc_co_u32_e32 v5, vcc, 0, v1, vcc
 ; GFX908-NEXT:    s_mov_b64 s[4:5], src_private_base
+; GFX908-NEXT:    v_addc_co_u32_e32 v5, vcc, 0, v1, vcc
 ; GFX908-NEXT:    v_cmp_ne_u32_e32 vcc, s5, v5
 ; GFX908-NEXT:    ; implicit-def: $vgpr0_vgpr1
 ; GFX908-NEXT:    s_and_saveexec_b64 s[4:5], vcc
@@ -3667,10 +3667,10 @@ define double @flat_agent_atomic_fmin_ret_f64__offset12b_neg__amdgpu_no_fine_gra
 ; GFX942-LABEL: flat_agent_atomic_fmin_ret_f64__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT:    s_movk_i32 s0, 0xf800
-; GFX942-NEXT:    s_mov_b32 s1, -1
-; GFX942-NEXT:    v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1]
+; GFX942-NEXT:    s_movk_i32 s2, 0xf800
+; GFX942-NEXT:    s_mov_b32 s3, -1
 ; GFX942-NEXT:    s_mov_b64 s[0:1], src_private_base
+; GFX942-NEXT:    v_lshl_add_u64 v[4:5], v[0:1], 0, s[2:3]
 ; GFX942-NEXT:    v_cmp_ne_u32_e32 vcc, s1, v5
 ; GFX942-NEXT:    ; implicit-def: $vgpr0_vgpr1
 ; GFX942-NEXT:    s_and_saveexec_b64 s[0:1], vcc
@@ -3810,8 +3810,8 @@ define double @flat_agent_atomic_fmin_ret_f64__offset12b_neg__amdgpu_no_fine_gra
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    v_add_co_u32_e32 v4, vcc, 0xfffff800, v0
-; GFX90A-NEXT:    v_addc_co_u32_e32 v5, vcc, -1, v1, vcc
 ; GFX90A-NEXT:    s_mov_b64 s[4:5], src_private_base
+; GFX90A-NEXT:    v_addc_co_u32_e32 v5, vcc, -1, v1, vcc
 ; GFX90A-NEXT:    v_cmp_ne_u32_e32 vcc, s5, v5
 ; GFX90A-NEXT:    ; implicit-def: $vgpr0_vgpr1
 ; GFX90A-NEXT:    s_and_saveexec_b64 s[4:5], vcc
@@ -3851,8 +3851,8 @@ define double @flat_agent_atomic_fmin_ret_f64__offset12b_neg__amdgpu_no_fine_gra
 ; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX908-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
 ; GFX908-NEXT:    v_add_co_u32_e32 v4, vcc, 0xfffff800, v0
-; GFX908-NEXT:    v_addc_co_u32_e32 v5, vcc, -1, v1, vcc
 ; GFX908-NEXT:    s_mov_b64 s[4:5], src_private_base
+; GFX908-NEXT:    v_addc_co_u32_e32 v5, vcc, -1, v1, vcc
 ; GFX908-NEXT:    v_cmp_ne_u32_e32 vcc, s5, v5
 ; GFX908-NEXT:    ; implicit-def: $vgpr0_vgpr1
 ; GFX908-NEXT:    s_and_saveexec_b64 s[4:5], vcc
@@ -4465,9 +4465,9 @@ define void @flat_agent_atomic_fmin_noret_f64__offset12b_pos__amdgpu_no_fine_gra
 ; GFX942-LABEL: flat_agent_atomic_fmin_noret_f64__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT:    s_mov_b64 s[0:1], 0x7f8
-; GFX942-NEXT:    v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[2:3], 0x7f8
 ; GFX942-NEXT:    s_mov_b64 s[0:1], src_private_base
+; GFX942-NEXT:    v_lshl_add_u64 v[0:1], v[0:1], 0, s[2:3]
 ; GFX942-NEXT:    v_cmp_ne_u32_e32 vcc, s1, v1
 ; GFX942-NEXT:    s_and_saveexec_b64 s[0:1], vcc
 ; GFX942-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
@@ -4605,8 +4605,8 @@ define void @flat_agent_atomic_fmin_noret_f64__offset12b_pos__amdgpu_no_fine_gra
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    v_add_co_u32_e32 v0, vcc, 0x7f8, v0
-; GFX90A-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
 ; GFX90A-NEXT:    s_mov_b64 s[4:5], src_private_base
+; GFX90A-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
 ; GFX90A-NEXT:    v_cmp_ne_u32_e32 vcc, s5, v1
 ; GFX90A-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; GFX90A-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
@@ -4645,8 +4645,8 @@ define void @flat_agent_atomic_fmin_noret_f64__offset12b_pos__amdgpu_no_fine_gra
 ; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX908-NEXT:    v_max_f64 v[4:5], v[2:3], v[2:3]
 ; GFX908-NEXT:    v_add_co_u32_e32 v6, vcc, 0x7f8, v0
-; GFX908-NEXT:    v_addc_co_u32_e32 v7, vcc, 0, v1, vcc
 ; GFX908-NEXT:    s_mov_b64 s[4:5], src_private_base
+; GFX908-NEXT:    v_addc_co_u32_e32 v7, vcc, 0, v1, vcc
 ; GFX908-NEXT:    v_cmp_ne_u32_e32 vcc, s5, v7
 ; GFX908-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; GFX908-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
@@ -4870,10 +4870,10 @@ define void @flat_agent_atomic_fmin_noret_f64__offset12b_neg__amdgpu_no_fine_gra
 ; GFX942-LABEL: flat_agent_atomic_fmin_noret_f64__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT:    s_movk_i32 s0, 0xf800
-; GFX942-NEXT:    s_mov_b32 s1, -1
-; GFX942-NEXT:    v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1]
+; GFX942-NEXT:    s_movk_i32 s2, 0xf800
+; GFX942-NEXT:    s_mov_b32 s3, -1
 ; GFX942-NEXT:    s_mov_b64 s[0:1], src_private_base
+; GFX942-NEXT:    v_lshl_add_u64 v[0:1], v[0:1], 0, s[2:3]
 ; GFX942-NEXT:    v_cmp_ne_u32_e32 vcc, s1, v1
 ; GFX942-NEXT:    s_and_saveexec_b64 s[0:1], vcc
 ; GFX942-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
@@ -5011,8 +5011,8 @@ define void @flat_agent_atomic_fmin_noret_f64__offset12b_neg__amdgpu_no_fine_gra
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    v_add_co_u32_e32 v0, vcc, 0xfffff800, v0
-; GFX90A-NEXT:    v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
 ; GFX90A-NEXT:    s_mov_b64 s[4:5], src_private_base
+; GFX90A-NEXT:    v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
 ; GFX90A-NEXT:    v_cmp_ne_u32_e32 vcc, s5, v1
 ; GFX90A-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; GFX90A-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
@@ -5051,8 +5051,8 @@ define void @flat_agent_atomic_fmin_noret_f64__offset12b_neg__amdgpu_no_fine_gra
 ; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX908-NEXT:    v_max_f64 v[4:5], v[2:3], v[2:3]
 ; GFX908-NEXT:    v_add_co_u32_e32 v6, vcc, 0xfffff800, v0
-; GFX908-NEXT:    v_addc_co_u32_e32 v7, vcc, -1, v1, vcc
 ; GFX908-NEXT:    s_mov_b64 s[4:5], src_private_base
+; GFX908-NEXT:    v_addc_co_u32_e32 v7, vcc, -1, v1, vcc
 ; GFX908-NEXT:    v_cmp_ne_u32_e32 vcc, s5, v7
 ; GFX908-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; GFX908-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]

diff --git a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fsub.ll b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fsub.ll
index 2c1970220c374..587c2ea885077 100644
--- a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fsub.ll
+++ b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fsub.ll
@@ -3711,9 +3711,9 @@ define double @flat_agent_atomic_fsub_ret_f64__offset12b_pos(ptr %ptr, double %v
 ; GFX942-LABEL: flat_agent_atomic_fsub_ret_f64__offset12b_pos:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT:    s_mov_b64 s[0:1], 0x7f8
-; GFX942-NEXT:    v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[2:3], 0x7f8
 ; GFX942-NEXT:    s_mov_b64 s[0:1], src_private_base
+; GFX942-NEXT:    v_lshl_add_u64 v[4:5], v[0:1], 0, s[2:3]
 ; GFX942-NEXT:    v_cmp_ne_u32_e32 vcc, s1, v5
 ; GFX942-NEXT:    ; implicit-def: $vgpr0_vgpr1
 ; GFX942-NEXT:    s_and_saveexec_b64 s[0:1], vcc
@@ -3871,8 +3871,8 @@ define double @flat_agent_atomic_fsub_ret_f64__offset12b_pos(ptr %ptr, double %v
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    v_add_co_u32_e32 v4, vcc, 0x7f8, v0
-; GFX90A-NEXT:    v_addc_co_u32_e32 v5, vcc, 0, v1, vcc
 ; GFX90A-NEXT:    s_mov_b64 s[4:5], src_private_base
+; GFX90A-NEXT:    v_addc_co_u32_e32 v5, vcc, 0, v1, vcc
 ; GFX90A-NEXT:    v_cmp_ne_u32_e32 vcc, s5, v5
 ; GFX90A-NEXT:    ; implicit-def: $vgpr0_vgpr1
 ; GFX90A-NEXT:    s_and_saveexec_b64 s[4:5], vcc
@@ -3922,8 +3922,8 @@ define double @flat_agent_atomic_fsub_ret_f64__offset12b_pos(ptr %ptr, double %v
 ; GFX908:       ; %bb.0:
 ; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX908-NEXT:    v_add_co_u32_e32 v4, vcc, 0x7f8, v0
-; GFX908-NEXT:    v_addc_co_u32_e32 v5, vcc, 0, v1, vcc
 ; GFX908-NEXT:    s_mov_b64 s[4:5], src_private_base
+; GFX908-NEXT:    v_addc_co_u32_e32 v5, vcc, 0, v1, vcc
 ; GFX908-NEXT:    v_cmp_ne_u32_e32 vcc, s5, v5
 ; GFX908-NEXT:    ; implicit-def: $vgpr0_vgpr1
 ; GFX908-NEXT:    s_and_saveexec_b64 s[4:5], vcc
@@ -4157,10 +4157,10 @@ define double @flat_agent_atomic_fsub_ret_f64__offset12b_neg(ptr %ptr, double %v
 ; GFX942-LABEL: flat_agent_atomic_fsub_ret_f64__offset12b_neg:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT:    s_movk_i32 s0, 0xf800
-; GFX942-NEXT:    s_mov_b32 s1, -1
-; GFX942-NEXT:    v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1]
+; GFX942-NEXT:    s_movk_i32 s2, 0xf800
+; GFX942-NEXT:    s_mov_b32 s3, -1
 ; GFX942-NEXT:    s_mov_b64 s[0:1], src_private_base
+; GFX942-NEXT:    v_lshl_add_u64 v[4:5], v[0:1], 0, s[2:3]
 ; GFX942-NEXT:    v_cmp_ne_u32_e32 vcc, s1, v5
 ; GFX942-NEXT:    ; implicit-def: $vgpr0_vgpr1
 ; GFX942-NEXT:    s_and_saveexec_b64 s[0:1], vcc
@@ -4318,8 +4318,8 @@ define double @flat_agent_atomic_fsub_ret_f64__offset12b_neg(ptr %ptr, double %v
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    v_add_co_u32_e32 v4, vcc, 0xfffff800, v0
-; GFX90A-NEXT:    v_addc_co_u32_e32 v5, vcc, -1, v1, vcc
 ; GFX90A-NEXT:    s_mov_b64 s[4:5], src_private_base
+; GFX90A-NEXT:    v_addc_co_u32_e32 v5, vcc, -1, v1, vcc
 ; GFX90A-NEXT:    v_cmp_ne_u32_e32 vcc, s5, v5
 ; GFX90A-NEXT:    ; implicit-def: $vgpr0_vgpr1
 ; GFX90A-NEXT:    s_and_saveexec_b64 s[4:5], vcc
@@ -4369,8 +4369,8 @@ define double @flat_agent_atomic_fsub_ret_f64__offset12b_neg(ptr %ptr, double %v
 ; GFX908:       ; %bb.0:
 ; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX908-NEXT:    v_add_co_u32_e32 v4, vcc, 0xfffff800, v0
-; GFX908-NEXT:    v_addc_co_u32_e32 v5, vcc, -1, v1, vcc
 ; GFX908-NEXT:    s_mov_b64 s[4:5], src_private_base
+; GFX908-NEXT:    v_addc_co_u32_e32 v5, vcc, -1, v1, vcc
 ; GFX908-NEXT:    v_cmp_ne_u32_e32 vcc, s5, v5
 ; GFX908-NEXT:    ; implicit-def: $vgpr0_vgpr1
 ; GFX908-NEXT:    s_and_saveexec_b64 s[4:5], vcc
@@ -5019,9 +5019,9 @@ define void @flat_agent_atomic_fsub_noret_f64__offset12b_pos(ptr %ptr, double %v
 ; GFX942-LABEL: flat_agent_atomic_fsub_noret_f64__offset12b_pos:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT:    s_mov_b64 s[0:1], 0x7f8
-; GFX942-NEXT:    v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[2:3], 0x7f8
 ; GFX942-NEXT:    s_mov_b64 s[0:1], src_private_base
+; GFX942-NEXT:    v_lshl_add_u64 v[0:1], v[0:1], 0, s[2:3]
 ; GFX942-NEXT:    v_cmp_ne_u32_e32 vcc, s1, v1
 ; GFX942-NEXT:    s_and_saveexec_b64 s[0:1], vcc
 ; GFX942-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
@@ -5175,8 +5175,8 @@ define void @flat_agent_atomic_fsub_noret_f64__offset12b_pos(ptr %ptr, double %v
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    v_add_co_u32_e32 v0, vcc, 0x7f8, v0
-; GFX90A-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
 ; GFX90A-NEXT:    s_mov_b64 s[4:5], src_private_base
+; GFX90A-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
 ; GFX90A-NEXT:    v_cmp_ne_u32_e32 vcc, s5, v1
 ; GFX90A-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; GFX90A-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
@@ -5225,8 +5225,8 @@ define void @flat_agent_atomic_fsub_noret_f64__offset12b_pos(ptr %ptr, double %v
 ; GFX908:       ; %bb.0:
 ; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX908-NEXT:    v_add_co_u32_e32 v0, vcc, 0x7f8, v0
-; GFX908-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
 ; GFX908-NEXT:    s_mov_b64 s[4:5], src_private_base
+; GFX908-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
 ; GFX908-NEXT:    v_cmp_ne_u32_e32 vcc, s5, v1
 ; GFX908-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; GFX908-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
@@ -5455,10 +5455,10 @@ define void @flat_agent_atomic_fsub_noret_f64__offset12b_neg(ptr %ptr, double %v
 ; GFX942-LABEL: flat_agent_atomic_fsub_noret_f64__offset12b_neg:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT:    s_movk_i32 s0, 0xf800
-; GFX942-NEXT:    s_mov_b32 s1, -1
-; GFX942-NEXT:    v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1]
+; GFX942-NEXT:    s_movk_i32 s2, 0xf800
+; GFX942-NEXT:    s_mov_b32 s3, -1
 ; GFX942-NEXT:    s_mov_b64 s[0:1], src_private_base
+; GFX942-NEXT:    v_lshl_add_u64 v[0:1], v[0:1], 0, s[2:3]
 ; GFX942-NEXT:    v_cmp_ne_u32_e32 vcc, s1, v1
 ; GFX942-NEXT:    s_and_saveexec_b64 s[0:1], vcc
 ; GFX942-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
@@ -5612,8 +5612,8 @@ define void @flat_agent_atomic_fsub_noret_f64__offset12b_neg(ptr %ptr, double %v
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    v_add_co_u32_e32 v0, vcc, 0xfffff800, v0
-; GFX90A-NEXT:    v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
 ; GFX90A-NEXT:    s_mov_b64 s[4:5], src_private_base
+; GFX90A-NEXT:    v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
 ; GFX90A-NEXT:    v_cmp_ne_u32_e32 vcc, s5, v1
 ; GFX90A-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; GFX90A-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
@@ -5662,8 +5662,8 @@ define void @flat_agent_atomic_fsub_noret_f64__offset12b_neg(ptr %ptr, double %v
 ; GFX908:       ; %bb.0:
 ; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX908-NEXT:    v_add_co_u32_e32 v0, vcc, 0xfffff800, v0
-; GFX908-NEXT:    v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
 ; GFX908-NEXT:    s_mov_b64 s[4:5], src_private_base
+; GFX908-NEXT:    v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
 ; GFX908-NEXT:    v_cmp_ne_u32_e32 vcc, s5, v1
 ; GFX908-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; GFX908-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]

diff --git a/llvm/test/CodeGen/AMDGPU/flat-saddr-atomics.ll b/llvm/test/CodeGen/AMDGPU/flat-saddr-atomics.ll
index f8c2ddf0d7d3c..082050877e0bb 100644
--- a/llvm/test/CodeGen/AMDGPU/flat-saddr-atomics.ll
+++ b/llvm/test/CodeGen/AMDGPU/flat-saddr-atomics.ll
@@ -637,8 +637,8 @@ define amdgpu_ps <2 x float> @flat_xchg_saddr_i64_rtn(ptr inreg %sbase, i32 %vof
 ; GFX950-SDAG-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX950-SDAG-NEXT:    v_mov_b32_e32 v2, v1
 ; GFX950-SDAG-NEXT:    v_mov_b32_e32 v1, 0
-; GFX950-SDAG-NEXT:    v_lshl_add_u64 v[4:5], s[2:3], 0, v[0:1]
 ; GFX950-SDAG-NEXT:    s_mov_b64 s[0:1], src_private_base
+; GFX950-SDAG-NEXT:    v_lshl_add_u64 v[4:5], s[2:3], 0, v[0:1]
 ; GFX950-SDAG-NEXT:    v_cmp_ne_u32_e32 vcc, s1, v5
 ; GFX950-SDAG-NEXT:    ; implicit-def: $vgpr0_vgpr1
 ; GFX950-SDAG-NEXT:    s_and_saveexec_b64 s[0:1], vcc
@@ -824,11 +824,11 @@ define amdgpu_ps <2 x float> @flat_xchg_saddr_i64_rtn_neg128(ptr inreg %sbase, i
 ; GFX950-SDAG-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX950-SDAG-NEXT:    v_mov_b32_e32 v2, v1
 ; GFX950-SDAG-NEXT:    v_mov_b32_e32 v1, 0
-; GFX950-SDAG-NEXT:    s_movk_i32 s0, 0xff80
 ; GFX950-SDAG-NEXT:    v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1]
-; GFX950-SDAG-NEXT:    s_mov_b32 s1, -1
-; GFX950-SDAG-NEXT:    v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1]
+; GFX950-SDAG-NEXT:    s_movk_i32 s2, 0xff80
+; GFX950-SDAG-NEXT:    s_mov_b32 s3, -1
 ; GFX950-SDAG-NEXT:    s_mov_b64 s[0:1], src_private_base
+; GFX950-SDAG-NEXT:    v_lshl_add_u64 v[4:5], v[0:1], 0, s[2:3]
 ; GFX950-SDAG-NEXT:    v_cmp_ne_u32_e32 vcc, s1, v5
 ; GFX950-SDAG-NEXT:    ; implicit-def: $vgpr0_vgpr1
 ; GFX950-SDAG-NEXT:    s_and_saveexec_b64 s[0:1], vcc
@@ -995,8 +995,8 @@ define amdgpu_ps void @flat_xchg_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset,
 ; GFX950-SDAG-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX950-SDAG-NEXT:    v_mov_b32_e32 v2, v1
 ; GFX950-SDAG-NEXT:    v_mov_b32_e32 v1, 0
-; GFX950-SDAG-NEXT:    v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1]
 ; GFX950-SDAG-NEXT:    s_mov_b64 s[0:1], src_private_base
+; GFX950-SDAG-NEXT:    v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1]
 ; GFX950-SDAG-NEXT:    v_cmp_ne_u32_e32 vcc, s1, v1
 ; GFX950-SDAG-NEXT:    s_and_saveexec_b64 s[0:1], vcc
 ; GFX950-SDAG-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
@@ -1028,9 +1028,8 @@ define amdgpu_ps void @flat_xchg_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset,
 ; GFX950-GISEL-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX950-GISEL-NEXT:    v_add_co_u32_e32 v0, vcc, v2, v0
 ; GFX950-GISEL-NEXT:    v_mov_b32_e32 v4, v1
-; GFX950-GISEL-NEXT:    s_nop 0
-; GFX950-GISEL-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v3, vcc
 ; GFX950-GISEL-NEXT:    s_mov_b64 s[0:1], src_private_base
+; GFX950-GISEL-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v3, vcc
 ; GFX950-GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, s1, v1
 ; GFX950-GISEL-NEXT:    s_and_saveexec_b64 s[0:1], vcc
 ; GFX950-GISEL-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
@@ -1150,11 +1149,11 @@ define amdgpu_ps void @flat_xchg_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %v
 ; GFX950-SDAG-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX950-SDAG-NEXT:    v_mov_b32_e32 v2, v1
 ; GFX950-SDAG-NEXT:    v_mov_b32_e32 v1, 0
-; GFX950-SDAG-NEXT:    s_movk_i32 s0, 0xff80
 ; GFX950-SDAG-NEXT:    v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1]
-; GFX950-SDAG-NEXT:    s_mov_b32 s1, -1
-; GFX950-SDAG-NEXT:    v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1]
+; GFX950-SDAG-NEXT:    s_movk_i32 s2, 0xff80
+; GFX950-SDAG-NEXT:    s_mov_b32 s3, -1
 ; GFX950-SDAG-NEXT:    s_mov_b64 s[0:1], src_private_base
+; GFX950-SDAG-NEXT:    v_lshl_add_u64 v[0:1], v[0:1], 0, s[2:3]
 ; GFX950-SDAG-NEXT:    v_cmp_ne_u32_e32 vcc, s1, v1
 ; GFX950-SDAG-NEXT:    s_and_saveexec_b64 s[0:1], vcc
 ; GFX950-SDAG-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
@@ -1497,8 +1496,8 @@ define amdgpu_ps <2 x float> @flat_add_saddr_i64_rtn(ptr inreg %sbase, i32 %voff
 ; GFX950-SDAG-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX950-SDAG-NEXT:    v_mov_b32_e32 v2, v1
 ; GFX950-SDAG-NEXT:    v_mov_b32_e32 v1, 0
-; GFX950-SDAG-NEXT:    v_lshl_add_u64 v[4:5], s[2:3], 0, v[0:1]
 ; GFX950-SDAG-NEXT:    s_mov_b64 s[0:1], src_private_base
+; GFX950-SDAG-NEXT:    v_lshl_add_u64 v[4:5], s[2:3], 0, v[0:1]
 ; GFX950-SDAG-NEXT:    v_cmp_ne_u32_e32 vcc, s1, v5
 ; GFX950-SDAG-NEXT:    ; implicit-def: $vgpr0_vgpr1
 ; GFX950-SDAG-NEXT:    s_and_saveexec_b64 s[0:1], vcc
@@ -1688,11 +1687,11 @@ define amdgpu_ps <2 x float> @flat_add_saddr_i64_rtn_neg128(ptr inreg %sbase, i3
 ; GFX950-SDAG-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX950-SDAG-NEXT:    v_mov_b32_e32 v2, v1
 ; GFX950-SDAG-NEXT:    v_mov_b32_e32 v1, 0
-; GFX950-SDAG-NEXT:    s_movk_i32 s0, 0xff80
 ; GFX950-SDAG-NEXT:    v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1]
-; GFX950-SDAG-NEXT:    s_mov_b32 s1, -1
-; GFX950-SDAG-NEXT:    v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1]
+; GFX950-SDAG-NEXT:    s_movk_i32 s2, 0xff80
+; GFX950-SDAG-NEXT:    s_mov_b32 s3, -1
 ; GFX950-SDAG-NEXT:    s_mov_b64 s[0:1], src_private_base
+; GFX950-SDAG-NEXT:    v_lshl_add_u64 v[4:5], v[0:1], 0, s[2:3]
 ; GFX950-SDAG-NEXT:    v_cmp_ne_u32_e32 vcc, s1, v5
 ; GFX950-SDAG-NEXT:    ; implicit-def: $vgpr0_vgpr1
 ; GFX950-SDAG-NEXT:    s_and_saveexec_b64 s[0:1], vcc
@@ -1869,8 +1868,8 @@ define amdgpu_ps void @flat_add_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset,
 ; GFX950-SDAG-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX950-SDAG-NEXT:    v_mov_b32_e32 v2, v1
 ; GFX950-SDAG-NEXT:    v_mov_b32_e32 v1, 0
-; GFX950-SDAG-NEXT:    v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1]
 ; GFX950-SDAG-NEXT:    s_mov_b64 s[0:1], src_private_base
+; GFX950-SDAG-NEXT:    v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1]
 ; GFX950-SDAG-NEXT:    v_cmp_ne_u32_e32 vcc, s1, v1
 ; GFX950-SDAG-NEXT:    s_and_saveexec_b64 s[0:1], vcc
 ; GFX950-SDAG-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
@@ -1905,9 +1904,8 @@ define amdgpu_ps void @flat_add_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset,
 ; GFX950-GISEL-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX950-GISEL-NEXT:    v_add_co_u32_e32 v0, vcc, v2, v0
 ; GFX950-GISEL-NEXT:    v_mov_b32_e32 v4, v1
-; GFX950-GISEL-NEXT:    s_nop 0
-; GFX950-GISEL-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v3, vcc
 ; GFX950-GISEL-NEXT:    s_mov_b64 s[0:1], src_private_base
+; GFX950-GISEL-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v3, vcc
 ; GFX950-GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, s1, v1
 ; GFX950-GISEL-NEXT:    s_and_saveexec_b64 s[0:1], vcc
 ; GFX950-GISEL-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
@@ -2038,11 +2036,11 @@ define amdgpu_ps void @flat_add_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo
 ; GFX950-SDAG-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX950-SDAG-NEXT:    v_mov_b32_e32 v2, v1
 ; GFX950-SDAG-NEXT:    v_mov_b32_e32 v1, 0
-; GFX950-SDAG-NEXT:    s_movk_i32 s0, 0xff80
 ; GFX950-SDAG-NEXT:    v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1]
-; GFX950-SDAG-NEXT:    s_mov_b32 s1, -1
-; GFX950-SDAG-NEXT:    v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1]
+; GFX950-SDAG-NEXT:    s_movk_i32 s2, 0xff80
+; GFX950-SDAG-NEXT:    s_mov_b32 s3, -1
 ; GFX950-SDAG-NEXT:    s_mov_b64 s[0:1], src_private_base
+; GFX950-SDAG-NEXT:    v_lshl_add_u64 v[0:1], v[0:1], 0, s[2:3]
 ; GFX950-SDAG-NEXT:    v_cmp_ne_u32_e32 vcc, s1, v1
 ; GFX950-SDAG-NEXT:    s_and_saveexec_b64 s[0:1], vcc
 ; GFX950-SDAG-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
@@ -2393,8 +2391,8 @@ define amdgpu_ps <2 x float> @flat_sub_saddr_i64_rtn(ptr inreg %sbase, i32 %voff
 ; GFX950-SDAG-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX950-SDAG-NEXT:    v_mov_b32_e32 v2, v1
 ; GFX950-SDAG-NEXT:    v_mov_b32_e32 v1, 0
-; GFX950-SDAG-NEXT:    v_lshl_add_u64 v[4:5], s[2:3], 0, v[0:1]
 ; GFX950-SDAG-NEXT:    s_mov_b64 s[0:1], src_private_base
+; GFX950-SDAG-NEXT:    v_lshl_add_u64 v[4:5], s[2:3], 0, v[0:1]
 ; GFX950-SDAG-NEXT:    v_cmp_ne_u32_e32 vcc, s1, v5
 ; GFX950-SDAG-NEXT:    ; implicit-def: $vgpr0_vgpr1
 ; GFX950-SDAG-NEXT:    s_and_saveexec_b64 s[0:1], vcc
@@ -2586,11 +2584,11 @@ define amdgpu_ps <2 x float> @flat_sub_saddr_i64_rtn_neg128(ptr inreg %sbase, i3
 ; GFX950-SDAG-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX950-SDAG-NEXT:    v_mov_b32_e32 v2, v1
 ; GFX950-SDAG-NEXT:    v_mov_b32_e32 v1, 0
-; GFX950-SDAG-NEXT:    s_movk_i32 s0, 0xff80
 ; GFX950-SDAG-NEXT:    v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1]
-; GFX950-SDAG-NEXT:    s_mov_b32 s1, -1
-; GFX950-SDAG-NEXT:    v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1]
+; GFX950-SDAG-NEXT:    s_movk_i32 s2, 0xff80
+; GFX950-SDAG-NEXT:    s_mov_b32 s3, -1
 ; GFX950-SDAG-NEXT:    s_mov_b64 s[0:1], src_private_base
+; GFX950-SDAG-NEXT:    v_lshl_add_u64 v[4:5], v[0:1], 0, s[2:3]
 ; GFX950-SDAG-NEXT:    v_cmp_ne_u32_e32 vcc, s1, v5
 ; GFX950-SDAG-NEXT:    ; implicit-def: $vgpr0_vgpr1
 ; GFX950-SDAG-NEXT:    s_and_saveexec_b64 s[0:1], vcc
@@ -2769,8 +2767,8 @@ define amdgpu_ps void @flat_sub_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset,
 ; GFX950-SDAG-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX950-SDAG-NEXT:    v_mov_b32_e32 v2, v1
 ; GFX950-SDAG-NEXT:    v_mov_b32_e32 v1, 0
-; GFX950-SDAG-NEXT:    v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1]
 ; GFX950-SDAG-NEXT:    s_mov_b64 s[0:1], src_private_base
+; GFX950-SDAG-NEXT:    v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1]
 ; GFX950-SDAG-NEXT:    v_cmp_ne_u32_e32 vcc, s1, v1
 ; GFX950-SDAG-NEXT:    s_and_saveexec_b64 s[0:1], vcc
 ; GFX950-SDAG-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
@@ -2807,9 +2805,8 @@ define amdgpu_ps void @flat_sub_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset,
 ; GFX950-GISEL-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX950-GISEL-NEXT:    v_add_co_u32_e32 v0, vcc, v2, v0
 ; GFX950-GISEL-NEXT:    v_mov_b32_e32 v4, v1
-; GFX950-GISEL-NEXT:    s_nop 0
-; GFX950-GISEL-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v3, vcc
 ; GFX950-GISEL-NEXT:    s_mov_b64 s[0:1], src_private_base
+; GFX950-GISEL-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v3, vcc
 ; GFX950-GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, s1, v1
 ; GFX950-GISEL-NEXT:    s_and_saveexec_b64 s[0:1], vcc
 ; GFX950-GISEL-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
@@ -2940,11 +2937,11 @@ define amdgpu_ps void @flat_sub_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo
 ; GFX950-SDAG-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX950-SDAG-NEXT:    v_mov_b32_e32 v2, v1
 ; GFX950-SDAG-NEXT:    v_mov_b32_e32 v1, 0
-; GFX950-SDAG-NEXT:    s_movk_i32 s0, 0xff80
 ; GFX950-SDAG-NEXT:    v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1]
-; GFX950-SDAG-NEXT:    s_mov_b32 s1, -1
-; GFX950-SDAG-NEXT:    v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1]
+; GFX950-SDAG-NEXT:    s_movk_i32 s2, 0xff80
+; GFX950-SDAG-NEXT:    s_mov_b32 s3, -1
 ; GFX950-SDAG-NEXT:    s_mov_b64 s[0:1], src_private_base
+; GFX950-SDAG-NEXT:    v_lshl_add_u64 v[0:1], v[0:1], 0, s[2:3]
 ; GFX950-SDAG-NEXT:    v_cmp_ne_u32_e32 vcc, s1, v1
 ; GFX950-SDAG-NEXT:    s_and_saveexec_b64 s[0:1], vcc
 ; GFX950-SDAG-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
@@ -3299,8 +3296,8 @@ define amdgpu_ps <2 x float> @flat_and_saddr_i64_rtn(ptr inreg %sbase, i32 %voff
 ; GFX950-SDAG-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX950-SDAG-NEXT:    v_mov_b32_e32 v2, v1
 ; GFX950-SDAG-NEXT:    v_mov_b32_e32 v1, 0
-; GFX950-SDAG-NEXT:    v_lshl_add_u64 v[4:5], s[2:3], 0, v[0:1]
 ; GFX950-SDAG-NEXT:    s_mov_b64 s[0:1], src_private_base
+; GFX950-SDAG-NEXT:    v_lshl_add_u64 v[4:5], s[2:3], 0, v[0:1]
 ; GFX950-SDAG-NEXT:    v_cmp_ne_u32_e32 vcc, s1, v5
 ; GFX950-SDAG-NEXT:    ; implicit-def: $vgpr0_vgpr1
 ; GFX950-SDAG-NEXT:    s_and_saveexec_b64 s[0:1], vcc
@@ -3492,11 +3489,11 @@ define amdgpu_ps <2 x float> @flat_and_saddr_i64_rtn_neg128(ptr inreg %sbase, i3
 ; GFX950-SDAG-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX950-SDAG-NEXT:    v_mov_b32_e32 v2, v1
 ; GFX950-SDAG-NEXT:    v_mov_b32_e32 v1, 0
-; GFX950-SDAG-NEXT:    s_movk_i32 s0, 0xff80
 ; GFX950-SDAG-NEXT:    v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1]
-; GFX950-SDAG-NEXT:    s_mov_b32 s1, -1
-; GFX950-SDAG-NEXT:    v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1]
+; GFX950-SDAG-NEXT:    s_movk_i32 s2, 0xff80
+; GFX950-SDAG-NEXT:    s_mov_b32 s3, -1
 ; GFX950-SDAG-NEXT:    s_mov_b64 s[0:1], src_private_base
+; GFX950-SDAG-NEXT:    v_lshl_add_u64 v[4:5], v[0:1], 0, s[2:3]
 ; GFX950-SDAG-NEXT:    v_cmp_ne_u32_e32 vcc, s1, v5
 ; GFX950-SDAG-NEXT:    ; implicit-def: $vgpr0_vgpr1
 ; GFX950-SDAG-NEXT:    s_and_saveexec_b64 s[0:1], vcc
@@ -3675,8 +3672,8 @@ define amdgpu_ps void @flat_and_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset,
 ; GFX950-SDAG-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX950-SDAG-NEXT:    v_mov_b32_e32 v2, v1
 ; GFX950-SDAG-NEXT:    v_mov_b32_e32 v1, 0
-; GFX950-SDAG-NEXT:    v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1]
 ; GFX950-SDAG-NEXT:    s_mov_b64 s[0:1], src_private_base
+; GFX950-SDAG-NEXT:    v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1]
 ; GFX950-SDAG-NEXT:    v_cmp_ne_u32_e32 vcc, s1, v1
 ; GFX950-SDAG-NEXT:    s_and_saveexec_b64 s[0:1], vcc
 ; GFX950-SDAG-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
@@ -3712,9 +3709,8 @@ define amdgpu_ps void @flat_and_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset,
 ; GFX950-GISEL-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX950-GISEL-NEXT:    v_add_co_u32_e32 v0, vcc, v2, v0
 ; GFX950-GISEL-NEXT:    v_mov_b32_e32 v4, v1
-; GFX950-GISEL-NEXT:    s_nop 0
-; GFX950-GISEL-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v3, vcc
 ; GFX950-GISEL-NEXT:    s_mov_b64 s[0:1], src_private_base
+; GFX950-GISEL-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v3, vcc
 ; GFX950-GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, s1, v1
 ; GFX950-GISEL-NEXT:    s_and_saveexec_b64 s[0:1], vcc
 ; GFX950-GISEL-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
@@ -3846,11 +3842,11 @@ define amdgpu_ps void @flat_and_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo
 ; GFX950-SDAG-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX950-SDAG-NEXT:    v_mov_b32_e32 v2, v1
 ; GFX950-SDAG-NEXT:    v_mov_b32_e32 v1, 0
-; GFX950-SDAG-NEXT:    s_movk_i32 s0, 0xff80
 ; GFX950-SDAG-NEXT:    v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1]
-; GFX950-SDAG-NEXT:    s_mov_b32 s1, -1
-; GFX950-SDAG-NEXT:    v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1]
+; GFX950-SDAG-NEXT:    s_movk_i32 s2, 0xff80
+; GFX950-SDAG-NEXT:    s_mov_b32 s3, -1
 ; GFX950-SDAG-NEXT:    s_mov_b64 s[0:1], src_private_base
+; GFX950-SDAG-NEXT:    v_lshl_add_u64 v[0:1], v[0:1], 0, s[2:3]
 ; GFX950-SDAG-NEXT:    v_cmp_ne_u32_e32 vcc, s1, v1
 ; GFX950-SDAG-NEXT:    s_and_saveexec_b64 s[0:1], vcc
 ; GFX950-SDAG-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
@@ -4203,8 +4199,8 @@ define amdgpu_ps <2 x float> @flat_or_saddr_i64_rtn(ptr inreg %sbase, i32 %voffs
 ; GFX950-SDAG-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX950-SDAG-NEXT:    v_mov_b32_e32 v2, v1
 ; GFX950-SDAG-NEXT:    v_mov_b32_e32 v1, 0
-; GFX950-SDAG-NEXT:    v_lshl_add_u64 v[4:5], s[2:3], 0, v[0:1]
 ; GFX950-SDAG-NEXT:    s_mov_b64 s[0:1], src_private_base
+; GFX950-SDAG-NEXT:    v_lshl_add_u64 v[4:5], s[2:3], 0, v[0:1]
 ; GFX950-SDAG-NEXT:    v_cmp_ne_u32_e32 vcc, s1, v5
 ; GFX950-SDAG-NEXT:    ; implicit-def: $vgpr0_vgpr1
 ; GFX950-SDAG-NEXT:    s_and_saveexec_b64 s[0:1], vcc
@@ -4396,11 +4392,11 @@ define amdgpu_ps <2 x float> @flat_or_saddr_i64_rtn_neg128(ptr inreg %sbase, i32
 ; GFX950-SDAG-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX950-SDAG-NEXT:    v_mov_b32_e32 v2, v1
 ; GFX950-SDAG-NEXT:    v_mov_b32_e32 v1, 0
-; GFX950-SDAG-NEXT:    s_movk_i32 s0, 0xff80
 ; GFX950-SDAG-NEXT:    v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1]
-; GFX950-SDAG-NEXT:    s_mov_b32 s1, -1
-; GFX950-SDAG-NEXT:    v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1]
+; GFX950-SDAG-NEXT:    s_movk_i32 s2, 0xff80
+; GFX950-SDAG-NEXT:    s_mov_b32 s3, -1
 ; GFX950-SDAG-NEXT:    s_mov_b64 s[0:1], src_private_base
+; GFX950-SDAG-NEXT:    v_lshl_add_u64 v[4:5], v[0:1], 0, s[2:3]
 ; GFX950-SDAG-NEXT:    v_cmp_ne_u32_e32 vcc, s1, v5
 ; GFX950-SDAG-NEXT:    ; implicit-def: $vgpr0_vgpr1
 ; GFX950-SDAG-NEXT:    s_and_saveexec_b64 s[0:1], vcc
@@ -4579,8 +4575,8 @@ define amdgpu_ps void @flat_or_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, i
 ; GFX950-SDAG-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX950-SDAG-NEXT:    v_mov_b32_e32 v2, v1
 ; GFX950-SDAG-NEXT:    v_mov_b32_e32 v1, 0
-; GFX950-SDAG-NEXT:    v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1]
 ; GFX950-SDAG-NEXT:    s_mov_b64 s[0:1], src_private_base
+; GFX950-SDAG-NEXT:    v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1]
 ; GFX950-SDAG-NEXT:    v_cmp_ne_u32_e32 vcc, s1, v1
 ; GFX950-SDAG-NEXT:    s_and_saveexec_b64 s[0:1], vcc
 ; GFX950-SDAG-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
@@ -4616,9 +4612,8 @@ define amdgpu_ps void @flat_or_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, i
 ; GFX950-GISEL-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX950-GISEL-NEXT:    v_add_co_u32_e32 v0, vcc, v2, v0
 ; GFX950-GISEL-NEXT:    v_mov_b32_e32 v4, v1
-; GFX950-GISEL-NEXT:    s_nop 0
-; GFX950-GISEL-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v3, vcc
 ; GFX950-GISEL-NEXT:    s_mov_b64 s[0:1], src_private_base
+; GFX950-GISEL-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v3, vcc
 ; GFX950-GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, s1, v1
 ; GFX950-GISEL-NEXT:    s_and_saveexec_b64 s[0:1], vcc
 ; GFX950-GISEL-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
@@ -4750,11 +4745,11 @@ define amdgpu_ps void @flat_or_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vof
 ; GFX950-SDAG-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX950-SDAG-NEXT:    v_mov_b32_e32 v2, v1
 ; GFX950-SDAG-NEXT:    v_mov_b32_e32 v1, 0
-; GFX950-SDAG-NEXT:    s_movk_i32 s0, 0xff80
 ; GFX950-SDAG-NEXT:    v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1]
-; GFX950-SDAG-NEXT:    s_mov_b32 s1, -1
-; GFX950-SDAG-NEXT:    v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1]
+; GFX950-SDAG-NEXT:    s_movk_i32 s2, 0xff80
+; GFX950-SDAG-NEXT:    s_mov_b32 s3, -1
 ; GFX950-SDAG-NEXT:    s_mov_b64 s[0:1], src_private_base
+; GFX950-SDAG-NEXT:    v_lshl_add_u64 v[0:1], v[0:1], 0, s[2:3]
 ; GFX950-SDAG-NEXT:    v_cmp_ne_u32_e32 vcc, s1, v1
 ; GFX950-SDAG-NEXT:    s_and_saveexec_b64 s[0:1], vcc
 ; GFX950-SDAG-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
@@ -5107,8 +5102,8 @@ define amdgpu_ps <2 x float> @flat_xor_saddr_i64_rtn(ptr inreg %sbase, i32 %voff
 ; GFX950-SDAG-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX950-SDAG-NEXT:    v_mov_b32_e32 v2, v1
 ; GFX950-SDAG-NEXT:    v_mov_b32_e32 v1, 0
-; GFX950-SDAG-NEXT:    v_lshl_add_u64 v[4:5], s[2:3], 0, v[0:1]
 ; GFX950-SDAG-NEXT:    s_mov_b64 s[0:1], src_private_base
+; GFX950-SDAG-NEXT:    v_lshl_add_u64 v[4:5], s[2:3], 0, v[0:1]
 ; GFX950-SDAG-NEXT:    v_cmp_ne_u32_e32 vcc, s1, v5
 ; GFX950-SDAG-NEXT:    ; implicit-def: $vgpr0_vgpr1
 ; GFX950-SDAG-NEXT:    s_and_saveexec_b64 s[0:1], vcc
@@ -5300,11 +5295,11 @@ define amdgpu_ps <2 x float> @flat_xor_saddr_i64_rtn_neg128(ptr inreg %sbase, i3
 ; GFX950-SDAG-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX950-SDAG-NEXT:    v_mov_b32_e32 v2, v1
 ; GFX950-SDAG-NEXT:    v_mov_b32_e32 v1, 0
-; GFX950-SDAG-NEXT:    s_movk_i32 s0, 0xff80
 ; GFX950-SDAG-NEXT:    v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1]
-; GFX950-SDAG-NEXT:    s_mov_b32 s1, -1
-; GFX950-SDAG-NEXT:    v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1]
+; GFX950-SDAG-NEXT:    s_movk_i32 s2, 0xff80
+; GFX950-SDAG-NEXT:    s_mov_b32 s3, -1
 ; GFX950-SDAG-NEXT:    s_mov_b64 s[0:1], src_private_base
+; GFX950-SDAG-NEXT:    v_lshl_add_u64 v[4:5], v[0:1], 0, s[2:3]
 ; GFX950-SDAG-NEXT:    v_cmp_ne_u32_e32 vcc, s1, v5
 ; GFX950-SDAG-NEXT:    ; implicit-def: $vgpr0_vgpr1
 ; GFX950-SDAG-NEXT:    s_and_saveexec_b64 s[0:1], vcc
@@ -5483,8 +5478,8 @@ define amdgpu_ps void @flat_xor_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset,
 ; GFX950-SDAG-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX950-SDAG-NEXT:    v_mov_b32_e32 v2, v1
 ; GFX950-SDAG-NEXT:    v_mov_b32_e32 v1, 0
-; GFX950-SDAG-NEXT:    v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1]
 ; GFX950-SDAG-NEXT:    s_mov_b64 s[0:1], src_private_base
+; GFX950-SDAG-NEXT:    v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1]
 ; GFX950-SDAG-NEXT:    v_cmp_ne_u32_e32 vcc, s1, v1
 ; GFX950-SDAG-NEXT:    s_and_saveexec_b64 s[0:1], vcc
 ; GFX950-SDAG-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
@@ -5520,9 +5515,8 @@ define amdgpu_ps void @flat_xor_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset,
 ; GFX950-GISEL-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX950-GISEL-NEXT:    v_add_co_u32_e32 v0, vcc, v2, v0
 ; GFX950-GISEL-NEXT:    v_mov_b32_e32 v4, v1
-; GFX950-GISEL-NEXT:    s_nop 0
-; GFX950-GISEL-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v3, vcc
 ; GFX950-GISEL-NEXT:    s_mov_b64 s[0:1], src_private_base
+; GFX950-GISEL-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v3, vcc
 ; GFX950-GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, s1, v1
 ; GFX950-GISEL-NEXT:    s_and_saveexec_b64 s[0:1], vcc
 ; GFX950-GISEL-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
@@ -5654,11 +5648,11 @@ define amdgpu_ps void @flat_xor_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo
 ; GFX950-SDAG-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX950-SDAG-NEXT:    v_mov_b32_e32 v2, v1
 ; GFX950-SDAG-NEXT:    v_mov_b32_e32 v1, 0
-; GFX950-SDAG-NEXT:    s_movk_i32 s0, 0xff80
 ; GFX950-SDAG-NEXT:    v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1]
-; GFX950-SDAG-NEXT:    s_mov_b32 s1, -1
-; GFX950-SDAG-NEXT:    v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1]
+; GFX950-SDAG-NEXT:    s_movk_i32 s2, 0xff80
+; GFX950-SDAG-NEXT:    s_mov_b32 s3, -1
 ; GFX950-SDAG-NEXT:    s_mov_b64 s[0:1], src_private_base
+; GFX950-SDAG-NEXT:    v_lshl_add_u64 v[0:1], v[0:1], 0, s[2:3]
 ; GFX950-SDAG-NEXT:    v_cmp_ne_u32_e32 vcc, s1, v1
 ; GFX950-SDAG-NEXT:    s_and_saveexec_b64 s[0:1], vcc
 ; GFX950-SDAG-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
@@ -5971,8 +5965,8 @@ define amdgpu_ps <2 x float> @flat_max_saddr_i64_rtn(ptr inreg %sbase, i32 %voff
 ; GFX950-SDAG-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX950-SDAG-NEXT:    v_mov_b32_e32 v2, v1
 ; GFX950-SDAG-NEXT:    v_mov_b32_e32 v1, 0
-; GFX950-SDAG-NEXT:    v_lshl_add_u64 v[4:5], s[2:3], 0, v[0:1]
 ; GFX950-SDAG-NEXT:    s_mov_b64 s[0:1], src_private_base
+; GFX950-SDAG-NEXT:    v_lshl_add_u64 v[4:5], s[2:3], 0, v[0:1]
 ; GFX950-SDAG-NEXT:    v_cmp_ne_u32_e32 vcc, s1, v5
 ; GFX950-SDAG-NEXT:    ; implicit-def: $vgpr0_vgpr1
 ; GFX950-SDAG-NEXT:    s_and_saveexec_b64 s[0:1], vcc
@@ -6156,11 +6150,11 @@ define amdgpu_ps <2 x float> @flat_max_saddr_i64_rtn_neg128(ptr inreg %sbase, i3
 ; GFX950-SDAG-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX950-SDAG-NEXT:    v_mov_b32_e32 v2, v1
 ; GFX950-SDAG-NEXT:    v_mov_b32_e32 v1, 0
-; GFX950-SDAG-NEXT:    s_movk_i32 s0, 0xff80
 ; GFX950-SDAG-NEXT:    v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1]
-; GFX950-SDAG-NEXT:    s_mov_b32 s1, -1
-; GFX950-SDAG-NEXT:    v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1]
+; GFX950-SDAG-NEXT:    s_movk_i32 s2, 0xff80
+; GFX950-SDAG-NEXT:    s_mov_b32 s3, -1
 ; GFX950-SDAG-NEXT:    s_mov_b64 s[0:1], src_private_base
+; GFX950-SDAG-NEXT:    v_lshl_add_u64 v[4:5], v[0:1], 0, s[2:3]
 ; GFX950-SDAG-NEXT:    v_cmp_ne_u32_e32 vcc, s1, v5
 ; GFX950-SDAG-NEXT:    ; implicit-def: $vgpr0_vgpr1
 ; GFX950-SDAG-NEXT:    s_and_saveexec_b64 s[0:1], vcc
@@ -6333,8 +6327,8 @@ define amdgpu_ps void @flat_max_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset,
 ; GFX950-SDAG-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX950-SDAG-NEXT:    v_mov_b32_e32 v2, v1
 ; GFX950-SDAG-NEXT:    v_mov_b32_e32 v1, 0
-; GFX950-SDAG-NEXT:    v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1]
 ; GFX950-SDAG-NEXT:    s_mov_b64 s[0:1], src_private_base
+; GFX950-SDAG-NEXT:    v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1]
 ; GFX950-SDAG-NEXT:    v_cmp_ne_u32_e32 vcc, s1, v1
 ; GFX950-SDAG-NEXT:    s_and_saveexec_b64 s[0:1], vcc
 ; GFX950-SDAG-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
@@ -6370,9 +6364,8 @@ define amdgpu_ps void @flat_max_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset,
 ; GFX950-GISEL-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX950-GISEL-NEXT:    v_add_co_u32_e32 v0, vcc, v2, v0
 ; GFX950-GISEL-NEXT:    v_mov_b32_e32 v4, v1
-; GFX950-GISEL-NEXT:    s_nop 0
-; GFX950-GISEL-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v3, vcc
 ; GFX950-GISEL-NEXT:    s_mov_b64 s[0:1], src_private_base
+; GFX950-GISEL-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v3, vcc
 ; GFX950-GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, s1, v1
 ; GFX950-GISEL-NEXT:    s_and_saveexec_b64 s[0:1], vcc
 ; GFX950-GISEL-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
@@ -6496,11 +6489,11 @@ define amdgpu_ps void @flat_max_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo
 ; GFX950-SDAG-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX950-SDAG-NEXT:    v_mov_b32_e32 v2, v1
 ; GFX950-SDAG-NEXT:    v_mov_b32_e32 v1, 0
-; GFX950-SDAG-NEXT:    s_movk_i32 s0, 0xff80
 ; GFX950-SDAG-NEXT:    v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1]
-; GFX950-SDAG-NEXT:    s_mov_b32 s1, -1
-; GFX950-SDAG-NEXT:    v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1]
+; GFX950-SDAG-NEXT:    s_movk_i32 s2, 0xff80
+; GFX950-SDAG-NEXT:    s_mov_b32 s3, -1
 ; GFX950-SDAG-NEXT:    s_mov_b64 s[0:1], src_private_base
+; GFX950-SDAG-NEXT:    v_lshl_add_u64 v[0:1], v[0:1], 0, s[2:3]
 ; GFX950-SDAG-NEXT:    v_cmp_ne_u32_e32 vcc, s1, v1
 ; GFX950-SDAG-NEXT:    s_and_saveexec_b64 s[0:1], vcc
 ; GFX950-SDAG-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
@@ -6813,8 +6806,8 @@ define amdgpu_ps <2 x float> @flat_min_saddr_i64_rtn(ptr inreg %sbase, i32 %voff
 ; GFX950-SDAG-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX950-SDAG-NEXT:    v_mov_b32_e32 v2, v1
 ; GFX950-SDAG-NEXT:    v_mov_b32_e32 v1, 0
-; GFX950-SDAG-NEXT:    v_lshl_add_u64 v[4:5], s[2:3], 0, v[0:1]
 ; GFX950-SDAG-NEXT:    s_mov_b64 s[0:1], src_private_base
+; GFX950-SDAG-NEXT:    v_lshl_add_u64 v[4:5], s[2:3], 0, v[0:1]
 ; GFX950-SDAG-NEXT:    v_cmp_ne_u32_e32 vcc, s1, v5
 ; GFX950-SDAG-NEXT:    ; implicit-def: $vgpr0_vgpr1
 ; GFX950-SDAG-NEXT:    s_and_saveexec_b64 s[0:1], vcc
@@ -6998,11 +6991,11 @@ define amdgpu_ps <2 x float> @flat_min_saddr_i64_rtn_neg128(ptr inreg %sbase, i3
 ; GFX950-SDAG-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX950-SDAG-NEXT:    v_mov_b32_e32 v2, v1
 ; GFX950-SDAG-NEXT:    v_mov_b32_e32 v1, 0
-; GFX950-SDAG-NEXT:    s_movk_i32 s0, 0xff80
 ; GFX950-SDAG-NEXT:    v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1]
-; GFX950-SDAG-NEXT:    s_mov_b32 s1, -1
-; GFX950-SDAG-NEXT:    v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1]
+; GFX950-SDAG-NEXT:    s_movk_i32 s2, 0xff80
+; GFX950-SDAG-NEXT:    s_mov_b32 s3, -1
 ; GFX950-SDAG-NEXT:    s_mov_b64 s[0:1], src_private_base
+; GFX950-SDAG-NEXT:    v_lshl_add_u64 v[4:5], v[0:1], 0, s[2:3]
 ; GFX950-SDAG-NEXT:    v_cmp_ne_u32_e32 vcc, s1, v5
 ; GFX950-SDAG-NEXT:    ; implicit-def: $vgpr0_vgpr1
 ; GFX950-SDAG-NEXT:    s_and_saveexec_b64 s[0:1], vcc
@@ -7175,8 +7168,8 @@ define amdgpu_ps void @flat_min_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset,
 ; GFX950-SDAG-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX950-SDAG-NEXT:    v_mov_b32_e32 v2, v1
 ; GFX950-SDAG-NEXT:    v_mov_b32_e32 v1, 0
-; GFX950-SDAG-NEXT:    v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1]
 ; GFX950-SDAG-NEXT:    s_mov_b64 s[0:1], src_private_base
+; GFX950-SDAG-NEXT:    v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1]
 ; GFX950-SDAG-NEXT:    v_cmp_ne_u32_e32 vcc, s1, v1
 ; GFX950-SDAG-NEXT:    s_and_saveexec_b64 s[0:1], vcc
 ; GFX950-SDAG-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
@@ -7212,9 +7205,8 @@ define amdgpu_ps void @flat_min_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset,
 ; GFX950-GISEL-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX950-GISEL-NEXT:    v_add_co_u32_e32 v0, vcc, v2, v0
 ; GFX950-GISEL-NEXT:    v_mov_b32_e32 v4, v1
-; GFX950-GISEL-NEXT:    s_nop 0
-; GFX950-GISEL-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v3, vcc
 ; GFX950-GISEL-NEXT:    s_mov_b64 s[0:1], src_private_base
+; GFX950-GISEL-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v3, vcc
 ; GFX950-GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, s1, v1
 ; GFX950-GISEL-NEXT:    s_and_saveexec_b64 s[0:1], vcc
 ; GFX950-GISEL-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
@@ -7338,11 +7330,11 @@ define amdgpu_ps void @flat_min_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo
 ; GFX950-SDAG-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX950-SDAG-NEXT:    v_mov_b32_e32 v2, v1
 ; GFX950-SDAG-NEXT:    v_mov_b32_e32 v1, 0
-; GFX950-SDAG-NEXT:    s_movk_i32 s0, 0xff80
 ; GFX950-SDAG-NEXT:    v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1]
-; GFX950-SDAG-NEXT:    s_mov_b32 s1, -1
-; GFX950-SDAG-NEXT:    v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1]
+; GFX950-SDAG-NEXT:    s_movk_i32 s2, 0xff80
+; GFX950-SDAG-NEXT:    s_mov_b32 s3, -1
 ; GFX950-SDAG-NEXT:    s_mov_b64 s[0:1], src_private_base
+; GFX950-SDAG-NEXT:    v_lshl_add_u64 v[0:1], v[0:1], 0, s[2:3]
 ; GFX950-SDAG-NEXT:    v_cmp_ne_u32_e32 vcc, s1, v1
 ; GFX950-SDAG-NEXT:    s_and_saveexec_b64 s[0:1], vcc
 ; GFX950-SDAG-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
@@ -7655,8 +7647,8 @@ define amdgpu_ps <2 x float> @flat_umax_saddr_i64_rtn(ptr inreg %sbase, i32 %vof
 ; GFX950-SDAG-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX950-SDAG-NEXT:    v_mov_b32_e32 v2, v1
 ; GFX950-SDAG-NEXT:    v_mov_b32_e32 v1, 0
-; GFX950-SDAG-NEXT:    v_lshl_add_u64 v[4:5], s[2:3], 0, v[0:1]
 ; GFX950-SDAG-NEXT:    s_mov_b64 s[0:1], src_private_base
+; GFX950-SDAG-NEXT:    v_lshl_add_u64 v[4:5], s[2:3], 0, v[0:1]
 ; GFX950-SDAG-NEXT:    v_cmp_ne_u32_e32 vcc, s1, v5
 ; GFX950-SDAG-NEXT:    ; implicit-def: $vgpr0_vgpr1
 ; GFX950-SDAG-NEXT:    s_and_saveexec_b64 s[0:1], vcc
@@ -7840,11 +7832,11 @@ define amdgpu_ps <2 x float> @flat_umax_saddr_i64_rtn_neg128(ptr inreg %sbase, i
 ; GFX950-SDAG-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX950-SDAG-NEXT:    v_mov_b32_e32 v2, v1
 ; GFX950-SDAG-NEXT:    v_mov_b32_e32 v1, 0
-; GFX950-SDAG-NEXT:    s_movk_i32 s0, 0xff80
 ; GFX950-SDAG-NEXT:    v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1]
-; GFX950-SDAG-NEXT:    s_mov_b32 s1, -1
-; GFX950-SDAG-NEXT:    v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1]
+; GFX950-SDAG-NEXT:    s_movk_i32 s2, 0xff80
+; GFX950-SDAG-NEXT:    s_mov_b32 s3, -1
 ; GFX950-SDAG-NEXT:    s_mov_b64 s[0:1], src_private_base
+; GFX950-SDAG-NEXT:    v_lshl_add_u64 v[4:5], v[0:1], 0, s[2:3]
 ; GFX950-SDAG-NEXT:    v_cmp_ne_u32_e32 vcc, s1, v5
 ; GFX950-SDAG-NEXT:    ; implicit-def: $vgpr0_vgpr1
 ; GFX950-SDAG-NEXT:    s_and_saveexec_b64 s[0:1], vcc
@@ -8017,8 +8009,8 @@ define amdgpu_ps void @flat_umax_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset,
 ; GFX950-SDAG-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX950-SDAG-NEXT:    v_mov_b32_e32 v2, v1
 ; GFX950-SDAG-NEXT:    v_mov_b32_e32 v1, 0
-; GFX950-SDAG-NEXT:    v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1]
 ; GFX950-SDAG-NEXT:    s_mov_b64 s[0:1], src_private_base
+; GFX950-SDAG-NEXT:    v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1]
 ; GFX950-SDAG-NEXT:    v_cmp_ne_u32_e32 vcc, s1, v1
 ; GFX950-SDAG-NEXT:    s_and_saveexec_b64 s[0:1], vcc
 ; GFX950-SDAG-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
@@ -8054,9 +8046,8 @@ define amdgpu_ps void @flat_umax_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset,
 ; GFX950-GISEL-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX950-GISEL-NEXT:    v_add_co_u32_e32 v0, vcc, v2, v0
 ; GFX950-GISEL-NEXT:    v_mov_b32_e32 v4, v1
-; GFX950-GISEL-NEXT:    s_nop 0
-; GFX950-GISEL-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v3, vcc
 ; GFX950-GISEL-NEXT:    s_mov_b64 s[0:1], src_private_base
+; GFX950-GISEL-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v3, vcc
 ; GFX950-GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, s1, v1
 ; GFX950-GISEL-NEXT:    s_and_saveexec_b64 s[0:1], vcc
 ; GFX950-GISEL-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
@@ -8180,11 +8171,11 @@ define amdgpu_ps void @flat_umax_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %v
 ; GFX950-SDAG-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX950-SDAG-NEXT:    v_mov_b32_e32 v2, v1
 ; GFX950-SDAG-NEXT:    v_mov_b32_e32 v1, 0
-; GFX950-SDAG-NEXT:    s_movk_i32 s0, 0xff80
 ; GFX950-SDAG-NEXT:    v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1]
-; GFX950-SDAG-NEXT:    s_mov_b32 s1, -1
-; GFX950-SDAG-NEXT:    v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1]
+; GFX950-SDAG-NEXT:    s_movk_i32 s2, 0xff80
+; GFX950-SDAG-NEXT:    s_mov_b32 s3, -1
 ; GFX950-SDAG-NEXT:    s_mov_b64 s[0:1], src_private_base
+; GFX950-SDAG-NEXT:    v_lshl_add_u64 v[0:1], v[0:1], 0, s[2:3]
 ; GFX950-SDAG-NEXT:    v_cmp_ne_u32_e32 vcc, s1, v1
 ; GFX950-SDAG-NEXT:    s_and_saveexec_b64 s[0:1], vcc
 ; GFX950-SDAG-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
@@ -8497,8 +8488,8 @@ define amdgpu_ps <2 x float> @flat_umin_saddr_i64_rtn(ptr inreg %sbase, i32 %vof
 ; GFX950-SDAG-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX950-SDAG-NEXT:    v_mov_b32_e32 v2, v1
 ; GFX950-SDAG-NEXT:    v_mov_b32_e32 v1, 0
-; GFX950-SDAG-NEXT:    v_lshl_add_u64 v[4:5], s[2:3], 0, v[0:1]
 ; GFX950-SDAG-NEXT:    s_mov_b64 s[0:1], src_private_base
+; GFX950-SDAG-NEXT:    v_lshl_add_u64 v[4:5], s[2:3], 0, v[0:1]
 ; GFX950-SDAG-NEXT:    v_cmp_ne_u32_e32 vcc, s1, v5
 ; GFX950-SDAG-NEXT:    ; implicit-def: $vgpr0_vgpr1
 ; GFX950-SDAG-NEXT:    s_and_saveexec_b64 s[0:1], vcc
@@ -8682,11 +8673,11 @@ define amdgpu_ps <2 x float> @flat_umin_saddr_i64_rtn_neg128(ptr inreg %sbase, i
 ; GFX950-SDAG-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX950-SDAG-NEXT:    v_mov_b32_e32 v2, v1
 ; GFX950-SDAG-NEXT:    v_mov_b32_e32 v1, 0
-; GFX950-SDAG-NEXT:    s_movk_i32 s0, 0xff80
 ; GFX950-SDAG-NEXT:    v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1]
-; GFX950-SDAG-NEXT:    s_mov_b32 s1, -1
-; GFX950-SDAG-NEXT:    v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1]
+; GFX950-SDAG-NEXT:    s_movk_i32 s2, 0xff80
+; GFX950-SDAG-NEXT:    s_mov_b32 s3, -1
 ; GFX950-SDAG-NEXT:    s_mov_b64 s[0:1], src_private_base
+; GFX950-SDAG-NEXT:    v_lshl_add_u64 v[4:5], v[0:1], 0, s[2:3]
 ; GFX950-SDAG-NEXT:    v_cmp_ne_u32_e32 vcc, s1, v5
 ; GFX950-SDAG-NEXT:    ; implicit-def: $vgpr0_vgpr1
 ; GFX950-SDAG-NEXT:    s_and_saveexec_b64 s[0:1], vcc
@@ -8859,8 +8850,8 @@ define amdgpu_ps void @flat_umin_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset,
 ; GFX950-SDAG-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX950-SDAG-NEXT:    v_mov_b32_e32 v2, v1
 ; GFX950-SDAG-NEXT:    v_mov_b32_e32 v1, 0
-; GFX950-SDAG-NEXT:    v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1]
 ; GFX950-SDAG-NEXT:    s_mov_b64 s[0:1], src_private_base
+; GFX950-SDAG-NEXT:    v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1]
 ; GFX950-SDAG-NEXT:    v_cmp_ne_u32_e32 vcc, s1, v1
 ; GFX950-SDAG-NEXT:    s_and_saveexec_b64 s[0:1], vcc
 ; GFX950-SDAG-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
@@ -8896,9 +8887,8 @@ define amdgpu_ps void @flat_umin_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset,
 ; GFX950-GISEL-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX950-GISEL-NEXT:    v_add_co_u32_e32 v0, vcc, v2, v0
 ; GFX950-GISEL-NEXT:    v_mov_b32_e32 v4, v1
-; GFX950-GISEL-NEXT:    s_nop 0
-; GFX950-GISEL-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v3, vcc
 ; GFX950-GISEL-NEXT:    s_mov_b64 s[0:1], src_private_base
+; GFX950-GISEL-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v3, vcc
 ; GFX950-GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, s1, v1
 ; GFX950-GISEL-NEXT:    s_and_saveexec_b64 s[0:1], vcc
 ; GFX950-GISEL-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
@@ -9022,11 +9012,11 @@ define amdgpu_ps void @flat_umin_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %v
 ; GFX950-SDAG-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX950-SDAG-NEXT:    v_mov_b32_e32 v2, v1
 ; GFX950-SDAG-NEXT:    v_mov_b32_e32 v1, 0
-; GFX950-SDAG-NEXT:    s_movk_i32 s0, 0xff80
 ; GFX950-SDAG-NEXT:    v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1]
-; GFX950-SDAG-NEXT:    s_mov_b32 s1, -1
-; GFX950-SDAG-NEXT:    v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1]
+; GFX950-SDAG-NEXT:    s_movk_i32 s2, 0xff80
+; GFX950-SDAG-NEXT:    s_mov_b32 s3, -1
 ; GFX950-SDAG-NEXT:    s_mov_b64 s[0:1], src_private_base
+; GFX950-SDAG-NEXT:    v_lshl_add_u64 v[0:1], v[0:1], 0, s[2:3]
 ; GFX950-SDAG-NEXT:    v_cmp_ne_u32_e32 vcc, s1, v1
 ; GFX950-SDAG-NEXT:    s_and_saveexec_b64 s[0:1], vcc
 ; GFX950-SDAG-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
@@ -9393,8 +9383,8 @@ define amdgpu_ps <2 x float> @flat_cmpxchg_saddr_i64_rtn(ptr inreg %sbase, i32 %
 ; GFX950-SDAG-NEXT:    v_mov_b32_e32 v5, v4
 ; GFX950-SDAG-NEXT:    v_mov_b32_e32 v4, v3
 ; GFX950-SDAG-NEXT:    v_mov_b32_e32 v7, v2
-; GFX950-SDAG-NEXT:    v_lshl_add_u64 v[2:3], s[2:3], 0, v[0:1]
 ; GFX950-SDAG-NEXT:    s_mov_b64 s[0:1], src_private_base
+; GFX950-SDAG-NEXT:    v_lshl_add_u64 v[2:3], s[2:3], 0, v[0:1]
 ; GFX950-SDAG-NEXT:    v_cmp_ne_u32_e32 vcc, s1, v3
 ; GFX950-SDAG-NEXT:    ; implicit-def: $vgpr0_vgpr1
 ; GFX950-SDAG-NEXT:    s_and_saveexec_b64 s[0:1], vcc
@@ -9594,14 +9584,14 @@ define amdgpu_ps <2 x float> @flat_cmpxchg_saddr_i64_rtn_neg128(ptr inreg %sbase
 ; GFX950-SDAG:       ; %bb.0:
 ; GFX950-SDAG-NEXT:    v_mov_b32_e32 v6, v1
 ; GFX950-SDAG-NEXT:    v_mov_b32_e32 v1, 0
-; GFX950-SDAG-NEXT:    s_movk_i32 s0, 0xff80
 ; GFX950-SDAG-NEXT:    v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1]
-; GFX950-SDAG-NEXT:    s_mov_b32 s1, -1
+; GFX950-SDAG-NEXT:    s_movk_i32 s2, 0xff80
+; GFX950-SDAG-NEXT:    s_mov_b32 s3, -1
 ; GFX950-SDAG-NEXT:    v_mov_b32_e32 v5, v4
 ; GFX950-SDAG-NEXT:    v_mov_b32_e32 v4, v3
 ; GFX950-SDAG-NEXT:    v_mov_b32_e32 v7, v2
-; GFX950-SDAG-NEXT:    v_lshl_add_u64 v[2:3], v[0:1], 0, s[0:1]
 ; GFX950-SDAG-NEXT:    s_mov_b64 s[0:1], src_private_base
+; GFX950-SDAG-NEXT:    v_lshl_add_u64 v[2:3], v[0:1], 0, s[2:3]
 ; GFX950-SDAG-NEXT:    v_cmp_ne_u32_e32 vcc, s1, v3
 ; GFX950-SDAG-NEXT:    ; implicit-def: $vgpr0_vgpr1
 ; GFX950-SDAG-NEXT:    s_and_saveexec_b64 s[0:1], vcc
@@ -9788,8 +9778,8 @@ define amdgpu_ps void @flat_cmpxchg_saddr_i64_nortn(ptr inreg %sbase, i32 %voffs
 ; GFX950-SDAG:       ; %bb.0:
 ; GFX950-SDAG-NEXT:    v_mov_b32_e32 v6, v1
 ; GFX950-SDAG-NEXT:    v_mov_b32_e32 v1, 0
-; GFX950-SDAG-NEXT:    v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1]
 ; GFX950-SDAG-NEXT:    s_mov_b64 s[0:1], src_private_base
+; GFX950-SDAG-NEXT:    v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1]
 ; GFX950-SDAG-NEXT:    v_mov_b32_e32 v5, v4
 ; GFX950-SDAG-NEXT:    v_mov_b32_e32 v4, v3
 ; GFX950-SDAG-NEXT:    v_mov_b32_e32 v7, v2
@@ -9831,9 +9821,8 @@ define amdgpu_ps void @flat_cmpxchg_saddr_i64_nortn(ptr inreg %sbase, i32 %voffs
 ; GFX950-GISEL-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX950-GISEL-NEXT:    v_add_co_u32_e32 v0, vcc, v2, v0
 ; GFX950-GISEL-NEXT:    v_mov_b32_e32 v8, v1
-; GFX950-GISEL-NEXT:    s_nop 0
-; GFX950-GISEL-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v3, vcc
 ; GFX950-GISEL-NEXT:    s_mov_b64 s[0:1], src_private_base
+; GFX950-GISEL-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v3, vcc
 ; GFX950-GISEL-NEXT:    v_mov_b32_e32 v7, v4
 ; GFX950-GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, s1, v1
 ; GFX950-GISEL-NEXT:    s_and_saveexec_b64 s[0:1], vcc
@@ -9969,11 +9958,11 @@ define amdgpu_ps void @flat_cmpxchg_saddr_i64_nortn_neg128(ptr inreg %sbase, i32
 ; GFX950-SDAG:       ; %bb.0:
 ; GFX950-SDAG-NEXT:    v_mov_b32_e32 v6, v1
 ; GFX950-SDAG-NEXT:    v_mov_b32_e32 v1, 0
-; GFX950-SDAG-NEXT:    s_movk_i32 s0, 0xff80
 ; GFX950-SDAG-NEXT:    v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1]
-; GFX950-SDAG-NEXT:    s_mov_b32 s1, -1
-; GFX950-SDAG-NEXT:    v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1]
+; GFX950-SDAG-NEXT:    s_movk_i32 s2, 0xff80
+; GFX950-SDAG-NEXT:    s_mov_b32 s3, -1
 ; GFX950-SDAG-NEXT:    s_mov_b64 s[0:1], src_private_base
+; GFX950-SDAG-NEXT:    v_lshl_add_u64 v[0:1], v[0:1], 0, s[2:3]
 ; GFX950-SDAG-NEXT:    v_mov_b32_e32 v5, v4
 ; GFX950-SDAG-NEXT:    v_mov_b32_e32 v4, v3
 ; GFX950-SDAG-NEXT:    v_mov_b32_e32 v7, v2
@@ -10299,8 +10288,8 @@ define amdgpu_ps <2 x float> @flat_inc_saddr_i64_rtn(ptr inreg %sbase, i32 %voff
 ; GFX950-SDAG-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX950-SDAG-NEXT:    v_mov_b32_e32 v2, v1
 ; GFX950-SDAG-NEXT:    v_mov_b32_e32 v1, 0
-; GFX950-SDAG-NEXT:    v_lshl_add_u64 v[4:5], s[2:3], 0, v[0:1]
 ; GFX950-SDAG-NEXT:    s_mov_b64 s[0:1], src_private_base
+; GFX950-SDAG-NEXT:    v_lshl_add_u64 v[4:5], s[2:3], 0, v[0:1]
 ; GFX950-SDAG-NEXT:    v_cmp_ne_u32_e32 vcc, s1, v5
 ; GFX950-SDAG-NEXT:    ; implicit-def: $vgpr0_vgpr1
 ; GFX950-SDAG-NEXT:    s_and_saveexec_b64 s[0:1], vcc
@@ -10498,11 +10487,11 @@ define amdgpu_ps <2 x float> @flat_inc_saddr_i64_rtn_neg128(ptr inreg %sbase, i3
 ; GFX950-SDAG-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX950-SDAG-NEXT:    v_mov_b32_e32 v2, v1
 ; GFX950-SDAG-NEXT:    v_mov_b32_e32 v1, 0
-; GFX950-SDAG-NEXT:    s_movk_i32 s0, 0xff80
 ; GFX950-SDAG-NEXT:    v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1]
-; GFX950-SDAG-NEXT:    s_mov_b32 s1, -1
-; GFX950-SDAG-NEXT:    v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1]
+; GFX950-SDAG-NEXT:    s_movk_i32 s2, 0xff80
+; GFX950-SDAG-NEXT:    s_mov_b32 s3, -1
 ; GFX950-SDAG-NEXT:    s_mov_b64 s[0:1], src_private_base
+; GFX950-SDAG-NEXT:    v_lshl_add_u64 v[4:5], v[0:1], 0, s[2:3]
 ; GFX950-SDAG-NEXT:    v_cmp_ne_u32_e32 vcc, s1, v5
 ; GFX950-SDAG-NEXT:    ; implicit-def: $vgpr0_vgpr1
 ; GFX950-SDAG-NEXT:    s_and_saveexec_b64 s[0:1], vcc
@@ -10683,8 +10672,8 @@ define amdgpu_ps void @flat_inc_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset,
 ; GFX950-SDAG-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX950-SDAG-NEXT:    v_mov_b32_e32 v2, v1
 ; GFX950-SDAG-NEXT:    v_mov_b32_e32 v1, 0
-; GFX950-SDAG-NEXT:    v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1]
 ; GFX950-SDAG-NEXT:    s_mov_b64 s[0:1], src_private_base
+; GFX950-SDAG-NEXT:    v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1]
 ; GFX950-SDAG-NEXT:    v_cmp_ne_u32_e32 vcc, s1, v1
 ; GFX950-SDAG-NEXT:    s_and_saveexec_b64 s[0:1], vcc
 ; GFX950-SDAG-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
@@ -10720,9 +10709,8 @@ define amdgpu_ps void @flat_inc_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset,
 ; GFX950-GISEL-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX950-GISEL-NEXT:    v_add_co_u32_e32 v0, vcc, v2, v0
 ; GFX950-GISEL-NEXT:    v_mov_b32_e32 v4, v1
-; GFX950-GISEL-NEXT:    s_nop 0
-; GFX950-GISEL-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v3, vcc
 ; GFX950-GISEL-NEXT:    s_mov_b64 s[0:1], src_private_base
+; GFX950-GISEL-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v3, vcc
 ; GFX950-GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, s1, v1
 ; GFX950-GISEL-NEXT:    s_and_saveexec_b64 s[0:1], vcc
 ; GFX950-GISEL-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
@@ -10852,11 +10840,11 @@ define amdgpu_ps void @flat_inc_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo
 ; GFX950-SDAG-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX950-SDAG-NEXT:    v_mov_b32_e32 v2, v1
 ; GFX950-SDAG-NEXT:    v_mov_b32_e32 v1, 0
-; GFX950-SDAG-NEXT:    s_movk_i32 s0, 0xff80
 ; GFX950-SDAG-NEXT:    v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1]
-; GFX950-SDAG-NEXT:    s_mov_b32 s1, -1
-; GFX950-SDAG-NEXT:    v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1]
+; GFX950-SDAG-NEXT:    s_movk_i32 s2, 0xff80
+; GFX950-SDAG-NEXT:    s_mov_b32 s3, -1
 ; GFX950-SDAG-NEXT:    s_mov_b64 s[0:1], src_private_base
+; GFX950-SDAG-NEXT:    v_lshl_add_u64 v[0:1], v[0:1], 0, s[2:3]
 ; GFX950-SDAG-NEXT:    v_cmp_ne_u32_e32 vcc, s1, v1
 ; GFX950-SDAG-NEXT:    s_and_saveexec_b64 s[0:1], vcc
 ; GFX950-SDAG-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
@@ -11180,8 +11168,8 @@ define amdgpu_ps <2 x float> @flat_dec_saddr_i64_rtn(ptr inreg %sbase, i32 %voff
 ; GFX950-SDAG-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX950-SDAG-NEXT:    v_mov_b32_e32 v2, v1
 ; GFX950-SDAG-NEXT:    v_mov_b32_e32 v1, 0
-; GFX950-SDAG-NEXT:    v_lshl_add_u64 v[4:5], s[2:3], 0, v[0:1]
 ; GFX950-SDAG-NEXT:    s_mov_b64 s[0:1], src_private_base
+; GFX950-SDAG-NEXT:    v_lshl_add_u64 v[4:5], s[2:3], 0, v[0:1]
 ; GFX950-SDAG-NEXT:    v_cmp_ne_u32_e32 vcc, s1, v5
 ; GFX950-SDAG-NEXT:    ; implicit-def: $vgpr0_vgpr1
 ; GFX950-SDAG-NEXT:    s_and_saveexec_b64 s[0:1], vcc
@@ -11385,11 +11373,11 @@ define amdgpu_ps <2 x float> @flat_dec_saddr_i64_rtn_neg128(ptr inreg %sbase, i3
 ; GFX950-SDAG-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX950-SDAG-NEXT:    v_mov_b32_e32 v2, v1
 ; GFX950-SDAG-NEXT:    v_mov_b32_e32 v1, 0
-; GFX950-SDAG-NEXT:    s_movk_i32 s0, 0xff80
 ; GFX950-SDAG-NEXT:    v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1]
-; GFX950-SDAG-NEXT:    s_mov_b32 s1, -1
-; GFX950-SDAG-NEXT:    v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1]
+; GFX950-SDAG-NEXT:    s_movk_i32 s2, 0xff80
+; GFX950-SDAG-NEXT:    s_mov_b32 s3, -1
 ; GFX950-SDAG-NEXT:    s_mov_b64 s[0:1], src_private_base
+; GFX950-SDAG-NEXT:    v_lshl_add_u64 v[4:5], v[0:1], 0, s[2:3]
 ; GFX950-SDAG-NEXT:    v_cmp_ne_u32_e32 vcc, s1, v5
 ; GFX950-SDAG-NEXT:    ; implicit-def: $vgpr0_vgpr1
 ; GFX950-SDAG-NEXT:    s_and_saveexec_b64 s[0:1], vcc
@@ -11576,8 +11564,8 @@ define amdgpu_ps void @flat_dec_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset,
 ; GFX950-SDAG-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX950-SDAG-NEXT:    v_mov_b32_e32 v2, v1
 ; GFX950-SDAG-NEXT:    v_mov_b32_e32 v1, 0
-; GFX950-SDAG-NEXT:    v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1]
 ; GFX950-SDAG-NEXT:    s_mov_b64 s[0:1], src_private_base
+; GFX950-SDAG-NEXT:    v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1]
 ; GFX950-SDAG-NEXT:    v_cmp_ne_u32_e32 vcc, s1, v1
 ; GFX950-SDAG-NEXT:    s_and_saveexec_b64 s[0:1], vcc
 ; GFX950-SDAG-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
@@ -11614,9 +11602,8 @@ define amdgpu_ps void @flat_dec_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset,
 ; GFX950-GISEL-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX950-GISEL-NEXT:    v_add_co_u32_e32 v0, vcc, v2, v0
 ; GFX950-GISEL-NEXT:    v_mov_b32_e32 v4, v1
-; GFX950-GISEL-NEXT:    s_nop 0
-; GFX950-GISEL-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v3, vcc
 ; GFX950-GISEL-NEXT:    s_mov_b64 s[0:1], src_private_base
+; GFX950-GISEL-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v3, vcc
 ; GFX950-GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, s1, v1
 ; GFX950-GISEL-NEXT:    s_and_saveexec_b64 s[0:1], vcc
 ; GFX950-GISEL-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
@@ -11751,11 +11738,11 @@ define amdgpu_ps void @flat_dec_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo
 ; GFX950-SDAG-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX950-SDAG-NEXT:    v_mov_b32_e32 v2, v1
 ; GFX950-SDAG-NEXT:    v_mov_b32_e32 v1, 0
-; GFX950-SDAG-NEXT:    s_movk_i32 s0, 0xff80
 ; GFX950-SDAG-NEXT:    v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1]
-; GFX950-SDAG-NEXT:    s_mov_b32 s1, -1
-; GFX950-SDAG-NEXT:    v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1]
+; GFX950-SDAG-NEXT:    s_movk_i32 s2, 0xff80
+; GFX950-SDAG-NEXT:    s_mov_b32 s3, -1
 ; GFX950-SDAG-NEXT:    s_mov_b64 s[0:1], src_private_base
+; GFX950-SDAG-NEXT:    v_lshl_add_u64 v[0:1], v[0:1], 0, s[2:3]
 ; GFX950-SDAG-NEXT:    v_cmp_ne_u32_e32 vcc, s1, v1
 ; GFX950-SDAG-NEXT:    s_and_saveexec_b64 s[0:1], vcc
 ; GFX950-SDAG-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
@@ -11841,8 +11828,8 @@ define double @flat_atomic_fadd_f64_saddr_rtn(ptr inreg %ptr, double %data) {
 ; GFX1250-SDAG:       ; %bb.0:
 ; GFX1250-SDAG-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX1250-SDAG-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-SDAG-NEXT:    s_add_nc_u64 s[0:1], s[0:1], 0x50
 ; GFX1250-SDAG-NEXT:    s_mov_b64 s[2:3], src_shared_base
+; GFX1250-SDAG-NEXT:    s_add_nc_u64 s[0:1], s[0:1], 0x50
 ; GFX1250-SDAG-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
 ; GFX1250-SDAG-NEXT:    s_cmp_eq_u32 s1, s3
 ; GFX1250-SDAG-NEXT:    s_cselect_b32 s2, -1, 0
@@ -11899,8 +11886,8 @@ define double @flat_atomic_fadd_f64_saddr_rtn(ptr inreg %ptr, double %data) {
 ; GFX1250-GISEL-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX1250-GISEL-NEXT:    s_wait_kmcnt 0x0
 ; GFX1250-GISEL-NEXT:    s_add_co_u32 s0, s0, 0x50
-; GFX1250-GISEL-NEXT:    s_add_co_ci_u32 s1, s1, 0
 ; GFX1250-GISEL-NEXT:    s_mov_b64 s[2:3], src_shared_base
+; GFX1250-GISEL-NEXT:    s_add_co_ci_u32 s1, s1, 0
 ; GFX1250-GISEL-NEXT:    s_mov_b32 s2, 1
 ; GFX1250-GISEL-NEXT:    s_cmp_lg_u32 s1, s3
 ; GFX1250-GISEL-NEXT:    ; implicit-def: $vgpr2_vgpr3
@@ -11958,8 +11945,8 @@ define double @flat_atomic_fadd_f64_saddr_rtn(ptr inreg %ptr, double %data) {
 ; GFX950-SDAG:       ; %bb.0:
 ; GFX950-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX950-SDAG-NEXT:    s_add_u32 s0, s0, 0x50
-; GFX950-SDAG-NEXT:    s_addc_u32 s1, s1, 0
 ; GFX950-SDAG-NEXT:    s_mov_b64 s[2:3], src_shared_base
+; GFX950-SDAG-NEXT:    s_addc_u32 s1, s1, 0
 ; GFX950-SDAG-NEXT:    s_cmp_eq_u32 s1, s3
 ; GFX950-SDAG-NEXT:    s_cselect_b64 s[2:3], -1, 0
 ; GFX950-SDAG-NEXT:    s_andn2_b64 vcc, exec, s[2:3]
@@ -12006,8 +11993,8 @@ define double @flat_atomic_fadd_f64_saddr_rtn(ptr inreg %ptr, double %data) {
 ; GFX950-GISEL:       ; %bb.0:
 ; GFX950-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX950-GISEL-NEXT:    s_add_u32 s0, s0, 0x50
-; GFX950-GISEL-NEXT:    s_addc_u32 s1, s1, 0
 ; GFX950-GISEL-NEXT:    s_mov_b64 s[2:3], src_shared_base
+; GFX950-GISEL-NEXT:    s_addc_u32 s1, s1, 0
 ; GFX950-GISEL-NEXT:    s_cmp_lg_u32 s1, s3
 ; GFX950-GISEL-NEXT:    s_mov_b32 s2, 1
 ; GFX950-GISEL-NEXT:    ; implicit-def: $vgpr2_vgpr3
@@ -12061,8 +12048,8 @@ define void @flat_atomic_fadd_f64_saddr_nortn(ptr inreg %ptr, double %data) {
 ; GFX1250-SDAG:       ; %bb.0:
 ; GFX1250-SDAG-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX1250-SDAG-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-SDAG-NEXT:    s_add_nc_u64 s[0:1], s[0:1], 0x50
 ; GFX1250-SDAG-NEXT:    s_mov_b64 s[2:3], src_shared_base
+; GFX1250-SDAG-NEXT:    s_add_nc_u64 s[0:1], s[0:1], 0x50
 ; GFX1250-SDAG-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
 ; GFX1250-SDAG-NEXT:    s_cmp_eq_u32 s1, s3
 ; GFX1250-SDAG-NEXT:    s_cselect_b32 s2, -1, 0
@@ -12121,8 +12108,8 @@ define void @flat_atomic_fadd_f64_saddr_nortn(ptr inreg %ptr, double %data) {
 ; GFX1250-GISEL-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX1250-GISEL-NEXT:    s_wait_kmcnt 0x0
 ; GFX1250-GISEL-NEXT:    s_add_co_u32 s0, s0, 0x50
-; GFX1250-GISEL-NEXT:    s_add_co_ci_u32 s1, s1, 0
 ; GFX1250-GISEL-NEXT:    s_mov_b64 s[2:3], src_shared_base
+; GFX1250-GISEL-NEXT:    s_add_co_ci_u32 s1, s1, 0
 ; GFX1250-GISEL-NEXT:    s_mov_b32 s2, 1
 ; GFX1250-GISEL-NEXT:    s_cmp_lg_u32 s1, s3
 ; GFX1250-GISEL-NEXT:    s_cbranch_scc0 .LBB111_6
@@ -12177,8 +12164,8 @@ define void @flat_atomic_fadd_f64_saddr_nortn(ptr inreg %ptr, double %data) {
 ; GFX950-SDAG:       ; %bb.0:
 ; GFX950-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX950-SDAG-NEXT:    s_add_u32 s0, s0, 0x50
-; GFX950-SDAG-NEXT:    s_addc_u32 s1, s1, 0
 ; GFX950-SDAG-NEXT:    s_mov_b64 s[2:3], src_shared_base
+; GFX950-SDAG-NEXT:    s_addc_u32 s1, s1, 0
 ; GFX950-SDAG-NEXT:    s_cmp_eq_u32 s1, s3
 ; GFX950-SDAG-NEXT:    s_cselect_b64 s[2:3], -1, 0
 ; GFX950-SDAG-NEXT:    s_andn2_b64 vcc, exec, s[2:3]
@@ -12225,8 +12212,8 @@ define void @flat_atomic_fadd_f64_saddr_nortn(ptr inreg %ptr, double %data) {
 ; GFX950-GISEL:       ; %bb.0:
 ; GFX950-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX950-GISEL-NEXT:    s_add_u32 s0, s0, 0x50
-; GFX950-GISEL-NEXT:    s_addc_u32 s1, s1, 0
 ; GFX950-GISEL-NEXT:    s_mov_b64 s[2:3], src_shared_base
+; GFX950-GISEL-NEXT:    s_addc_u32 s1, s1, 0
 ; GFX950-GISEL-NEXT:    s_cmp_lg_u32 s1, s3
 ; GFX950-GISEL-NEXT:    s_mov_b32 s2, 1
 ; GFX950-GISEL-NEXT:    s_cbranch_scc0 .LBB111_6
@@ -12355,8 +12342,8 @@ define double @flat_atomic_fmax_f64_saddr_rtn(ptr inreg %ptr, double %data) {
 ; GFX950-SDAG:       ; %bb.0:
 ; GFX950-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX950-SDAG-NEXT:    s_add_u32 s0, s0, 0x50
-; GFX950-SDAG-NEXT:    s_addc_u32 s1, s1, 0
 ; GFX950-SDAG-NEXT:    s_mov_b64 s[2:3], src_private_base
+; GFX950-SDAG-NEXT:    s_addc_u32 s1, s1, 0
 ; GFX950-SDAG-NEXT:    s_cmp_eq_u32 s1, s3
 ; GFX950-SDAG-NEXT:    s_cselect_b64 s[2:3], -1, 0
 ; GFX950-SDAG-NEXT:    s_andn2_b64 vcc, exec, s[2:3]
@@ -12388,8 +12375,8 @@ define double @flat_atomic_fmax_f64_saddr_rtn(ptr inreg %ptr, double %data) {
 ; GFX950-GISEL:       ; %bb.0:
 ; GFX950-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX950-GISEL-NEXT:    s_add_u32 s2, s0, 0x50
-; GFX950-GISEL-NEXT:    s_addc_u32 s3, s1, 0
 ; GFX950-GISEL-NEXT:    s_mov_b64 s[4:5], src_private_base
+; GFX950-GISEL-NEXT:    s_addc_u32 s3, s1, 0
 ; GFX950-GISEL-NEXT:    s_cmp_lg_u32 s3, s5
 ; GFX950-GISEL-NEXT:    s_mov_b32 s4, 1
 ; GFX950-GISEL-NEXT:    ; implicit-def: $vgpr2_vgpr3
@@ -12506,8 +12493,8 @@ define void @flat_atomic_fmax_f64_saddr_nortn(ptr inreg %ptr, double %data) {
 ; GFX950-SDAG:       ; %bb.0:
 ; GFX950-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX950-SDAG-NEXT:    s_add_u32 s0, s0, 0x50
-; GFX950-SDAG-NEXT:    s_addc_u32 s1, s1, 0
 ; GFX950-SDAG-NEXT:    s_mov_b64 s[2:3], src_private_base
+; GFX950-SDAG-NEXT:    s_addc_u32 s1, s1, 0
 ; GFX950-SDAG-NEXT:    s_cmp_eq_u32 s1, s3
 ; GFX950-SDAG-NEXT:    s_cselect_b64 s[2:3], -1, 0
 ; GFX950-SDAG-NEXT:    s_andn2_b64 vcc, exec, s[2:3]
@@ -12540,8 +12527,8 @@ define void @flat_atomic_fmax_f64_saddr_nortn(ptr inreg %ptr, double %data) {
 ; GFX950-GISEL:       ; %bb.0:
 ; GFX950-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX950-GISEL-NEXT:    s_add_u32 s2, s0, 0x50
-; GFX950-GISEL-NEXT:    s_addc_u32 s3, s1, 0
 ; GFX950-GISEL-NEXT:    s_mov_b64 s[4:5], src_private_base
+; GFX950-GISEL-NEXT:    s_addc_u32 s3, s1, 0
 ; GFX950-GISEL-NEXT:    s_cmp_lg_u32 s3, s5
 ; GFX950-GISEL-NEXT:    s_mov_b32 s4, 1
 ; GFX950-GISEL-NEXT:    s_cbranch_scc0 .LBB113_2
@@ -12656,8 +12643,8 @@ define double @flat_atomic_fmin_f64_saddr_rtn(ptr inreg %ptr, double %data) {
 ; GFX950-SDAG:       ; %bb.0:
 ; GFX950-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX950-SDAG-NEXT:    s_add_u32 s0, s0, 0x50
-; GFX950-SDAG-NEXT:    s_addc_u32 s1, s1, 0
 ; GFX950-SDAG-NEXT:    s_mov_b64 s[2:3], src_private_base
+; GFX950-SDAG-NEXT:    s_addc_u32 s1, s1, 0
 ; GFX950-SDAG-NEXT:    s_cmp_eq_u32 s1, s3
 ; GFX950-SDAG-NEXT:    s_cselect_b64 s[2:3], -1, 0
 ; GFX950-SDAG-NEXT:    s_andn2_b64 vcc, exec, s[2:3]
@@ -12689,8 +12676,8 @@ define double @flat_atomic_fmin_f64_saddr_rtn(ptr inreg %ptr, double %data) {
 ; GFX950-GISEL:       ; %bb.0:
 ; GFX950-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX950-GISEL-NEXT:    s_add_u32 s2, s0, 0x50
-; GFX950-GISEL-NEXT:    s_addc_u32 s3, s1, 0
 ; GFX950-GISEL-NEXT:    s_mov_b64 s[4:5], src_private_base
+; GFX950-GISEL-NEXT:    s_addc_u32 s3, s1, 0
 ; GFX950-GISEL-NEXT:    s_cmp_lg_u32 s3, s5
 ; GFX950-GISEL-NEXT:    s_mov_b32 s4, 1
 ; GFX950-GISEL-NEXT:    ; implicit-def: $vgpr2_vgpr3
@@ -12807,8 +12794,8 @@ define void @flat_atomic_fmin_f64_saddr_nortn(ptr inreg %ptr, double %data) {
 ; GFX950-SDAG:       ; %bb.0:
 ; GFX950-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX950-SDAG-NEXT:    s_add_u32 s0, s0, 0x50
-; GFX950-SDAG-NEXT:    s_addc_u32 s1, s1, 0
 ; GFX950-SDAG-NEXT:    s_mov_b64 s[2:3], src_private_base
+; GFX950-SDAG-NEXT:    s_addc_u32 s1, s1, 0
 ; GFX950-SDAG-NEXT:    s_cmp_eq_u32 s1, s3
 ; GFX950-SDAG-NEXT:    s_cselect_b64 s[2:3], -1, 0
 ; GFX950-SDAG-NEXT:    s_andn2_b64 vcc, exec, s[2:3]
@@ -12841,8 +12828,8 @@ define void @flat_atomic_fmin_f64_saddr_nortn(ptr inreg %ptr, double %data) {
 ; GFX950-GISEL:       ; %bb.0:
 ; GFX950-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX950-GISEL-NEXT:    s_add_u32 s2, s0, 0x50
-; GFX950-GISEL-NEXT:    s_addc_u32 s3, s1, 0
 ; GFX950-GISEL-NEXT:    s_mov_b64 s[4:5], src_private_base
+; GFX950-GISEL-NEXT:    s_addc_u32 s3, s1, 0
 ; GFX950-GISEL-NEXT:    s_cmp_lg_u32 s3, s5
 ; GFX950-GISEL-NEXT:    s_mov_b32 s4, 1
 ; GFX950-GISEL-NEXT:    s_cbranch_scc0 .LBB115_2

diff  --git a/llvm/test/CodeGen/AMDGPU/flat_atomics_i64.ll b/llvm/test/CodeGen/AMDGPU/flat_atomics_i64.ll
index d9a596283db1e..1f105e8dd8ba5 100644
--- a/llvm/test/CodeGen/AMDGPU/flat_atomics_i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/flat_atomics_i64.ll
@@ -106,11 +106,11 @@ define amdgpu_kernel void @atomic_add_i64_offset(ptr %out, i64 %in) {
 ; GFX12-LABEL: atomic_add_i64_offset:
 ; GFX12:       ; %bb.0: ; %entry
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
-; GFX12-NEXT:    s_mov_b64 s[4:5], src_private_base
+; GFX12-NEXT:    s_mov_b64 s[6:7], src_private_base
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[0:1], 32
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX12-NEXT:    s_cmp_eq_u32 s1, s5
+; GFX12-NEXT:    s_cmp_eq_u32 s1, s7
 ; GFX12-NEXT:    s_cselect_b32 s4, -1, 0
 ; GFX12-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s4
 ; GFX12-NEXT:    s_mov_b32 s4, -1
@@ -401,17 +401,17 @@ define amdgpu_kernel void @atomic_add_i64_addr64_offset(ptr %out, i64 %in, i64 %
 ; GFX12-LABEL: atomic_add_i64_addr64_offset:
 ; GFX12:       ; %bb.0: ; %entry
 ; GFX12-NEXT:    s_clause 0x1
-; GFX12-NEXT:    s_load_b64 s[6:7], s[4:5], 0x34
+; GFX12-NEXT:    s_load_b64 s[8:9], s[4:5], 0x34
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-NEXT:    s_mov_b64 s[6:7], src_private_base
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    s_lshl_b64 s[4:5], s[6:7], 3
-; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT:    s_lshl_b64 s[4:5], s[8:9], 3
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[0:1], s[4:5]
-; GFX12-NEXT:    s_mov_b64 s[4:5], src_private_base
 ; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[0:1], 32
-; GFX12-NEXT:    s_cmp_eq_u32 s1, s5
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT:    s_cmp_eq_u32 s1, s7
 ; GFX12-NEXT:    s_cselect_b32 s4, -1, 0
-; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s4
 ; GFX12-NEXT:    s_mov_b32 s4, -1
 ; GFX12-NEXT:    s_cbranch_vccnz .LBB2_3
@@ -555,15 +555,15 @@ define amdgpu_kernel void @atomic_add_i64_ret_addr64_offset(ptr %out, ptr %out2,
 ; GFX12-LABEL: atomic_add_i64_ret_addr64_offset:
 ; GFX12:       ; %bb.0: ; %entry
 ; GFX12-NEXT:    s_load_b256 s[0:7], s[4:5], 0x24
+; GFX12-NEXT:    s_mov_b64 s[8:9], src_private_base
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    s_lshl_b64 s[6:7], s[6:7], 3
-; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[0:1], s[6:7]
-; GFX12-NEXT:    s_mov_b64 s[6:7], src_private_base
 ; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[0:1], 32
-; GFX12-NEXT:    s_cmp_eq_u32 s1, s7
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT:    s_cmp_eq_u32 s1, s9
 ; GFX12-NEXT:    s_cselect_b32 s6, -1, 0
-; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s6
 ; GFX12-NEXT:    s_cbranch_vccz .LBB3_2
 ; GFX12-NEXT:  ; %bb.1: ; %atomicrmw.global
@@ -696,9 +696,9 @@ define amdgpu_kernel void @atomic_add_i64(ptr %out, i64 %in) {
 ; GFX12-LABEL: atomic_add_i64:
 ; GFX12:       ; %bb.0: ; %entry
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
-; GFX12-NEXT:    s_mov_b64 s[4:5], src_private_base
+; GFX12-NEXT:    s_mov_b64 s[6:7], src_private_base
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    s_cmp_eq_u32 s1, s5
+; GFX12-NEXT:    s_cmp_eq_u32 s1, s7
 ; GFX12-NEXT:    s_cselect_b32 s4, -1, 0
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s4
@@ -979,15 +979,16 @@ define amdgpu_kernel void @atomic_add_i64_addr64(ptr %out, i64 %in, i64 %index)
 ; GFX12-LABEL: atomic_add_i64_addr64:
 ; GFX12:       ; %bb.0: ; %entry
 ; GFX12-NEXT:    s_clause 0x1
-; GFX12-NEXT:    s_load_b64 s[6:7], s[4:5], 0x34
+; GFX12-NEXT:    s_load_b64 s[8:9], s[4:5], 0x34
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
-; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    s_lshl_b64 s[4:5], s[6:7], 3
 ; GFX12-NEXT:    s_mov_b64 s[6:7], src_private_base
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    s_lshl_b64 s[4:5], s[8:9], 3
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[0:1], s[4:5]
-; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_cmp_eq_u32 s1, s7
 ; GFX12-NEXT:    s_cselect_b32 s4, -1, 0
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s4
 ; GFX12-NEXT:    s_mov_b32 s4, -1
 ; GFX12-NEXT:    s_cbranch_vccnz .LBB6_3
@@ -1267,11 +1268,11 @@ define amdgpu_kernel void @atomic_and_i64_offset(ptr %out, i64 %in) {
 ; GFX12-LABEL: atomic_and_i64_offset:
 ; GFX12:       ; %bb.0: ; %entry
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
-; GFX12-NEXT:    s_mov_b64 s[4:5], src_private_base
+; GFX12-NEXT:    s_mov_b64 s[6:7], src_private_base
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[0:1], 32
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX12-NEXT:    s_cmp_eq_u32 s1, s5
+; GFX12-NEXT:    s_cmp_eq_u32 s1, s7
 ; GFX12-NEXT:    s_cselect_b32 s4, -1, 0
 ; GFX12-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s4
 ; GFX12-NEXT:    s_mov_b32 s4, -1
@@ -1556,17 +1557,17 @@ define amdgpu_kernel void @atomic_and_i64_addr64_offset(ptr %out, i64 %in, i64 %
 ; GFX12-LABEL: atomic_and_i64_addr64_offset:
 ; GFX12:       ; %bb.0: ; %entry
 ; GFX12-NEXT:    s_clause 0x1
-; GFX12-NEXT:    s_load_b64 s[6:7], s[4:5], 0x34
+; GFX12-NEXT:    s_load_b64 s[8:9], s[4:5], 0x34
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-NEXT:    s_mov_b64 s[6:7], src_private_base
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    s_lshl_b64 s[4:5], s[6:7], 3
-; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT:    s_lshl_b64 s[4:5], s[8:9], 3
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[0:1], s[4:5]
-; GFX12-NEXT:    s_mov_b64 s[4:5], src_private_base
 ; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[0:1], 32
-; GFX12-NEXT:    s_cmp_eq_u32 s1, s5
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT:    s_cmp_eq_u32 s1, s7
 ; GFX12-NEXT:    s_cselect_b32 s4, -1, 0
-; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s4
 ; GFX12-NEXT:    s_mov_b32 s4, -1
 ; GFX12-NEXT:    s_cbranch_vccnz .LBB10_3
@@ -1707,15 +1708,15 @@ define amdgpu_kernel void @atomic_and_i64_ret_addr64_offset(ptr %out, ptr %out2,
 ; GFX12-LABEL: atomic_and_i64_ret_addr64_offset:
 ; GFX12:       ; %bb.0: ; %entry
 ; GFX12-NEXT:    s_load_b256 s[0:7], s[4:5], 0x24
+; GFX12-NEXT:    s_mov_b64 s[8:9], src_private_base
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    s_lshl_b64 s[6:7], s[6:7], 3
-; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[0:1], s[6:7]
-; GFX12-NEXT:    s_mov_b64 s[6:7], src_private_base
 ; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[0:1], 32
-; GFX12-NEXT:    s_cmp_eq_u32 s1, s7
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT:    s_cmp_eq_u32 s1, s9
 ; GFX12-NEXT:    s_cselect_b32 s6, -1, 0
-; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s6
 ; GFX12-NEXT:    s_cbranch_vccz .LBB11_2
 ; GFX12-NEXT:  ; %bb.1: ; %atomicrmw.global
@@ -1845,9 +1846,9 @@ define amdgpu_kernel void @atomic_and_i64(ptr %out, i64 %in) {
 ; GFX12-LABEL: atomic_and_i64:
 ; GFX12:       ; %bb.0: ; %entry
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
-; GFX12-NEXT:    s_mov_b64 s[4:5], src_private_base
+; GFX12-NEXT:    s_mov_b64 s[6:7], src_private_base
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    s_cmp_eq_u32 s1, s5
+; GFX12-NEXT:    s_cmp_eq_u32 s1, s7
 ; GFX12-NEXT:    s_cselect_b32 s4, -1, 0
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s4
@@ -2122,15 +2123,16 @@ define amdgpu_kernel void @atomic_and_i64_addr64(ptr %out, i64 %in, i64 %index)
 ; GFX12-LABEL: atomic_and_i64_addr64:
 ; GFX12:       ; %bb.0: ; %entry
 ; GFX12-NEXT:    s_clause 0x1
-; GFX12-NEXT:    s_load_b64 s[6:7], s[4:5], 0x34
+; GFX12-NEXT:    s_load_b64 s[8:9], s[4:5], 0x34
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
-; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    s_lshl_b64 s[4:5], s[6:7], 3
 ; GFX12-NEXT:    s_mov_b64 s[6:7], src_private_base
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    s_lshl_b64 s[4:5], s[8:9], 3
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[0:1], s[4:5]
-; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_cmp_eq_u32 s1, s7
 ; GFX12-NEXT:    s_cselect_b32 s4, -1, 0
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s4
 ; GFX12-NEXT:    s_mov_b32 s4, -1
 ; GFX12-NEXT:    s_cbranch_vccnz .LBB14_3
@@ -2408,11 +2410,11 @@ define amdgpu_kernel void @atomic_sub_i64_offset(ptr %out, i64 %in) {
 ; GFX12-LABEL: atomic_sub_i64_offset:
 ; GFX12:       ; %bb.0: ; %entry
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
-; GFX12-NEXT:    s_mov_b64 s[4:5], src_private_base
+; GFX12-NEXT:    s_mov_b64 s[6:7], src_private_base
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[0:1], 32
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX12-NEXT:    s_cmp_eq_u32 s1, s5
+; GFX12-NEXT:    s_cmp_eq_u32 s1, s7
 ; GFX12-NEXT:    s_cselect_b32 s4, -1, 0
 ; GFX12-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s4
 ; GFX12-NEXT:    s_mov_b32 s4, -1
@@ -2703,17 +2705,17 @@ define amdgpu_kernel void @atomic_sub_i64_addr64_offset(ptr %out, i64 %in, i64 %
 ; GFX12-LABEL: atomic_sub_i64_addr64_offset:
 ; GFX12:       ; %bb.0: ; %entry
 ; GFX12-NEXT:    s_clause 0x1
-; GFX12-NEXT:    s_load_b64 s[6:7], s[4:5], 0x34
+; GFX12-NEXT:    s_load_b64 s[8:9], s[4:5], 0x34
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-NEXT:    s_mov_b64 s[6:7], src_private_base
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    s_lshl_b64 s[4:5], s[6:7], 3
-; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT:    s_lshl_b64 s[4:5], s[8:9], 3
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[0:1], s[4:5]
-; GFX12-NEXT:    s_mov_b64 s[4:5], src_private_base
 ; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[0:1], 32
-; GFX12-NEXT:    s_cmp_eq_u32 s1, s5
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT:    s_cmp_eq_u32 s1, s7
 ; GFX12-NEXT:    s_cselect_b32 s4, -1, 0
-; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s4
 ; GFX12-NEXT:    s_mov_b32 s4, -1
 ; GFX12-NEXT:    s_cbranch_vccnz .LBB18_3
@@ -2857,15 +2859,15 @@ define amdgpu_kernel void @atomic_sub_i64_ret_addr64_offset(ptr %out, ptr %out2,
 ; GFX12-LABEL: atomic_sub_i64_ret_addr64_offset:
 ; GFX12:       ; %bb.0: ; %entry
 ; GFX12-NEXT:    s_load_b256 s[0:7], s[4:5], 0x24
+; GFX12-NEXT:    s_mov_b64 s[8:9], src_private_base
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    s_lshl_b64 s[6:7], s[6:7], 3
-; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[0:1], s[6:7]
-; GFX12-NEXT:    s_mov_b64 s[6:7], src_private_base
 ; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[0:1], 32
-; GFX12-NEXT:    s_cmp_eq_u32 s1, s7
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT:    s_cmp_eq_u32 s1, s9
 ; GFX12-NEXT:    s_cselect_b32 s6, -1, 0
-; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s6
 ; GFX12-NEXT:    s_cbranch_vccz .LBB19_2
 ; GFX12-NEXT:  ; %bb.1: ; %atomicrmw.global
@@ -2998,9 +3000,9 @@ define amdgpu_kernel void @atomic_sub_i64(ptr %out, i64 %in) {
 ; GFX12-LABEL: atomic_sub_i64:
 ; GFX12:       ; %bb.0: ; %entry
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
-; GFX12-NEXT:    s_mov_b64 s[4:5], src_private_base
+; GFX12-NEXT:    s_mov_b64 s[6:7], src_private_base
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    s_cmp_eq_u32 s1, s5
+; GFX12-NEXT:    s_cmp_eq_u32 s1, s7
 ; GFX12-NEXT:    s_cselect_b32 s4, -1, 0
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s4
@@ -3281,15 +3283,16 @@ define amdgpu_kernel void @atomic_sub_i64_addr64(ptr %out, i64 %in, i64 %index)
 ; GFX12-LABEL: atomic_sub_i64_addr64:
 ; GFX12:       ; %bb.0: ; %entry
 ; GFX12-NEXT:    s_clause 0x1
-; GFX12-NEXT:    s_load_b64 s[6:7], s[4:5], 0x34
+; GFX12-NEXT:    s_load_b64 s[8:9], s[4:5], 0x34
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
-; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    s_lshl_b64 s[4:5], s[6:7], 3
 ; GFX12-NEXT:    s_mov_b64 s[6:7], src_private_base
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    s_lshl_b64 s[4:5], s[8:9], 3
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[0:1], s[4:5]
-; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_cmp_eq_u32 s1, s7
 ; GFX12-NEXT:    s_cselect_b32 s4, -1, 0
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s4
 ; GFX12-NEXT:    s_mov_b32 s4, -1
 ; GFX12-NEXT:    s_cbranch_vccnz .LBB22_3
@@ -3571,11 +3574,11 @@ define amdgpu_kernel void @atomic_max_i64_offset(ptr %out, i64 %in) {
 ; GFX12-LABEL: atomic_max_i64_offset:
 ; GFX12:       ; %bb.0: ; %entry
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
-; GFX12-NEXT:    s_mov_b64 s[4:5], src_private_base
+; GFX12-NEXT:    s_mov_b64 s[6:7], src_private_base
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[0:1], 32
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX12-NEXT:    s_cmp_eq_u32 s1, s5
+; GFX12-NEXT:    s_cmp_eq_u32 s1, s7
 ; GFX12-NEXT:    s_cselect_b32 s4, -1, 0
 ; GFX12-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s4
 ; GFX12-NEXT:    s_mov_b32 s4, -1
@@ -3868,17 +3871,17 @@ define amdgpu_kernel void @atomic_max_i64_addr64_offset(ptr %out, i64 %in, i64 %
 ; GFX12-LABEL: atomic_max_i64_addr64_offset:
 ; GFX12:       ; %bb.0: ; %entry
 ; GFX12-NEXT:    s_clause 0x1
-; GFX12-NEXT:    s_load_b64 s[6:7], s[4:5], 0x34
+; GFX12-NEXT:    s_load_b64 s[8:9], s[4:5], 0x34
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-NEXT:    s_mov_b64 s[6:7], src_private_base
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    s_lshl_b64 s[4:5], s[6:7], 3
-; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT:    s_lshl_b64 s[4:5], s[8:9], 3
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[0:1], s[4:5]
-; GFX12-NEXT:    s_mov_b64 s[4:5], src_private_base
 ; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[0:1], 32
-; GFX12-NEXT:    s_cmp_eq_u32 s1, s5
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT:    s_cmp_eq_u32 s1, s7
 ; GFX12-NEXT:    s_cselect_b32 s4, -1, 0
-; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s4
 ; GFX12-NEXT:    s_mov_b32 s4, -1
 ; GFX12-NEXT:    s_cbranch_vccnz .LBB26_3
@@ -4024,15 +4027,15 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64_offset(ptr %out, ptr %out2,
 ; GFX12-LABEL: atomic_max_i64_ret_addr64_offset:
 ; GFX12:       ; %bb.0: ; %entry
 ; GFX12-NEXT:    s_load_b256 s[0:7], s[4:5], 0x24
+; GFX12-NEXT:    s_mov_b64 s[8:9], src_private_base
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    s_lshl_b64 s[6:7], s[6:7], 3
-; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[0:1], s[6:7]
-; GFX12-NEXT:    s_mov_b64 s[6:7], src_private_base
 ; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[0:1], 32
-; GFX12-NEXT:    s_cmp_eq_u32 s1, s7
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT:    s_cmp_eq_u32 s1, s9
 ; GFX12-NEXT:    s_cselect_b32 s6, -1, 0
-; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s6
 ; GFX12-NEXT:    s_cbranch_vccz .LBB27_2
 ; GFX12-NEXT:  ; %bb.1: ; %atomicrmw.global
@@ -4165,9 +4168,9 @@ define amdgpu_kernel void @atomic_max_i64(ptr %out, i64 %in) {
 ; GFX12-LABEL: atomic_max_i64:
 ; GFX12:       ; %bb.0: ; %entry
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
-; GFX12-NEXT:    s_mov_b64 s[4:5], src_private_base
+; GFX12-NEXT:    s_mov_b64 s[6:7], src_private_base
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    s_cmp_eq_u32 s1, s5
+; GFX12-NEXT:    s_cmp_eq_u32 s1, s7
 ; GFX12-NEXT:    s_cselect_b32 s4, -1, 0
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s4
@@ -4450,15 +4453,16 @@ define amdgpu_kernel void @atomic_max_i64_addr64(ptr %out, i64 %in, i64 %index)
 ; GFX12-LABEL: atomic_max_i64_addr64:
 ; GFX12:       ; %bb.0: ; %entry
 ; GFX12-NEXT:    s_clause 0x1
-; GFX12-NEXT:    s_load_b64 s[6:7], s[4:5], 0x34
+; GFX12-NEXT:    s_load_b64 s[8:9], s[4:5], 0x34
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
-; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    s_lshl_b64 s[4:5], s[6:7], 3
 ; GFX12-NEXT:    s_mov_b64 s[6:7], src_private_base
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    s_lshl_b64 s[4:5], s[8:9], 3
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[0:1], s[4:5]
-; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_cmp_eq_u32 s1, s7
 ; GFX12-NEXT:    s_cselect_b32 s4, -1, 0
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s4
 ; GFX12-NEXT:    s_mov_b32 s4, -1
 ; GFX12-NEXT:    s_cbranch_vccnz .LBB30_3
@@ -4742,11 +4746,11 @@ define amdgpu_kernel void @atomic_umax_i64_offset(ptr %out, i64 %in) {
 ; GFX12-LABEL: atomic_umax_i64_offset:
 ; GFX12:       ; %bb.0: ; %entry
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
-; GFX12-NEXT:    s_mov_b64 s[4:5], src_private_base
+; GFX12-NEXT:    s_mov_b64 s[6:7], src_private_base
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[0:1], 32
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX12-NEXT:    s_cmp_eq_u32 s1, s5
+; GFX12-NEXT:    s_cmp_eq_u32 s1, s7
 ; GFX12-NEXT:    s_cselect_b32 s4, -1, 0
 ; GFX12-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s4
 ; GFX12-NEXT:    s_mov_b32 s4, -1
@@ -5039,17 +5043,17 @@ define amdgpu_kernel void @atomic_umax_i64_addr64_offset(ptr %out, i64 %in, i64
 ; GFX12-LABEL: atomic_umax_i64_addr64_offset:
 ; GFX12:       ; %bb.0: ; %entry
 ; GFX12-NEXT:    s_clause 0x1
-; GFX12-NEXT:    s_load_b64 s[6:7], s[4:5], 0x34
+; GFX12-NEXT:    s_load_b64 s[8:9], s[4:5], 0x34
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-NEXT:    s_mov_b64 s[6:7], src_private_base
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    s_lshl_b64 s[4:5], s[6:7], 3
-; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT:    s_lshl_b64 s[4:5], s[8:9], 3
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[0:1], s[4:5]
-; GFX12-NEXT:    s_mov_b64 s[4:5], src_private_base
 ; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[0:1], 32
-; GFX12-NEXT:    s_cmp_eq_u32 s1, s5
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT:    s_cmp_eq_u32 s1, s7
 ; GFX12-NEXT:    s_cselect_b32 s4, -1, 0
-; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s4
 ; GFX12-NEXT:    s_mov_b32 s4, -1
 ; GFX12-NEXT:    s_cbranch_vccnz .LBB34_3
@@ -5195,15 +5199,15 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64_offset(ptr %out, ptr %out2
 ; GFX12-LABEL: atomic_umax_i64_ret_addr64_offset:
 ; GFX12:       ; %bb.0: ; %entry
 ; GFX12-NEXT:    s_load_b256 s[0:7], s[4:5], 0x24
+; GFX12-NEXT:    s_mov_b64 s[8:9], src_private_base
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    s_lshl_b64 s[6:7], s[6:7], 3
-; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[0:1], s[6:7]
-; GFX12-NEXT:    s_mov_b64 s[6:7], src_private_base
 ; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[0:1], 32
-; GFX12-NEXT:    s_cmp_eq_u32 s1, s7
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT:    s_cmp_eq_u32 s1, s9
 ; GFX12-NEXT:    s_cselect_b32 s6, -1, 0
-; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s6
 ; GFX12-NEXT:    s_cbranch_vccz .LBB35_2
 ; GFX12-NEXT:  ; %bb.1: ; %atomicrmw.global
@@ -5336,9 +5340,9 @@ define amdgpu_kernel void @atomic_umax_i64(ptr %out, i64 %in) {
 ; GFX12-LABEL: atomic_umax_i64:
 ; GFX12:       ; %bb.0: ; %entry
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
-; GFX12-NEXT:    s_mov_b64 s[4:5], src_private_base
+; GFX12-NEXT:    s_mov_b64 s[6:7], src_private_base
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    s_cmp_eq_u32 s1, s5
+; GFX12-NEXT:    s_cmp_eq_u32 s1, s7
 ; GFX12-NEXT:    s_cselect_b32 s4, -1, 0
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s4
@@ -5621,15 +5625,16 @@ define amdgpu_kernel void @atomic_umax_i64_addr64(ptr %out, i64 %in, i64 %index)
 ; GFX12-LABEL: atomic_umax_i64_addr64:
 ; GFX12:       ; %bb.0: ; %entry
 ; GFX12-NEXT:    s_clause 0x1
-; GFX12-NEXT:    s_load_b64 s[6:7], s[4:5], 0x34
+; GFX12-NEXT:    s_load_b64 s[8:9], s[4:5], 0x34
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
-; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    s_lshl_b64 s[4:5], s[6:7], 3
 ; GFX12-NEXT:    s_mov_b64 s[6:7], src_private_base
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    s_lshl_b64 s[4:5], s[8:9], 3
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[0:1], s[4:5]
-; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_cmp_eq_u32 s1, s7
 ; GFX12-NEXT:    s_cselect_b32 s4, -1, 0
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s4
 ; GFX12-NEXT:    s_mov_b32 s4, -1
 ; GFX12-NEXT:    s_cbranch_vccnz .LBB38_3
@@ -5913,11 +5918,11 @@ define amdgpu_kernel void @atomic_min_i64_offset(ptr %out, i64 %in) {
 ; GFX12-LABEL: atomic_min_i64_offset:
 ; GFX12:       ; %bb.0: ; %entry
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
-; GFX12-NEXT:    s_mov_b64 s[4:5], src_private_base
+; GFX12-NEXT:    s_mov_b64 s[6:7], src_private_base
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[0:1], 32
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX12-NEXT:    s_cmp_eq_u32 s1, s5
+; GFX12-NEXT:    s_cmp_eq_u32 s1, s7
 ; GFX12-NEXT:    s_cselect_b32 s4, -1, 0
 ; GFX12-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s4
 ; GFX12-NEXT:    s_mov_b32 s4, -1
@@ -6210,17 +6215,17 @@ define amdgpu_kernel void @atomic_min_i64_addr64_offset(ptr %out, i64 %in, i64 %
 ; GFX12-LABEL: atomic_min_i64_addr64_offset:
 ; GFX12:       ; %bb.0: ; %entry
 ; GFX12-NEXT:    s_clause 0x1
-; GFX12-NEXT:    s_load_b64 s[6:7], s[4:5], 0x34
+; GFX12-NEXT:    s_load_b64 s[8:9], s[4:5], 0x34
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-NEXT:    s_mov_b64 s[6:7], src_private_base
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    s_lshl_b64 s[4:5], s[6:7], 3
-; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT:    s_lshl_b64 s[4:5], s[8:9], 3
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[0:1], s[4:5]
-; GFX12-NEXT:    s_mov_b64 s[4:5], src_private_base
 ; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[0:1], 32
-; GFX12-NEXT:    s_cmp_eq_u32 s1, s5
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT:    s_cmp_eq_u32 s1, s7
 ; GFX12-NEXT:    s_cselect_b32 s4, -1, 0
-; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s4
 ; GFX12-NEXT:    s_mov_b32 s4, -1
 ; GFX12-NEXT:    s_cbranch_vccnz .LBB42_3
@@ -6366,15 +6371,15 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64_offset(ptr %out, ptr %out2,
 ; GFX12-LABEL: atomic_min_i64_ret_addr64_offset:
 ; GFX12:       ; %bb.0: ; %entry
 ; GFX12-NEXT:    s_load_b256 s[0:7], s[4:5], 0x24
+; GFX12-NEXT:    s_mov_b64 s[8:9], src_private_base
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    s_lshl_b64 s[6:7], s[6:7], 3
-; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[0:1], s[6:7]
-; GFX12-NEXT:    s_mov_b64 s[6:7], src_private_base
 ; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[0:1], 32
-; GFX12-NEXT:    s_cmp_eq_u32 s1, s7
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT:    s_cmp_eq_u32 s1, s9
 ; GFX12-NEXT:    s_cselect_b32 s6, -1, 0
-; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s6
 ; GFX12-NEXT:    s_cbranch_vccz .LBB43_2
 ; GFX12-NEXT:  ; %bb.1: ; %atomicrmw.global
@@ -6507,9 +6512,9 @@ define amdgpu_kernel void @atomic_min_i64(ptr %out, i64 %in) {
 ; GFX12-LABEL: atomic_min_i64:
 ; GFX12:       ; %bb.0: ; %entry
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
-; GFX12-NEXT:    s_mov_b64 s[4:5], src_private_base
+; GFX12-NEXT:    s_mov_b64 s[6:7], src_private_base
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    s_cmp_eq_u32 s1, s5
+; GFX12-NEXT:    s_cmp_eq_u32 s1, s7
 ; GFX12-NEXT:    s_cselect_b32 s4, -1, 0
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s4
@@ -6792,15 +6797,16 @@ define amdgpu_kernel void @atomic_min_i64_addr64(ptr %out, i64 %in, i64 %index)
 ; GFX12-LABEL: atomic_min_i64_addr64:
 ; GFX12:       ; %bb.0: ; %entry
 ; GFX12-NEXT:    s_clause 0x1
-; GFX12-NEXT:    s_load_b64 s[6:7], s[4:5], 0x34
+; GFX12-NEXT:    s_load_b64 s[8:9], s[4:5], 0x34
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
-; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    s_lshl_b64 s[4:5], s[6:7], 3
 ; GFX12-NEXT:    s_mov_b64 s[6:7], src_private_base
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    s_lshl_b64 s[4:5], s[8:9], 3
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[0:1], s[4:5]
-; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_cmp_eq_u32 s1, s7
 ; GFX12-NEXT:    s_cselect_b32 s4, -1, 0
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s4
 ; GFX12-NEXT:    s_mov_b32 s4, -1
 ; GFX12-NEXT:    s_cbranch_vccnz .LBB46_3
@@ -7084,11 +7090,11 @@ define amdgpu_kernel void @atomic_umin_i64_offset(ptr %out, i64 %in) {
 ; GFX12-LABEL: atomic_umin_i64_offset:
 ; GFX12:       ; %bb.0: ; %entry
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
-; GFX12-NEXT:    s_mov_b64 s[4:5], src_private_base
+; GFX12-NEXT:    s_mov_b64 s[6:7], src_private_base
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[0:1], 32
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX12-NEXT:    s_cmp_eq_u32 s1, s5
+; GFX12-NEXT:    s_cmp_eq_u32 s1, s7
 ; GFX12-NEXT:    s_cselect_b32 s4, -1, 0
 ; GFX12-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s4
 ; GFX12-NEXT:    s_mov_b32 s4, -1
@@ -7381,17 +7387,17 @@ define amdgpu_kernel void @atomic_umin_i64_addr64_offset(ptr %out, i64 %in, i64
 ; GFX12-LABEL: atomic_umin_i64_addr64_offset:
 ; GFX12:       ; %bb.0: ; %entry
 ; GFX12-NEXT:    s_clause 0x1
-; GFX12-NEXT:    s_load_b64 s[6:7], s[4:5], 0x34
+; GFX12-NEXT:    s_load_b64 s[8:9], s[4:5], 0x34
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-NEXT:    s_mov_b64 s[6:7], src_private_base
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    s_lshl_b64 s[4:5], s[6:7], 3
-; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT:    s_lshl_b64 s[4:5], s[8:9], 3
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[0:1], s[4:5]
-; GFX12-NEXT:    s_mov_b64 s[4:5], src_private_base
 ; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[0:1], 32
-; GFX12-NEXT:    s_cmp_eq_u32 s1, s5
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT:    s_cmp_eq_u32 s1, s7
 ; GFX12-NEXT:    s_cselect_b32 s4, -1, 0
-; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s4
 ; GFX12-NEXT:    s_mov_b32 s4, -1
 ; GFX12-NEXT:    s_cbranch_vccnz .LBB50_3
@@ -7537,15 +7543,15 @@ define amdgpu_kernel void @atomic_umin_i64_ret_addr64_offset(ptr %out, ptr %out2
 ; GFX12-LABEL: atomic_umin_i64_ret_addr64_offset:
 ; GFX12:       ; %bb.0: ; %entry
 ; GFX12-NEXT:    s_load_b256 s[0:7], s[4:5], 0x24
+; GFX12-NEXT:    s_mov_b64 s[8:9], src_private_base
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    s_lshl_b64 s[6:7], s[6:7], 3
-; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[0:1], s[6:7]
-; GFX12-NEXT:    s_mov_b64 s[6:7], src_private_base
 ; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[0:1], 32
-; GFX12-NEXT:    s_cmp_eq_u32 s1, s7
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT:    s_cmp_eq_u32 s1, s9
 ; GFX12-NEXT:    s_cselect_b32 s6, -1, 0
-; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s6
 ; GFX12-NEXT:    s_cbranch_vccz .LBB51_2
 ; GFX12-NEXT:  ; %bb.1: ; %atomicrmw.global
@@ -7678,9 +7684,9 @@ define amdgpu_kernel void @atomic_umin_i64(ptr %out, i64 %in) {
 ; GFX12-LABEL: atomic_umin_i64:
 ; GFX12:       ; %bb.0: ; %entry
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
-; GFX12-NEXT:    s_mov_b64 s[4:5], src_private_base
+; GFX12-NEXT:    s_mov_b64 s[6:7], src_private_base
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    s_cmp_eq_u32 s1, s5
+; GFX12-NEXT:    s_cmp_eq_u32 s1, s7
 ; GFX12-NEXT:    s_cselect_b32 s4, -1, 0
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s4
@@ -7963,15 +7969,16 @@ define amdgpu_kernel void @atomic_umin_i64_addr64(ptr %out, i64 %in, i64 %index)
 ; GFX12-LABEL: atomic_umin_i64_addr64:
 ; GFX12:       ; %bb.0: ; %entry
 ; GFX12-NEXT:    s_clause 0x1
-; GFX12-NEXT:    s_load_b64 s[6:7], s[4:5], 0x34
+; GFX12-NEXT:    s_load_b64 s[8:9], s[4:5], 0x34
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
-; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    s_lshl_b64 s[4:5], s[6:7], 3
 ; GFX12-NEXT:    s_mov_b64 s[6:7], src_private_base
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    s_lshl_b64 s[4:5], s[8:9], 3
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[0:1], s[4:5]
-; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_cmp_eq_u32 s1, s7
 ; GFX12-NEXT:    s_cselect_b32 s4, -1, 0
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s4
 ; GFX12-NEXT:    s_mov_b32 s4, -1
 ; GFX12-NEXT:    s_cbranch_vccnz .LBB54_3
@@ -8253,11 +8260,11 @@ define amdgpu_kernel void @atomic_or_i64_offset(ptr %out, i64 %in) {
 ; GFX12-LABEL: atomic_or_i64_offset:
 ; GFX12:       ; %bb.0: ; %entry
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
-; GFX12-NEXT:    s_mov_b64 s[4:5], src_private_base
+; GFX12-NEXT:    s_mov_b64 s[6:7], src_private_base
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[0:1], 32
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX12-NEXT:    s_cmp_eq_u32 s1, s5
+; GFX12-NEXT:    s_cmp_eq_u32 s1, s7
 ; GFX12-NEXT:    s_cselect_b32 s4, -1, 0
 ; GFX12-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s4
 ; GFX12-NEXT:    s_mov_b32 s4, -1
@@ -8542,17 +8549,17 @@ define amdgpu_kernel void @atomic_or_i64_addr64_offset(ptr %out, i64 %in, i64 %i
 ; GFX12-LABEL: atomic_or_i64_addr64_offset:
 ; GFX12:       ; %bb.0: ; %entry
 ; GFX12-NEXT:    s_clause 0x1
-; GFX12-NEXT:    s_load_b64 s[6:7], s[4:5], 0x34
+; GFX12-NEXT:    s_load_b64 s[8:9], s[4:5], 0x34
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-NEXT:    s_mov_b64 s[6:7], src_private_base
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    s_lshl_b64 s[4:5], s[6:7], 3
-; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT:    s_lshl_b64 s[4:5], s[8:9], 3
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[0:1], s[4:5]
-; GFX12-NEXT:    s_mov_b64 s[4:5], src_private_base
 ; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[0:1], 32
-; GFX12-NEXT:    s_cmp_eq_u32 s1, s5
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT:    s_cmp_eq_u32 s1, s7
 ; GFX12-NEXT:    s_cselect_b32 s4, -1, 0
-; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s4
 ; GFX12-NEXT:    s_mov_b32 s4, -1
 ; GFX12-NEXT:    s_cbranch_vccnz .LBB58_3
@@ -8693,15 +8700,15 @@ define amdgpu_kernel void @atomic_or_i64_ret_addr64_offset(ptr %out, ptr %out2,
 ; GFX12-LABEL: atomic_or_i64_ret_addr64_offset:
 ; GFX12:       ; %bb.0: ; %entry
 ; GFX12-NEXT:    s_load_b256 s[0:7], s[4:5], 0x24
+; GFX12-NEXT:    s_mov_b64 s[8:9], src_private_base
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    s_lshl_b64 s[6:7], s[6:7], 3
-; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[0:1], s[6:7]
-; GFX12-NEXT:    s_mov_b64 s[6:7], src_private_base
 ; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[0:1], 32
-; GFX12-NEXT:    s_cmp_eq_u32 s1, s7
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT:    s_cmp_eq_u32 s1, s9
 ; GFX12-NEXT:    s_cselect_b32 s6, -1, 0
-; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s6
 ; GFX12-NEXT:    s_cbranch_vccz .LBB59_2
 ; GFX12-NEXT:  ; %bb.1: ; %atomicrmw.global
@@ -8831,9 +8838,9 @@ define amdgpu_kernel void @atomic_or_i64(ptr %out, i64 %in) {
 ; GFX12-LABEL: atomic_or_i64:
 ; GFX12:       ; %bb.0: ; %entry
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
-; GFX12-NEXT:    s_mov_b64 s[4:5], src_private_base
+; GFX12-NEXT:    s_mov_b64 s[6:7], src_private_base
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    s_cmp_eq_u32 s1, s5
+; GFX12-NEXT:    s_cmp_eq_u32 s1, s7
 ; GFX12-NEXT:    s_cselect_b32 s4, -1, 0
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s4
@@ -9108,15 +9115,16 @@ define amdgpu_kernel void @atomic_or_i64_addr64(ptr %out, i64 %in, i64 %index) {
 ; GFX12-LABEL: atomic_or_i64_addr64:
 ; GFX12:       ; %bb.0: ; %entry
 ; GFX12-NEXT:    s_clause 0x1
-; GFX12-NEXT:    s_load_b64 s[6:7], s[4:5], 0x34
+; GFX12-NEXT:    s_load_b64 s[8:9], s[4:5], 0x34
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
-; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    s_lshl_b64 s[4:5], s[6:7], 3
 ; GFX12-NEXT:    s_mov_b64 s[6:7], src_private_base
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    s_lshl_b64 s[4:5], s[8:9], 3
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[0:1], s[4:5]
-; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_cmp_eq_u32 s1, s7
 ; GFX12-NEXT:    s_cselect_b32 s4, -1, 0
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s4
 ; GFX12-NEXT:    s_mov_b32 s4, -1
 ; GFX12-NEXT:    s_cbranch_vccnz .LBB62_3
@@ -9384,11 +9392,11 @@ define amdgpu_kernel void @atomic_xchg_i64_offset(ptr %out, i64 %in) {
 ; GFX12-LABEL: atomic_xchg_i64_offset:
 ; GFX12:       ; %bb.0: ; %entry
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
-; GFX12-NEXT:    s_mov_b64 s[4:5], src_private_base
+; GFX12-NEXT:    s_mov_b64 s[6:7], src_private_base
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[0:1], 32
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX12-NEXT:    s_cmp_eq_u32 s1, s5
+; GFX12-NEXT:    s_cmp_eq_u32 s1, s7
 ; GFX12-NEXT:    s_cselect_b32 s4, -1, 0
 ; GFX12-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s4
 ; GFX12-NEXT:    s_mov_b32 s4, -1
@@ -9510,11 +9518,11 @@ define amdgpu_kernel void @atomic_xchg_f64_offset(ptr %out, double %in) {
 ; GFX12-LABEL: atomic_xchg_f64_offset:
 ; GFX12:       ; %bb.0: ; %entry
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
-; GFX12-NEXT:    s_mov_b64 s[4:5], src_private_base
+; GFX12-NEXT:    s_mov_b64 s[6:7], src_private_base
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[0:1], 32
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX12-NEXT:    s_cmp_eq_u32 s1, s5
+; GFX12-NEXT:    s_cmp_eq_u32 s1, s7
 ; GFX12-NEXT:    s_cselect_b32 s4, -1, 0
 ; GFX12-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s4
 ; GFX12-NEXT:    s_mov_b32 s4, -1
@@ -9636,11 +9644,11 @@ define amdgpu_kernel void @atomic_xchg_pointer_offset(ptr %out, ptr %in) {
 ; GFX12-LABEL: atomic_xchg_pointer_offset:
 ; GFX12:       ; %bb.0: ; %entry
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
-; GFX12-NEXT:    s_mov_b64 s[4:5], src_private_base
+; GFX12-NEXT:    s_mov_b64 s[6:7], src_private_base
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[0:1], 32
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX12-NEXT:    s_cmp_eq_u32 s1, s5
+; GFX12-NEXT:    s_cmp_eq_u32 s1, s7
 ; GFX12-NEXT:    s_cselect_b32 s4, -1, 0
 ; GFX12-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s4
 ; GFX12-NEXT:    s_mov_b32 s4, -1
@@ -9911,17 +9919,17 @@ define amdgpu_kernel void @atomic_xchg_i64_addr64_offset(ptr %out, i64 %in, i64
 ; GFX12-LABEL: atomic_xchg_i64_addr64_offset:
 ; GFX12:       ; %bb.0: ; %entry
 ; GFX12-NEXT:    s_clause 0x1
-; GFX12-NEXT:    s_load_b64 s[6:7], s[4:5], 0x34
+; GFX12-NEXT:    s_load_b64 s[8:9], s[4:5], 0x34
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-NEXT:    s_mov_b64 s[6:7], src_private_base
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    s_lshl_b64 s[4:5], s[6:7], 3
-; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT:    s_lshl_b64 s[4:5], s[8:9], 3
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[0:1], s[4:5]
-; GFX12-NEXT:    s_mov_b64 s[4:5], src_private_base
 ; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[0:1], 32
-; GFX12-NEXT:    s_cmp_eq_u32 s1, s5
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT:    s_cmp_eq_u32 s1, s7
 ; GFX12-NEXT:    s_cselect_b32 s4, -1, 0
-; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s4
 ; GFX12-NEXT:    s_mov_b32 s4, -1
 ; GFX12-NEXT:    s_cbranch_vccnz .LBB68_3
@@ -10057,15 +10065,15 @@ define amdgpu_kernel void @atomic_xchg_i64_ret_addr64_offset(ptr %out, ptr %out2
 ; GFX12-LABEL: atomic_xchg_i64_ret_addr64_offset:
 ; GFX12:       ; %bb.0: ; %entry
 ; GFX12-NEXT:    s_load_b256 s[0:7], s[4:5], 0x24
+; GFX12-NEXT:    s_mov_b64 s[8:9], src_private_base
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    s_lshl_b64 s[6:7], s[6:7], 3
-; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[0:1], s[6:7]
-; GFX12-NEXT:    s_mov_b64 s[6:7], src_private_base
 ; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[0:1], 32
-; GFX12-NEXT:    s_cmp_eq_u32 s1, s7
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT:    s_cmp_eq_u32 s1, s9
 ; GFX12-NEXT:    s_cselect_b32 s6, -1, 0
-; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s6
 ; GFX12-NEXT:    s_cbranch_vccz .LBB69_2
 ; GFX12-NEXT:  ; %bb.1: ; %atomicrmw.global
@@ -10186,9 +10194,9 @@ define amdgpu_kernel void @atomic_xchg_i64(ptr %out, i64 %in) {
 ; GFX12-LABEL: atomic_xchg_i64:
 ; GFX12:       ; %bb.0: ; %entry
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
-; GFX12-NEXT:    s_mov_b64 s[4:5], src_private_base
+; GFX12-NEXT:    s_mov_b64 s[6:7], src_private_base
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    s_cmp_eq_u32 s1, s5
+; GFX12-NEXT:    s_cmp_eq_u32 s1, s7
 ; GFX12-NEXT:    s_cselect_b32 s4, -1, 0
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s4
@@ -10449,15 +10457,16 @@ define amdgpu_kernel void @atomic_xchg_i64_addr64(ptr %out, i64 %in, i64 %index)
 ; GFX12-LABEL: atomic_xchg_i64_addr64:
 ; GFX12:       ; %bb.0: ; %entry
 ; GFX12-NEXT:    s_clause 0x1
-; GFX12-NEXT:    s_load_b64 s[6:7], s[4:5], 0x34
+; GFX12-NEXT:    s_load_b64 s[8:9], s[4:5], 0x34
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
-; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    s_lshl_b64 s[4:5], s[6:7], 3
 ; GFX12-NEXT:    s_mov_b64 s[6:7], src_private_base
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    s_lshl_b64 s[4:5], s[8:9], 3
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[0:1], s[4:5]
-; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_cmp_eq_u32 s1, s7
 ; GFX12-NEXT:    s_cselect_b32 s4, -1, 0
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s4
 ; GFX12-NEXT:    s_mov_b32 s4, -1
 ; GFX12-NEXT:    s_cbranch_vccnz .LBB72_3
@@ -10727,11 +10736,11 @@ define amdgpu_kernel void @atomic_xor_i64_offset(ptr %out, i64 %in) {
 ; GFX12-LABEL: atomic_xor_i64_offset:
 ; GFX12:       ; %bb.0: ; %entry
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
-; GFX12-NEXT:    s_mov_b64 s[4:5], src_private_base
+; GFX12-NEXT:    s_mov_b64 s[6:7], src_private_base
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[0:1], 32
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX12-NEXT:    s_cmp_eq_u32 s1, s5
+; GFX12-NEXT:    s_cmp_eq_u32 s1, s7
 ; GFX12-NEXT:    s_cselect_b32 s4, -1, 0
 ; GFX12-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s4
 ; GFX12-NEXT:    s_mov_b32 s4, -1
@@ -11016,17 +11025,17 @@ define amdgpu_kernel void @atomic_xor_i64_addr64_offset(ptr %out, i64 %in, i64 %
 ; GFX12-LABEL: atomic_xor_i64_addr64_offset:
 ; GFX12:       ; %bb.0: ; %entry
 ; GFX12-NEXT:    s_clause 0x1
-; GFX12-NEXT:    s_load_b64 s[6:7], s[4:5], 0x34
+; GFX12-NEXT:    s_load_b64 s[8:9], s[4:5], 0x34
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-NEXT:    s_mov_b64 s[6:7], src_private_base
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    s_lshl_b64 s[4:5], s[6:7], 3
-; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT:    s_lshl_b64 s[4:5], s[8:9], 3
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[0:1], s[4:5]
-; GFX12-NEXT:    s_mov_b64 s[4:5], src_private_base
 ; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[0:1], 32
-; GFX12-NEXT:    s_cmp_eq_u32 s1, s5
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT:    s_cmp_eq_u32 s1, s7
 ; GFX12-NEXT:    s_cselect_b32 s4, -1, 0
-; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s4
 ; GFX12-NEXT:    s_mov_b32 s4, -1
 ; GFX12-NEXT:    s_cbranch_vccnz .LBB76_3
@@ -11167,15 +11176,15 @@ define amdgpu_kernel void @atomic_xor_i64_ret_addr64_offset(ptr %out, ptr %out2,
 ; GFX12-LABEL: atomic_xor_i64_ret_addr64_offset:
 ; GFX12:       ; %bb.0: ; %entry
 ; GFX12-NEXT:    s_load_b256 s[0:7], s[4:5], 0x24
+; GFX12-NEXT:    s_mov_b64 s[8:9], src_private_base
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    s_lshl_b64 s[6:7], s[6:7], 3
-; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[0:1], s[6:7]
-; GFX12-NEXT:    s_mov_b64 s[6:7], src_private_base
 ; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[0:1], 32
-; GFX12-NEXT:    s_cmp_eq_u32 s1, s7
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT:    s_cmp_eq_u32 s1, s9
 ; GFX12-NEXT:    s_cselect_b32 s6, -1, 0
-; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s6
 ; GFX12-NEXT:    s_cbranch_vccz .LBB77_2
 ; GFX12-NEXT:  ; %bb.1: ; %atomicrmw.global
@@ -11305,9 +11314,9 @@ define amdgpu_kernel void @atomic_xor_i64(ptr %out, i64 %in) {
 ; GFX12-LABEL: atomic_xor_i64:
 ; GFX12:       ; %bb.0: ; %entry
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
-; GFX12-NEXT:    s_mov_b64 s[4:5], src_private_base
+; GFX12-NEXT:    s_mov_b64 s[6:7], src_private_base
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    s_cmp_eq_u32 s1, s5
+; GFX12-NEXT:    s_cmp_eq_u32 s1, s7
 ; GFX12-NEXT:    s_cselect_b32 s4, -1, 0
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s4
@@ -11582,15 +11591,16 @@ define amdgpu_kernel void @atomic_xor_i64_addr64(ptr %out, i64 %in, i64 %index)
 ; GFX12-LABEL: atomic_xor_i64_addr64:
 ; GFX12:       ; %bb.0: ; %entry
 ; GFX12-NEXT:    s_clause 0x1
-; GFX12-NEXT:    s_load_b64 s[6:7], s[4:5], 0x34
+; GFX12-NEXT:    s_load_b64 s[8:9], s[4:5], 0x34
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
-; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    s_lshl_b64 s[4:5], s[6:7], 3
 ; GFX12-NEXT:    s_mov_b64 s[6:7], src_private_base
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    s_lshl_b64 s[4:5], s[8:9], 3
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[0:1], s[4:5]
-; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_cmp_eq_u32 s1, s7
 ; GFX12-NEXT:    s_cselect_b32 s4, -1, 0
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s4
 ; GFX12-NEXT:    s_mov_b32 s4, -1
 ; GFX12-NEXT:    s_cbranch_vccnz .LBB80_3
@@ -12745,15 +12755,15 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_addr64_offset(ptr %out, i64 %in, i
 ; GFX12-LABEL: atomic_cmpxchg_i64_addr64_offset:
 ; GFX12:       ; %bb.0: ; %entry
 ; GFX12-NEXT:    s_load_b256 s[0:7], s[4:5], 0x24
+; GFX12-NEXT:    s_mov_b64 s[8:9], src_private_base
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    s_lshl_b64 s[4:5], s[4:5], 3
-; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[0:1], s[4:5]
-; GFX12-NEXT:    s_mov_b64 s[4:5], src_private_base
 ; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[0:1], 32
-; GFX12-NEXT:    s_cmp_eq_u32 s1, s5
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT:    s_cmp_eq_u32 s1, s9
 ; GFX12-NEXT:    s_cselect_b32 s4, -1, 0
-; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s4
 ; GFX12-NEXT:    s_mov_b32 s4, -1
 ; GFX12-NEXT:    s_cbranch_vccnz .LBB93_3
@@ -12908,14 +12918,14 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_ret_addr64_offset(ptr %out, ptr %o
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    s_load_b256 s[8:15], s[4:5], 0x24
 ; GFX12-NEXT:    s_load_b64 s[0:1], s[4:5], 0x44
-; GFX12-NEXT:    s_mov_b64 s[4:5], src_private_base
+; GFX12-NEXT:    s_mov_b64 s[6:7], src_private_base
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    s_lshl_b64 s[2:3], s[14:15], 3
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_add_nc_u64 s[2:3], s[8:9], s[2:3]
 ; GFX12-NEXT:    s_add_nc_u64 s[2:3], s[2:3], 32
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX12-NEXT:    s_cmp_eq_u32 s3, s5
+; GFX12-NEXT:    s_cmp_eq_u32 s3, s7
 ; GFX12-NEXT:    s_cselect_b32 s4, -1, 0
 ; GFX12-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s4
 ; GFX12-NEXT:    s_cbranch_vccz .LBB94_2
@@ -13510,12 +13520,12 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_ret_addr64(ptr %out, ptr %out2, i6
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    s_load_b256 s[8:15], s[4:5], 0x24
 ; GFX12-NEXT:    s_load_b64 s[0:1], s[4:5], 0x44
-; GFX12-NEXT:    s_mov_b64 s[4:5], src_private_base
+; GFX12-NEXT:    s_mov_b64 s[6:7], src_private_base
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    s_lshl_b64 s[2:3], s[14:15], 3
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_add_nc_u64 s[2:3], s[8:9], s[2:3]
-; GFX12-NEXT:    s_cmp_eq_u32 s3, s5
+; GFX12-NEXT:    s_cmp_eq_u32 s3, s7
 ; GFX12-NEXT:    s_cselect_b32 s4, -1, 0
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s4
@@ -14071,11 +14081,11 @@ define amdgpu_kernel void @atomic_inc_i64_offset(ptr %out, i64 %in) {
 ; GFX12-LABEL: atomic_inc_i64_offset:
 ; GFX12:       ; %bb.0: ; %entry
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
-; GFX12-NEXT:    s_mov_b64 s[4:5], src_private_base
+; GFX12-NEXT:    s_mov_b64 s[6:7], src_private_base
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[0:1], 32
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX12-NEXT:    s_cmp_eq_u32 s1, s5
+; GFX12-NEXT:    s_cmp_eq_u32 s1, s7
 ; GFX12-NEXT:    s_cselect_b32 s4, -1, 0
 ; GFX12-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s4
 ; GFX12-NEXT:    s_mov_b32 s4, -1
@@ -14380,17 +14390,17 @@ define amdgpu_kernel void @atomic_inc_i64_incr64_offset(ptr %out, i64 %in, i64 %
 ; GFX12-LABEL: atomic_inc_i64_incr64_offset:
 ; GFX12:       ; %bb.0: ; %entry
 ; GFX12-NEXT:    s_clause 0x1
-; GFX12-NEXT:    s_load_b64 s[6:7], s[4:5], 0x34
+; GFX12-NEXT:    s_load_b64 s[8:9], s[4:5], 0x34
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-NEXT:    s_mov_b64 s[6:7], src_private_base
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    s_lshl_b64 s[4:5], s[6:7], 3
-; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT:    s_lshl_b64 s[4:5], s[8:9], 3
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[0:1], s[4:5]
-; GFX12-NEXT:    s_mov_b64 s[4:5], src_private_base
 ; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[0:1], 32
-; GFX12-NEXT:    s_cmp_eq_u32 s1, s5
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT:    s_cmp_eq_u32 s1, s7
 ; GFX12-NEXT:    s_cselect_b32 s4, -1, 0
-; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s4
 ; GFX12-NEXT:    s_mov_b32 s4, -1
 ; GFX12-NEXT:    s_cbranch_vccnz .LBB109_3
@@ -14541,15 +14551,15 @@ define amdgpu_kernel void @atomic_inc_i64_ret_incr64_offset(ptr %out, ptr %out2,
 ; GFX12-LABEL: atomic_inc_i64_ret_incr64_offset:
 ; GFX12:       ; %bb.0: ; %entry
 ; GFX12-NEXT:    s_load_b256 s[0:7], s[4:5], 0x24
+; GFX12-NEXT:    s_mov_b64 s[8:9], src_private_base
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    s_lshl_b64 s[6:7], s[6:7], 3
-; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[0:1], s[6:7]
-; GFX12-NEXT:    s_mov_b64 s[6:7], src_private_base
 ; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[0:1], 32
-; GFX12-NEXT:    s_cmp_eq_u32 s1, s7
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT:    s_cmp_eq_u32 s1, s9
 ; GFX12-NEXT:    s_cselect_b32 s6, -1, 0
-; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s6
 ; GFX12-NEXT:    s_cbranch_vccz .LBB110_2
 ; GFX12-NEXT:  ; %bb.1: ; %atomicrmw.global
@@ -14689,9 +14699,9 @@ define amdgpu_kernel void @atomic_inc_i64(ptr %out, i64 %in) {
 ; GFX12-LABEL: atomic_inc_i64:
 ; GFX12:       ; %bb.0: ; %entry
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
-; GFX12-NEXT:    s_mov_b64 s[4:5], src_private_base
+; GFX12-NEXT:    s_mov_b64 s[6:7], src_private_base
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    s_cmp_eq_u32 s1, s5
+; GFX12-NEXT:    s_cmp_eq_u32 s1, s7
 ; GFX12-NEXT:    s_cselect_b32 s4, -1, 0
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s4
@@ -14986,15 +14996,16 @@ define amdgpu_kernel void @atomic_inc_i64_incr64(ptr %out, i64 %in, i64 %index)
 ; GFX12-LABEL: atomic_inc_i64_incr64:
 ; GFX12:       ; %bb.0: ; %entry
 ; GFX12-NEXT:    s_clause 0x1
-; GFX12-NEXT:    s_load_b64 s[6:7], s[4:5], 0x34
+; GFX12-NEXT:    s_load_b64 s[8:9], s[4:5], 0x34
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
-; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    s_lshl_b64 s[4:5], s[6:7], 3
 ; GFX12-NEXT:    s_mov_b64 s[6:7], src_private_base
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    s_lshl_b64 s[4:5], s[8:9], 3
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[0:1], s[4:5]
-; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_cmp_eq_u32 s1, s7
 ; GFX12-NEXT:    s_cselect_b32 s4, -1, 0
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s4
 ; GFX12-NEXT:    s_mov_b32 s4, -1
 ; GFX12-NEXT:    s_cbranch_vccnz .LBB113_3
@@ -15296,11 +15307,11 @@ define amdgpu_kernel void @atomic_dec_i64_offset(ptr %out, i64 %in) {
 ; GFX12-LABEL: atomic_dec_i64_offset:
 ; GFX12:       ; %bb.0: ; %entry
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
-; GFX12-NEXT:    s_mov_b64 s[4:5], src_private_base
+; GFX12-NEXT:    s_mov_b64 s[6:7], src_private_base
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[0:1], 32
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX12-NEXT:    s_cmp_eq_u32 s1, s5
+; GFX12-NEXT:    s_cmp_eq_u32 s1, s7
 ; GFX12-NEXT:    s_cselect_b32 s4, -1, 0
 ; GFX12-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s4
 ; GFX12-NEXT:    s_mov_b32 s4, -1
@@ -15625,17 +15636,17 @@ define amdgpu_kernel void @atomic_dec_i64_decr64_offset(ptr %out, i64 %in, i64 %
 ; GFX12-LABEL: atomic_dec_i64_decr64_offset:
 ; GFX12:       ; %bb.0: ; %entry
 ; GFX12-NEXT:    s_clause 0x1
-; GFX12-NEXT:    s_load_b64 s[6:7], s[4:5], 0x34
+; GFX12-NEXT:    s_load_b64 s[8:9], s[4:5], 0x34
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-NEXT:    s_mov_b64 s[6:7], src_private_base
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    s_lshl_b64 s[4:5], s[6:7], 3
-; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT:    s_lshl_b64 s[4:5], s[8:9], 3
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[0:1], s[4:5]
-; GFX12-NEXT:    s_mov_b64 s[4:5], src_private_base
 ; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[0:1], 32
-; GFX12-NEXT:    s_cmp_eq_u32 s1, s5
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT:    s_cmp_eq_u32 s1, s7
 ; GFX12-NEXT:    s_cselect_b32 s4, -1, 0
-; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s4
 ; GFX12-NEXT:    s_mov_b32 s4, -1
 ; GFX12-NEXT:    s_cbranch_vccnz .LBB117_3
@@ -15797,15 +15808,15 @@ define amdgpu_kernel void @atomic_dec_i64_ret_decr64_offset(ptr %out, ptr %out2,
 ; GFX12-LABEL: atomic_dec_i64_ret_decr64_offset:
 ; GFX12:       ; %bb.0: ; %entry
 ; GFX12-NEXT:    s_load_b256 s[0:7], s[4:5], 0x24
+; GFX12-NEXT:    s_mov_b64 s[8:9], src_private_base
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    s_lshl_b64 s[6:7], s[6:7], 3
-; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[0:1], s[6:7]
-; GFX12-NEXT:    s_mov_b64 s[6:7], src_private_base
 ; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[0:1], 32
-; GFX12-NEXT:    s_cmp_eq_u32 s1, s7
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT:    s_cmp_eq_u32 s1, s9
 ; GFX12-NEXT:    s_cselect_b32 s6, -1, 0
-; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s6
 ; GFX12-NEXT:    s_cbranch_vccz .LBB118_2
 ; GFX12-NEXT:  ; %bb.1: ; %atomicrmw.global
@@ -15954,9 +15965,9 @@ define amdgpu_kernel void @atomic_dec_i64(ptr %out, i64 %in) {
 ; GFX12-LABEL: atomic_dec_i64:
 ; GFX12:       ; %bb.0: ; %entry
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
-; GFX12-NEXT:    s_mov_b64 s[4:5], src_private_base
+; GFX12-NEXT:    s_mov_b64 s[6:7], src_private_base
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    s_cmp_eq_u32 s1, s5
+; GFX12-NEXT:    s_cmp_eq_u32 s1, s7
 ; GFX12-NEXT:    s_cselect_b32 s4, -1, 0
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s4
@@ -16271,15 +16282,16 @@ define amdgpu_kernel void @atomic_dec_i64_decr64(ptr %out, i64 %in, i64 %index)
 ; GFX12-LABEL: atomic_dec_i64_decr64:
 ; GFX12:       ; %bb.0: ; %entry
 ; GFX12-NEXT:    s_clause 0x1
-; GFX12-NEXT:    s_load_b64 s[6:7], s[4:5], 0x34
+; GFX12-NEXT:    s_load_b64 s[8:9], s[4:5], 0x34
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
-; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    s_lshl_b64 s[4:5], s[6:7], 3
 ; GFX12-NEXT:    s_mov_b64 s[6:7], src_private_base
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    s_lshl_b64 s[4:5], s[8:9], 3
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[0:1], s[4:5]
-; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_cmp_eq_u32 s1, s7
 ; GFX12-NEXT:    s_cselect_b32 s4, -1, 0
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s4
 ; GFX12-NEXT:    s_mov_b32 s4, -1
 ; GFX12-NEXT:    s_cbranch_vccnz .LBB121_3

diff  --git a/llvm/test/CodeGen/AMDGPU/flat_atomics_i64_system.ll b/llvm/test/CodeGen/AMDGPU/flat_atomics_i64_system.ll
index 524100c5b7a25..9e27f6badfdac 100644
--- a/llvm/test/CodeGen/AMDGPU/flat_atomics_i64_system.ll
+++ b/llvm/test/CodeGen/AMDGPU/flat_atomics_i64_system.ll
@@ -187,8 +187,8 @@ define void @flat_atomic_xchg_i64_noret_offset(ptr %out, i64 %in) {
 ; GCN3:       ; %bb.0:
 ; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    v_add_co_u32_e32 v0, vcc, 32, v0
-; GCN3-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
 ; GCN3-NEXT:    s_mov_b64 s[4:5], src_private_base
+; GCN3-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
 ; GCN3-NEXT:    v_cmp_ne_u32_e32 vcc, s5, v1
 ; GCN3-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; GCN3-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
@@ -422,8 +422,8 @@ define i64 @flat_atomic_xchg_i64_ret_offset(ptr %out, i64 %in) {
 ; GCN3:       ; %bb.0:
 ; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    v_add_co_u32_e32 v4, vcc, 32, v0
-; GCN3-NEXT:    v_addc_co_u32_e32 v5, vcc, 0, v1, vcc
 ; GCN3-NEXT:    s_mov_b64 s[4:5], src_private_base
+; GCN3-NEXT:    v_addc_co_u32_e32 v5, vcc, 0, v1, vcc
 ; GCN3-NEXT:    v_cmp_ne_u32_e32 vcc, s5, v5
 ; GCN3-NEXT:    ; implicit-def: $vgpr0_vgpr1
 ; GCN3-NEXT:    s_and_saveexec_b64 s[4:5], vcc
@@ -660,8 +660,8 @@ define amdgpu_gfx void @flat_atomic_xchg_i64_noret_offset_scalar(ptr inreg %out,
 ; GCN3:       ; %bb.0:
 ; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    s_add_u32 s34, s4, 32
-; GCN3-NEXT:    s_addc_u32 s35, s5, 0
 ; GCN3-NEXT:    s_mov_b64 s[36:37], src_private_base
+; GCN3-NEXT:    s_addc_u32 s35, s5, 0
 ; GCN3-NEXT:    s_cmp_eq_u32 s35, s37
 ; GCN3-NEXT:    s_cselect_b64 s[36:37], -1, 0
 ; GCN3-NEXT:    s_andn2_b64 vcc, exec, s[36:37]
@@ -897,8 +897,8 @@ define amdgpu_gfx i64 @flat_atomic_xchg_i64_ret_offset_scalar(ptr inreg %out, i6
 ; GCN3:       ; %bb.0:
 ; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    s_add_u32 s34, s4, 32
-; GCN3-NEXT:    s_addc_u32 s35, s5, 0
 ; GCN3-NEXT:    s_mov_b64 s[36:37], src_private_base
+; GCN3-NEXT:    s_addc_u32 s35, s5, 0
 ; GCN3-NEXT:    s_cmp_eq_u32 s35, s37
 ; GCN3-NEXT:    s_cselect_b64 s[36:37], -1, 0
 ; GCN3-NEXT:    s_andn2_b64 vcc, exec, s[36:37]
@@ -1010,8 +1010,8 @@ define void @flat_atomic_xchg_i64_noret_offset__amdgpu_no_remote_memory(ptr %out
 ; GCN3:       ; %bb.0:
 ; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    v_add_co_u32_e32 v0, vcc, 32, v0
-; GCN3-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
 ; GCN3-NEXT:    s_mov_b64 s[4:5], src_private_base
+; GCN3-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
 ; GCN3-NEXT:    v_cmp_ne_u32_e32 vcc, s5, v1
 ; GCN3-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; GCN3-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
@@ -1126,8 +1126,8 @@ define i64 @flat_atomic_xchg_i64_ret_offset__amdgpu_no_remote_memory(ptr %out, i
 ; GCN3:       ; %bb.0:
 ; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    v_add_co_u32_e32 v4, vcc, 32, v0
-; GCN3-NEXT:    v_addc_co_u32_e32 v5, vcc, 0, v1, vcc
 ; GCN3-NEXT:    s_mov_b64 s[4:5], src_private_base
+; GCN3-NEXT:    v_addc_co_u32_e32 v5, vcc, 0, v1, vcc
 ; GCN3-NEXT:    v_cmp_ne_u32_e32 vcc, s5, v5
 ; GCN3-NEXT:    ; implicit-def: $vgpr0_vgpr1
 ; GCN3-NEXT:    s_and_saveexec_b64 s[4:5], vcc
@@ -1347,8 +1347,8 @@ define void @flat_atomic_xchg_f64_noret_offset(ptr %out, double %in) {
 ; GCN3:       ; %bb.0:
 ; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    v_add_co_u32_e32 v0, vcc, 32, v0
-; GCN3-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
 ; GCN3-NEXT:    s_mov_b64 s[4:5], src_private_base
+; GCN3-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
 ; GCN3-NEXT:    v_cmp_ne_u32_e32 vcc, s5, v1
 ; GCN3-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; GCN3-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
@@ -1582,8 +1582,8 @@ define double @flat_atomic_xchg_f64_ret_offset(ptr %out, double %in) {
 ; GCN3:       ; %bb.0:
 ; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    v_add_co_u32_e32 v4, vcc, 32, v0
-; GCN3-NEXT:    v_addc_co_u32_e32 v5, vcc, 0, v1, vcc
 ; GCN3-NEXT:    s_mov_b64 s[4:5], src_private_base
+; GCN3-NEXT:    v_addc_co_u32_e32 v5, vcc, 0, v1, vcc
 ; GCN3-NEXT:    v_cmp_ne_u32_e32 vcc, s5, v5
 ; GCN3-NEXT:    ; implicit-def: $vgpr0_vgpr1
 ; GCN3-NEXT:    s_and_saveexec_b64 s[4:5], vcc
@@ -1820,8 +1820,8 @@ define amdgpu_gfx void @flat_atomic_xchg_f64_noret_offset_scalar(ptr inreg %out,
 ; GCN3:       ; %bb.0:
 ; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    s_add_u32 s34, s4, 32
-; GCN3-NEXT:    s_addc_u32 s35, s5, 0
 ; GCN3-NEXT:    s_mov_b64 s[36:37], src_private_base
+; GCN3-NEXT:    s_addc_u32 s35, s5, 0
 ; GCN3-NEXT:    s_cmp_eq_u32 s35, s37
 ; GCN3-NEXT:    s_cselect_b64 s[36:37], -1, 0
 ; GCN3-NEXT:    s_andn2_b64 vcc, exec, s[36:37]
@@ -2057,8 +2057,8 @@ define amdgpu_gfx double @flat_atomic_xchg_f64_ret_offset_scalar(ptr inreg %out,
 ; GCN3:       ; %bb.0:
 ; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    s_add_u32 s34, s4, 32
-; GCN3-NEXT:    s_addc_u32 s35, s5, 0
 ; GCN3-NEXT:    s_mov_b64 s[36:37], src_private_base
+; GCN3-NEXT:    s_addc_u32 s35, s5, 0
 ; GCN3-NEXT:    s_cmp_eq_u32 s35, s37
 ; GCN3-NEXT:    s_cselect_b64 s[36:37], -1, 0
 ; GCN3-NEXT:    s_andn2_b64 vcc, exec, s[36:37]
@@ -2170,8 +2170,8 @@ define void @flat_atomic_xchg_f64_noret_offset__amdgpu_no_remote_memory(ptr %out
 ; GCN3:       ; %bb.0:
 ; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    v_add_co_u32_e32 v0, vcc, 32, v0
-; GCN3-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
 ; GCN3-NEXT:    s_mov_b64 s[4:5], src_private_base
+; GCN3-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
 ; GCN3-NEXT:    v_cmp_ne_u32_e32 vcc, s5, v1
 ; GCN3-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; GCN3-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
@@ -2286,8 +2286,8 @@ define double @flat_atomic_xchg_f64_ret_offset__amdgpu_no_remote_memory(ptr %out
 ; GCN3:       ; %bb.0:
 ; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    v_add_co_u32_e32 v4, vcc, 32, v0
-; GCN3-NEXT:    v_addc_co_u32_e32 v5, vcc, 0, v1, vcc
 ; GCN3-NEXT:    s_mov_b64 s[4:5], src_private_base
+; GCN3-NEXT:    v_addc_co_u32_e32 v5, vcc, 0, v1, vcc
 ; GCN3-NEXT:    v_cmp_ne_u32_e32 vcc, s5, v5
 ; GCN3-NEXT:    ; implicit-def: $vgpr0_vgpr1
 ; GCN3-NEXT:    s_and_saveexec_b64 s[4:5], vcc
@@ -2537,8 +2537,8 @@ define void @flat_atomic_add_i64_noret_offset(ptr %out, i64 %in) {
 ; GCN3:       ; %bb.0:
 ; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    v_add_co_u32_e32 v0, vcc, 32, v0
-; GCN3-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
 ; GCN3-NEXT:    s_mov_b64 s[4:5], src_private_base
+; GCN3-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
 ; GCN3-NEXT:    v_cmp_ne_u32_e32 vcc, s5, v1
 ; GCN3-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; GCN3-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
@@ -2797,8 +2797,8 @@ define i64 @flat_atomic_add_i64_ret_offset(ptr %out, i64 %in) {
 ; GCN3:       ; %bb.0:
 ; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    v_add_co_u32_e32 v4, vcc, 32, v0
-; GCN3-NEXT:    v_addc_co_u32_e32 v5, vcc, 0, v1, vcc
 ; GCN3-NEXT:    s_mov_b64 s[4:5], src_private_base
+; GCN3-NEXT:    v_addc_co_u32_e32 v5, vcc, 0, v1, vcc
 ; GCN3-NEXT:    v_cmp_ne_u32_e32 vcc, s5, v5
 ; GCN3-NEXT:    ; implicit-def: $vgpr0_vgpr1
 ; GCN3-NEXT:    s_and_saveexec_b64 s[4:5], vcc
@@ -3063,8 +3063,8 @@ define amdgpu_gfx void @flat_atomic_add_i64_noret_offset_scalar(ptr inreg %out,
 ; GCN3:       ; %bb.0:
 ; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    s_add_u32 s34, s4, 32
-; GCN3-NEXT:    s_addc_u32 s35, s5, 0
 ; GCN3-NEXT:    s_mov_b64 s[36:37], src_private_base
+; GCN3-NEXT:    s_addc_u32 s35, s5, 0
 ; GCN3-NEXT:    s_cmp_eq_u32 s35, s37
 ; GCN3-NEXT:    s_cselect_b64 s[36:37], -1, 0
 ; GCN3-NEXT:    s_andn2_b64 vcc, exec, s[36:37]
@@ -3320,8 +3320,8 @@ define amdgpu_gfx i64 @flat_atomic_add_i64_ret_offset_scalar(ptr inreg %out, i64
 ; GCN3:       ; %bb.0:
 ; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    s_add_u32 s34, s4, 32
-; GCN3-NEXT:    s_addc_u32 s35, s5, 0
 ; GCN3-NEXT:    s_mov_b64 s[36:37], src_private_base
+; GCN3-NEXT:    s_addc_u32 s35, s5, 0
 ; GCN3-NEXT:    s_cmp_eq_u32 s35, s37
 ; GCN3-NEXT:    s_cselect_b64 s[36:37], -1, 0
 ; GCN3-NEXT:    s_andn2_b64 vcc, exec, s[36:37]
@@ -3448,8 +3448,8 @@ define void @flat_atomic_add_i64_noret_offset__amdgpu_no_remote_memory(ptr %out,
 ; GCN3:       ; %bb.0:
 ; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    v_add_co_u32_e32 v0, vcc, 32, v0
-; GCN3-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
 ; GCN3-NEXT:    s_mov_b64 s[4:5], src_private_base
+; GCN3-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
 ; GCN3-NEXT:    v_cmp_ne_u32_e32 vcc, s5, v1
 ; GCN3-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; GCN3-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
@@ -3578,8 +3578,8 @@ define i64 @flat_atomic_add_i64_ret_offset__amdgpu_no_remote_memory(ptr %out, i6
 ; GCN3:       ; %bb.0:
 ; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    v_add_co_u32_e32 v4, vcc, 32, v0
-; GCN3-NEXT:    v_addc_co_u32_e32 v5, vcc, 0, v1, vcc
 ; GCN3-NEXT:    s_mov_b64 s[4:5], src_private_base
+; GCN3-NEXT:    v_addc_co_u32_e32 v5, vcc, 0, v1, vcc
 ; GCN3-NEXT:    v_cmp_ne_u32_e32 vcc, s5, v5
 ; GCN3-NEXT:    ; implicit-def: $vgpr0_vgpr1
 ; GCN3-NEXT:    s_and_saveexec_b64 s[4:5], vcc
@@ -3924,8 +3924,8 @@ define void @flat_atomic_sub_i64_noret_offset(ptr %out, i64 %in) {
 ; GCN3:       ; %bb.0:
 ; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    v_add_co_u32_e32 v0, vcc, 32, v0
-; GCN3-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
 ; GCN3-NEXT:    s_mov_b64 s[4:5], src_private_base
+; GCN3-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
 ; GCN3-NEXT:    v_cmp_ne_u32_e32 vcc, s5, v1
 ; GCN3-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; GCN3-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
@@ -4284,8 +4284,8 @@ define i64 @flat_atomic_sub_i64_ret_offset(ptr %out, i64 %in) {
 ; GCN3:       ; %bb.0:
 ; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    v_add_co_u32_e32 v4, vcc, 32, v0
-; GCN3-NEXT:    v_addc_co_u32_e32 v5, vcc, 0, v1, vcc
 ; GCN3-NEXT:    s_mov_b64 s[4:5], src_private_base
+; GCN3-NEXT:    v_addc_co_u32_e32 v5, vcc, 0, v1, vcc
 ; GCN3-NEXT:    v_cmp_ne_u32_e32 vcc, s5, v5
 ; GCN3-NEXT:    ; implicit-def: $vgpr0_vgpr1
 ; GCN3-NEXT:    s_and_saveexec_b64 s[4:5], vcc
@@ -4656,8 +4656,8 @@ define amdgpu_gfx void @flat_atomic_sub_i64_noret_offset_scalar(ptr inreg %out,
 ; GCN3:       ; %bb.0:
 ; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    s_add_u32 s34, s4, 32
-; GCN3-NEXT:    s_addc_u32 s35, s5, 0
 ; GCN3-NEXT:    s_mov_b64 s[36:37], src_private_base
+; GCN3-NEXT:    s_addc_u32 s35, s5, 0
 ; GCN3-NEXT:    s_cmp_eq_u32 s35, s37
 ; GCN3-NEXT:    s_cselect_b64 s[36:37], -1, 0
 ; GCN3-NEXT:    s_andn2_b64 vcc, exec, s[36:37]
@@ -5017,8 +5017,8 @@ define amdgpu_gfx i64 @flat_atomic_sub_i64_ret_offset_scalar(ptr inreg %out, i64
 ; GCN3:       ; %bb.0:
 ; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    s_add_u32 s34, s4, 32
-; GCN3-NEXT:    s_addc_u32 s35, s5, 0
 ; GCN3-NEXT:    s_mov_b64 s[36:37], src_private_base
+; GCN3-NEXT:    s_addc_u32 s35, s5, 0
 ; GCN3-NEXT:    s_cmp_eq_u32 s35, s37
 ; GCN3-NEXT:    s_cselect_b64 s[36:37], -1, 0
 ; GCN3-NEXT:    s_andn2_b64 vcc, exec, s[36:37]
@@ -5159,8 +5159,8 @@ define void @flat_atomic_sub_i64_noret_offset__amdgpu_no_remote_memory(ptr %out,
 ; GCN3:       ; %bb.0:
 ; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    v_add_co_u32_e32 v0, vcc, 32, v0
-; GCN3-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
 ; GCN3-NEXT:    s_mov_b64 s[4:5], src_private_base
+; GCN3-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
 ; GCN3-NEXT:    v_cmp_ne_u32_e32 vcc, s5, v1
 ; GCN3-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; GCN3-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
@@ -5289,8 +5289,8 @@ define i64 @flat_atomic_sub_i64_ret_offset__amdgpu_no_remote_memory(ptr %out, i6
 ; GCN3:       ; %bb.0:
 ; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    v_add_co_u32_e32 v4, vcc, 32, v0
-; GCN3-NEXT:    v_addc_co_u32_e32 v5, vcc, 0, v1, vcc
 ; GCN3-NEXT:    s_mov_b64 s[4:5], src_private_base
+; GCN3-NEXT:    v_addc_co_u32_e32 v5, vcc, 0, v1, vcc
 ; GCN3-NEXT:    v_cmp_ne_u32_e32 vcc, s5, v5
 ; GCN3-NEXT:    ; implicit-def: $vgpr0_vgpr1
 ; GCN3-NEXT:    s_and_saveexec_b64 s[4:5], vcc
@@ -5635,8 +5635,8 @@ define void @flat_atomic_and_i64_noret_offset(ptr %out, i64 %in) {
 ; GCN3:       ; %bb.0:
 ; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    v_add_co_u32_e32 v0, vcc, 32, v0
-; GCN3-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
 ; GCN3-NEXT:    s_mov_b64 s[4:5], src_private_base
+; GCN3-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
 ; GCN3-NEXT:    v_cmp_ne_u32_e32 vcc, s5, v1
 ; GCN3-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; GCN3-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
@@ -5995,8 +5995,8 @@ define i64 @flat_atomic_and_i64_ret_offset(ptr %out, i64 %in) {
 ; GCN3:       ; %bb.0:
 ; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    v_add_co_u32_e32 v4, vcc, 32, v0
-; GCN3-NEXT:    v_addc_co_u32_e32 v5, vcc, 0, v1, vcc
 ; GCN3-NEXT:    s_mov_b64 s[4:5], src_private_base
+; GCN3-NEXT:    v_addc_co_u32_e32 v5, vcc, 0, v1, vcc
 ; GCN3-NEXT:    v_cmp_ne_u32_e32 vcc, s5, v5
 ; GCN3-NEXT:    ; implicit-def: $vgpr0_vgpr1
 ; GCN3-NEXT:    s_and_saveexec_b64 s[4:5], vcc
@@ -6357,8 +6357,8 @@ define amdgpu_gfx void @flat_atomic_and_i64_noret_offset_scalar(ptr inreg %out,
 ; GCN3:       ; %bb.0:
 ; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    s_add_u32 s34, s4, 32
-; GCN3-NEXT:    s_addc_u32 s35, s5, 0
 ; GCN3-NEXT:    s_mov_b64 s[36:37], src_private_base
+; GCN3-NEXT:    s_addc_u32 s35, s5, 0
 ; GCN3-NEXT:    s_cmp_eq_u32 s35, s37
 ; GCN3-NEXT:    s_cselect_b64 s[36:37], -1, 0
 ; GCN3-NEXT:    s_andn2_b64 vcc, exec, s[36:37]
@@ -6706,8 +6706,8 @@ define amdgpu_gfx i64 @flat_atomic_and_i64_ret_offset_scalar(ptr inreg %out, i64
 ; GCN3:       ; %bb.0:
 ; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    s_add_u32 s34, s4, 32
-; GCN3-NEXT:    s_addc_u32 s35, s5, 0
 ; GCN3-NEXT:    s_mov_b64 s[36:37], src_private_base
+; GCN3-NEXT:    s_addc_u32 s35, s5, 0
 ; GCN3-NEXT:    s_cmp_eq_u32 s35, s37
 ; GCN3-NEXT:    s_cselect_b64 s[36:37], -1, 0
 ; GCN3-NEXT:    s_andn2_b64 vcc, exec, s[36:37]
@@ -6846,8 +6846,8 @@ define void @flat_atomic_and_i64_noret_offset__amdgpu_no_remote_memory(ptr %out,
 ; GCN3:       ; %bb.0:
 ; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    v_add_co_u32_e32 v0, vcc, 32, v0
-; GCN3-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
 ; GCN3-NEXT:    s_mov_b64 s[4:5], src_private_base
+; GCN3-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
 ; GCN3-NEXT:    v_cmp_ne_u32_e32 vcc, s5, v1
 ; GCN3-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; GCN3-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
@@ -6976,8 +6976,8 @@ define i64 @flat_atomic_and_i64_ret_offset__amdgpu_no_remote_memory(ptr %out, i6
 ; GCN3:       ; %bb.0:
 ; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    v_add_co_u32_e32 v4, vcc, 32, v0
-; GCN3-NEXT:    v_addc_co_u32_e32 v5, vcc, 0, v1, vcc
 ; GCN3-NEXT:    s_mov_b64 s[4:5], src_private_base
+; GCN3-NEXT:    v_addc_co_u32_e32 v5, vcc, 0, v1, vcc
 ; GCN3-NEXT:    v_cmp_ne_u32_e32 vcc, s5, v5
 ; GCN3-NEXT:    ; implicit-def: $vgpr0_vgpr1
 ; GCN3-NEXT:    s_and_saveexec_b64 s[4:5], vcc
@@ -7342,8 +7342,8 @@ define void @flat_atomic_nand_i64_noret_offset(ptr %out, i64 %in) {
 ; GCN3:       ; %bb.0:
 ; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    v_add_co_u32_e32 v0, vcc, 32, v0
-; GCN3-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
 ; GCN3-NEXT:    s_mov_b64 s[4:5], src_private_base
+; GCN3-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
 ; GCN3-NEXT:    v_cmp_ne_u32_e32 vcc, s5, v1
 ; GCN3-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; GCN3-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
@@ -7726,8 +7726,8 @@ define i64 @flat_atomic_nand_i64_ret_offset(ptr %out, i64 %in) {
 ; GCN3:       ; %bb.0:
 ; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    v_add_co_u32_e32 v4, vcc, 32, v0
-; GCN3-NEXT:    v_addc_co_u32_e32 v5, vcc, 0, v1, vcc
 ; GCN3-NEXT:    s_mov_b64 s[4:5], src_private_base
+; GCN3-NEXT:    v_addc_co_u32_e32 v5, vcc, 0, v1, vcc
 ; GCN3-NEXT:    v_cmp_ne_u32_e32 vcc, s5, v5
 ; GCN3-NEXT:    ; implicit-def: $vgpr0_vgpr1
 ; GCN3-NEXT:    s_and_saveexec_b64 s[4:5], vcc
@@ -8112,8 +8112,8 @@ define amdgpu_gfx void @flat_atomic_nand_i64_noret_offset_scalar(ptr inreg %out,
 ; GCN3:       ; %bb.0:
 ; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    s_add_u32 s34, s4, 32
-; GCN3-NEXT:    s_addc_u32 s35, s5, 0
 ; GCN3-NEXT:    s_mov_b64 s[36:37], src_private_base
+; GCN3-NEXT:    s_addc_u32 s35, s5, 0
 ; GCN3-NEXT:    s_cmp_eq_u32 s35, s37
 ; GCN3-NEXT:    s_cselect_b64 s[36:37], -1, 0
 ; GCN3-NEXT:    s_andn2_b64 vcc, exec, s[36:37]
@@ -8485,8 +8485,8 @@ define amdgpu_gfx i64 @flat_atomic_nand_i64_ret_offset_scalar(ptr inreg %out, i6
 ; GCN3:       ; %bb.0:
 ; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    s_add_u32 s34, s4, 32
-; GCN3-NEXT:    s_addc_u32 s35, s5, 0
 ; GCN3-NEXT:    s_mov_b64 s[36:37], src_private_base
+; GCN3-NEXT:    s_addc_u32 s35, s5, 0
 ; GCN3-NEXT:    s_cmp_eq_u32 s35, s37
 ; GCN3-NEXT:    s_cselect_b64 s[36:37], -1, 0
 ; GCN3-NEXT:    s_andn2_b64 vcc, exec, s[36:37]
@@ -8675,8 +8675,8 @@ define void @flat_atomic_nand_i64_noret_offset__amdgpu_no_remote_memory(ptr %out
 ; GCN3:       ; %bb.0:
 ; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    v_add_co_u32_e32 v0, vcc, 32, v0
-; GCN3-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
 ; GCN3-NEXT:    s_mov_b64 s[4:5], src_private_base
+; GCN3-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
 ; GCN3-NEXT:    v_cmp_ne_u32_e32 vcc, s5, v1
 ; GCN3-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; GCN3-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
@@ -8871,8 +8871,8 @@ define i64 @flat_atomic_nand_i64_ret_offset__amdgpu_no_remote_memory(ptr %out, i
 ; GCN3:       ; %bb.0:
 ; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    v_add_co_u32_e32 v4, vcc, 32, v0
-; GCN3-NEXT:    v_addc_co_u32_e32 v5, vcc, 0, v1, vcc
 ; GCN3-NEXT:    s_mov_b64 s[4:5], src_private_base
+; GCN3-NEXT:    v_addc_co_u32_e32 v5, vcc, 0, v1, vcc
 ; GCN3-NEXT:    v_cmp_ne_u32_e32 vcc, s5, v5
 ; GCN3-NEXT:    ; implicit-def: $vgpr0_vgpr1
 ; GCN3-NEXT:    s_and_saveexec_b64 s[4:5], vcc
@@ -9237,8 +9237,8 @@ define void @flat_atomic_or_i64_noret_offset(ptr %out, i64 %in) {
 ; GCN3:       ; %bb.0:
 ; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    v_add_co_u32_e32 v0, vcc, 32, v0
-; GCN3-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
 ; GCN3-NEXT:    s_mov_b64 s[4:5], src_private_base
+; GCN3-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
 ; GCN3-NEXT:    v_cmp_ne_u32_e32 vcc, s5, v1
 ; GCN3-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; GCN3-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
@@ -9597,8 +9597,8 @@ define i64 @flat_atomic_or_i64_ret_offset(ptr %out, i64 %in) {
 ; GCN3:       ; %bb.0:
 ; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    v_add_co_u32_e32 v4, vcc, 32, v0
-; GCN3-NEXT:    v_addc_co_u32_e32 v5, vcc, 0, v1, vcc
 ; GCN3-NEXT:    s_mov_b64 s[4:5], src_private_base
+; GCN3-NEXT:    v_addc_co_u32_e32 v5, vcc, 0, v1, vcc
 ; GCN3-NEXT:    v_cmp_ne_u32_e32 vcc, s5, v5
 ; GCN3-NEXT:    ; implicit-def: $vgpr0_vgpr1
 ; GCN3-NEXT:    s_and_saveexec_b64 s[4:5], vcc
@@ -9959,8 +9959,8 @@ define amdgpu_gfx void @flat_atomic_or_i64_noret_offset_scalar(ptr inreg %out, i
 ; GCN3:       ; %bb.0:
 ; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    s_add_u32 s34, s4, 32
-; GCN3-NEXT:    s_addc_u32 s35, s5, 0
 ; GCN3-NEXT:    s_mov_b64 s[36:37], src_private_base
+; GCN3-NEXT:    s_addc_u32 s35, s5, 0
 ; GCN3-NEXT:    s_cmp_eq_u32 s35, s37
 ; GCN3-NEXT:    s_cselect_b64 s[36:37], -1, 0
 ; GCN3-NEXT:    s_andn2_b64 vcc, exec, s[36:37]
@@ -10308,8 +10308,8 @@ define amdgpu_gfx i64 @flat_atomic_or_i64_ret_offset_scalar(ptr inreg %out, i64
 ; GCN3:       ; %bb.0:
 ; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    s_add_u32 s34, s4, 32
-; GCN3-NEXT:    s_addc_u32 s35, s5, 0
 ; GCN3-NEXT:    s_mov_b64 s[36:37], src_private_base
+; GCN3-NEXT:    s_addc_u32 s35, s5, 0
 ; GCN3-NEXT:    s_cmp_eq_u32 s35, s37
 ; GCN3-NEXT:    s_cselect_b64 s[36:37], -1, 0
 ; GCN3-NEXT:    s_andn2_b64 vcc, exec, s[36:37]
@@ -10448,8 +10448,8 @@ define void @flat_atomic_or_i64_noret_offset__amdgpu_no_remote_memory(ptr %out,
 ; GCN3:       ; %bb.0:
 ; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    v_add_co_u32_e32 v0, vcc, 32, v0
-; GCN3-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
 ; GCN3-NEXT:    s_mov_b64 s[4:5], src_private_base
+; GCN3-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
 ; GCN3-NEXT:    v_cmp_ne_u32_e32 vcc, s5, v1
 ; GCN3-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; GCN3-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
@@ -10578,8 +10578,8 @@ define i64 @flat_atomic_or_i64_ret_offset__amdgpu_no_remote_memory(ptr %out, i64
 ; GCN3:       ; %bb.0:
 ; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    v_add_co_u32_e32 v4, vcc, 32, v0
-; GCN3-NEXT:    v_addc_co_u32_e32 v5, vcc, 0, v1, vcc
 ; GCN3-NEXT:    s_mov_b64 s[4:5], src_private_base
+; GCN3-NEXT:    v_addc_co_u32_e32 v5, vcc, 0, v1, vcc
 ; GCN3-NEXT:    v_cmp_ne_u32_e32 vcc, s5, v5
 ; GCN3-NEXT:    ; implicit-def: $vgpr0_vgpr1
 ; GCN3-NEXT:    s_and_saveexec_b64 s[4:5], vcc
@@ -10924,8 +10924,8 @@ define void @flat_atomic_xor_i64_noret_offset(ptr %out, i64 %in) {
 ; GCN3:       ; %bb.0:
 ; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    v_add_co_u32_e32 v0, vcc, 32, v0
-; GCN3-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
 ; GCN3-NEXT:    s_mov_b64 s[4:5], src_private_base
+; GCN3-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
 ; GCN3-NEXT:    v_cmp_ne_u32_e32 vcc, s5, v1
 ; GCN3-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; GCN3-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
@@ -11284,8 +11284,8 @@ define i64 @flat_atomic_xor_i64_ret_offset(ptr %out, i64 %in) {
 ; GCN3:       ; %bb.0:
 ; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    v_add_co_u32_e32 v4, vcc, 32, v0
-; GCN3-NEXT:    v_addc_co_u32_e32 v5, vcc, 0, v1, vcc
 ; GCN3-NEXT:    s_mov_b64 s[4:5], src_private_base
+; GCN3-NEXT:    v_addc_co_u32_e32 v5, vcc, 0, v1, vcc
 ; GCN3-NEXT:    v_cmp_ne_u32_e32 vcc, s5, v5
 ; GCN3-NEXT:    ; implicit-def: $vgpr0_vgpr1
 ; GCN3-NEXT:    s_and_saveexec_b64 s[4:5], vcc
@@ -11646,8 +11646,8 @@ define amdgpu_gfx void @flat_atomic_xor_i64_noret_offset_scalar(ptr inreg %out,
 ; GCN3:       ; %bb.0:
 ; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    s_add_u32 s34, s4, 32
-; GCN3-NEXT:    s_addc_u32 s35, s5, 0
 ; GCN3-NEXT:    s_mov_b64 s[36:37], src_private_base
+; GCN3-NEXT:    s_addc_u32 s35, s5, 0
 ; GCN3-NEXT:    s_cmp_eq_u32 s35, s37
 ; GCN3-NEXT:    s_cselect_b64 s[36:37], -1, 0
 ; GCN3-NEXT:    s_andn2_b64 vcc, exec, s[36:37]
@@ -11995,8 +11995,8 @@ define amdgpu_gfx i64 @flat_atomic_xor_i64_ret_offset_scalar(ptr inreg %out, i64
 ; GCN3:       ; %bb.0:
 ; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    s_add_u32 s34, s4, 32
-; GCN3-NEXT:    s_addc_u32 s35, s5, 0
 ; GCN3-NEXT:    s_mov_b64 s[36:37], src_private_base
+; GCN3-NEXT:    s_addc_u32 s35, s5, 0
 ; GCN3-NEXT:    s_cmp_eq_u32 s35, s37
 ; GCN3-NEXT:    s_cselect_b64 s[36:37], -1, 0
 ; GCN3-NEXT:    s_andn2_b64 vcc, exec, s[36:37]
@@ -12135,8 +12135,8 @@ define void @flat_atomic_xor_i64_noret_offset__amdgpu_no_remote_memory(ptr %out,
 ; GCN3:       ; %bb.0:
 ; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    v_add_co_u32_e32 v0, vcc, 32, v0
-; GCN3-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
 ; GCN3-NEXT:    s_mov_b64 s[4:5], src_private_base
+; GCN3-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
 ; GCN3-NEXT:    v_cmp_ne_u32_e32 vcc, s5, v1
 ; GCN3-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; GCN3-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
@@ -12265,8 +12265,8 @@ define i64 @flat_atomic_xor_i64_ret_offset__amdgpu_no_remote_memory(ptr %out, i6
 ; GCN3:       ; %bb.0:
 ; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    v_add_co_u32_e32 v4, vcc, 32, v0
-; GCN3-NEXT:    v_addc_co_u32_e32 v5, vcc, 0, v1, vcc
 ; GCN3-NEXT:    s_mov_b64 s[4:5], src_private_base
+; GCN3-NEXT:    v_addc_co_u32_e32 v5, vcc, 0, v1, vcc
 ; GCN3-NEXT:    v_cmp_ne_u32_e32 vcc, s5, v5
 ; GCN3-NEXT:    ; implicit-def: $vgpr0_vgpr1
 ; GCN3-NEXT:    s_and_saveexec_b64 s[4:5], vcc
@@ -12611,8 +12611,8 @@ define void @flat_atomic_max_i64_noret_offset(ptr %out, i64 %in) {
 ; GCN3:       ; %bb.0:
 ; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    v_add_co_u32_e32 v0, vcc, 32, v0
-; GCN3-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
 ; GCN3-NEXT:    s_mov_b64 s[4:5], src_private_base
+; GCN3-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
 ; GCN3-NEXT:    v_cmp_ne_u32_e32 vcc, s5, v1
 ; GCN3-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; GCN3-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
@@ -12971,8 +12971,8 @@ define i64 @flat_atomic_max_i64_ret_offset(ptr %out, i64 %in) {
 ; GCN3:       ; %bb.0:
 ; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    v_add_co_u32_e32 v4, vcc, 32, v0
-; GCN3-NEXT:    v_addc_co_u32_e32 v5, vcc, 0, v1, vcc
 ; GCN3-NEXT:    s_mov_b64 s[4:5], src_private_base
+; GCN3-NEXT:    v_addc_co_u32_e32 v5, vcc, 0, v1, vcc
 ; GCN3-NEXT:    v_cmp_ne_u32_e32 vcc, s5, v5
 ; GCN3-NEXT:    ; implicit-def: $vgpr0_vgpr1
 ; GCN3-NEXT:    s_and_saveexec_b64 s[4:5], vcc
@@ -13358,8 +13358,8 @@ define amdgpu_gfx void @flat_atomic_max_i64_noret_offset_scalar(ptr inreg %out,
 ; GCN3:       ; %bb.0:
 ; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    s_add_u32 s34, s4, 32
-; GCN3-NEXT:    s_addc_u32 s35, s5, 0
 ; GCN3-NEXT:    s_mov_b64 s[36:37], src_private_base
+; GCN3-NEXT:    s_addc_u32 s35, s5, 0
 ; GCN3-NEXT:    s_cmp_eq_u32 s35, s37
 ; GCN3-NEXT:    s_cselect_b64 s[36:37], -1, 0
 ; GCN3-NEXT:    s_andn2_b64 vcc, exec, s[36:37]
@@ -13737,8 +13737,8 @@ define amdgpu_gfx i64 @flat_atomic_max_i64_ret_offset_scalar(ptr inreg %out, i64
 ; GCN3:       ; %bb.0:
 ; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    s_add_u32 s34, s4, 32
-; GCN3-NEXT:    s_addc_u32 s35, s5, 0
 ; GCN3-NEXT:    s_mov_b64 s[36:37], src_private_base
+; GCN3-NEXT:    s_addc_u32 s35, s5, 0
 ; GCN3-NEXT:    s_cmp_eq_u32 s35, s37
 ; GCN3-NEXT:    s_cselect_b64 s[36:37], -1, 0
 ; GCN3-NEXT:    s_andn2_b64 vcc, exec, s[36:37]
@@ -13939,20 +13939,20 @@ define amdgpu_kernel void @atomic_max_i64_addr64_offset(ptr %out, i64 %in, i64 %
 ; GCN3:       ; %bb.0: ; %entry
 ; GCN3-NEXT:    s_mov_b32 s12, SCRATCH_RSRC_DWORD0
 ; GCN3-NEXT:    s_mov_b32 s13, SCRATCH_RSRC_DWORD1
-; GCN3-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
+; GCN3-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x34
 ; GCN3-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; GCN3-NEXT:    s_mov_b32 s14, -1
 ; GCN3-NEXT:    s_mov_b32 s15, 0xe00000
 ; GCN3-NEXT:    s_add_u32 s12, s12, s11
 ; GCN3-NEXT:    s_addc_u32 s13, s13, 0
 ; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN3-NEXT:    s_lshl_b64 s[6:7], s[6:7], 3
-; GCN3-NEXT:    s_add_u32 s0, s0, s6
-; GCN3-NEXT:    s_addc_u32 s1, s1, s7
+; GCN3-NEXT:    s_lshl_b64 s[4:5], s[8:9], 3
+; GCN3-NEXT:    s_add_u32 s0, s0, s4
+; GCN3-NEXT:    s_addc_u32 s1, s1, s5
 ; GCN3-NEXT:    s_add_u32 s0, s0, 32
-; GCN3-NEXT:    s_mov_b64 s[4:5], src_private_base
+; GCN3-NEXT:    s_mov_b64 s[6:7], src_private_base
 ; GCN3-NEXT:    s_addc_u32 s1, s1, 0
-; GCN3-NEXT:    s_cmp_eq_u32 s1, s5
+; GCN3-NEXT:    s_cmp_eq_u32 s1, s7
 ; GCN3-NEXT:    s_cselect_b64 s[4:5], -1, 0
 ; GCN3-NEXT:    s_andn2_b64 vcc, exec, s[4:5]
 ; GCN3-NEXT:    s_mov_b64 s[4:5], -1
@@ -14368,18 +14368,18 @@ define amdgpu_kernel void @atomic_max_i64_addr64(ptr %out, i64 %in, i64 %index)
 ; GCN3:       ; %bb.0: ; %entry
 ; GCN3-NEXT:    s_mov_b32 s12, SCRATCH_RSRC_DWORD0
 ; GCN3-NEXT:    s_mov_b32 s13, SCRATCH_RSRC_DWORD1
-; GCN3-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
+; GCN3-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x34
 ; GCN3-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; GCN3-NEXT:    s_mov_b32 s14, -1
 ; GCN3-NEXT:    s_mov_b32 s15, 0xe00000
 ; GCN3-NEXT:    s_add_u32 s12, s12, s11
 ; GCN3-NEXT:    s_addc_u32 s13, s13, 0
 ; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN3-NEXT:    s_lshl_b64 s[6:7], s[6:7], 3
-; GCN3-NEXT:    s_add_u32 s0, s0, s6
-; GCN3-NEXT:    s_mov_b64 s[4:5], src_private_base
-; GCN3-NEXT:    s_addc_u32 s1, s1, s7
-; GCN3-NEXT:    s_cmp_eq_u32 s1, s5
+; GCN3-NEXT:    s_lshl_b64 s[4:5], s[8:9], 3
+; GCN3-NEXT:    s_add_u32 s0, s0, s4
+; GCN3-NEXT:    s_mov_b64 s[6:7], src_private_base
+; GCN3-NEXT:    s_addc_u32 s1, s1, s5
+; GCN3-NEXT:    s_cmp_eq_u32 s1, s7
 ; GCN3-NEXT:    s_cselect_b64 s[4:5], -1, 0
 ; GCN3-NEXT:    s_andn2_b64 vcc, exec, s[4:5]
 ; GCN3-NEXT:    s_mov_b64 s[4:5], -1
@@ -14734,8 +14734,8 @@ define void @flat_atomic_max_i64_noret_offset__amdgpu_no_remote_memory(ptr %out,
 ; GCN3:       ; %bb.0:
 ; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    v_add_co_u32_e32 v0, vcc, 32, v0
-; GCN3-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
 ; GCN3-NEXT:    s_mov_b64 s[4:5], src_private_base
+; GCN3-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
 ; GCN3-NEXT:    v_cmp_ne_u32_e32 vcc, s5, v1
 ; GCN3-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; GCN3-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
@@ -14864,8 +14864,8 @@ define i64 @flat_atomic_max_i64_ret_offset__amdgpu_no_remote_memory(ptr %out, i6
 ; GCN3:       ; %bb.0:
 ; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    v_add_co_u32_e32 v4, vcc, 32, v0
-; GCN3-NEXT:    v_addc_co_u32_e32 v5, vcc, 0, v1, vcc
 ; GCN3-NEXT:    s_mov_b64 s[4:5], src_private_base
+; GCN3-NEXT:    v_addc_co_u32_e32 v5, vcc, 0, v1, vcc
 ; GCN3-NEXT:    v_cmp_ne_u32_e32 vcc, s5, v5
 ; GCN3-NEXT:    ; implicit-def: $vgpr0_vgpr1
 ; GCN3-NEXT:    s_and_saveexec_b64 s[4:5], vcc
@@ -15210,8 +15210,8 @@ define void @flat_atomic_umax_i64_noret_offset(ptr %out, i64 %in) {
 ; GCN3:       ; %bb.0:
 ; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    v_add_co_u32_e32 v0, vcc, 32, v0
-; GCN3-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
 ; GCN3-NEXT:    s_mov_b64 s[4:5], src_private_base
+; GCN3-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
 ; GCN3-NEXT:    v_cmp_ne_u32_e32 vcc, s5, v1
 ; GCN3-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; GCN3-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
@@ -15570,8 +15570,8 @@ define i64 @flat_atomic_umax_i64_ret_offset(ptr %out, i64 %in) {
 ; GCN3:       ; %bb.0:
 ; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    v_add_co_u32_e32 v4, vcc, 32, v0
-; GCN3-NEXT:    v_addc_co_u32_e32 v5, vcc, 0, v1, vcc
 ; GCN3-NEXT:    s_mov_b64 s[4:5], src_private_base
+; GCN3-NEXT:    v_addc_co_u32_e32 v5, vcc, 0, v1, vcc
 ; GCN3-NEXT:    v_cmp_ne_u32_e32 vcc, s5, v5
 ; GCN3-NEXT:    ; implicit-def: $vgpr0_vgpr1
 ; GCN3-NEXT:    s_and_saveexec_b64 s[4:5], vcc
@@ -15957,8 +15957,8 @@ define amdgpu_gfx void @flat_atomic_umax_i64_noret_offset_scalar(ptr inreg %out,
 ; GCN3:       ; %bb.0:
 ; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    s_add_u32 s34, s4, 32
-; GCN3-NEXT:    s_addc_u32 s35, s5, 0
 ; GCN3-NEXT:    s_mov_b64 s[36:37], src_private_base
+; GCN3-NEXT:    s_addc_u32 s35, s5, 0
 ; GCN3-NEXT:    s_cmp_eq_u32 s35, s37
 ; GCN3-NEXT:    s_cselect_b64 s[36:37], -1, 0
 ; GCN3-NEXT:    s_andn2_b64 vcc, exec, s[36:37]
@@ -16336,8 +16336,8 @@ define amdgpu_gfx i64 @flat_atomic_umax_i64_ret_offset_scalar(ptr inreg %out, i6
 ; GCN3:       ; %bb.0:
 ; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    s_add_u32 s34, s4, 32
-; GCN3-NEXT:    s_addc_u32 s35, s5, 0
 ; GCN3-NEXT:    s_mov_b64 s[36:37], src_private_base
+; GCN3-NEXT:    s_addc_u32 s35, s5, 0
 ; GCN3-NEXT:    s_cmp_eq_u32 s35, s37
 ; GCN3-NEXT:    s_cselect_b64 s[36:37], -1, 0
 ; GCN3-NEXT:    s_andn2_b64 vcc, exec, s[36:37]
@@ -16538,20 +16538,20 @@ define amdgpu_kernel void @atomic_umax_i64_addr64_offset(ptr %out, i64 %in, i64
 ; GCN3:       ; %bb.0: ; %entry
 ; GCN3-NEXT:    s_mov_b32 s12, SCRATCH_RSRC_DWORD0
 ; GCN3-NEXT:    s_mov_b32 s13, SCRATCH_RSRC_DWORD1
-; GCN3-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
+; GCN3-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x34
 ; GCN3-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; GCN3-NEXT:    s_mov_b32 s14, -1
 ; GCN3-NEXT:    s_mov_b32 s15, 0xe00000
 ; GCN3-NEXT:    s_add_u32 s12, s12, s11
 ; GCN3-NEXT:    s_addc_u32 s13, s13, 0
 ; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN3-NEXT:    s_lshl_b64 s[6:7], s[6:7], 3
-; GCN3-NEXT:    s_add_u32 s0, s0, s6
-; GCN3-NEXT:    s_addc_u32 s1, s1, s7
+; GCN3-NEXT:    s_lshl_b64 s[4:5], s[8:9], 3
+; GCN3-NEXT:    s_add_u32 s0, s0, s4
+; GCN3-NEXT:    s_addc_u32 s1, s1, s5
 ; GCN3-NEXT:    s_add_u32 s0, s0, 32
-; GCN3-NEXT:    s_mov_b64 s[4:5], src_private_base
+; GCN3-NEXT:    s_mov_b64 s[6:7], src_private_base
 ; GCN3-NEXT:    s_addc_u32 s1, s1, 0
-; GCN3-NEXT:    s_cmp_eq_u32 s1, s5
+; GCN3-NEXT:    s_cmp_eq_u32 s1, s7
 ; GCN3-NEXT:    s_cselect_b64 s[4:5], -1, 0
 ; GCN3-NEXT:    s_andn2_b64 vcc, exec, s[4:5]
 ; GCN3-NEXT:    s_mov_b64 s[4:5], -1
@@ -17124,8 +17124,8 @@ define void @flat_atomic_umax_i64_noret_offset__amdgpu_no_remote_memory(ptr %out
 ; GCN3:       ; %bb.0:
 ; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    v_add_co_u32_e32 v0, vcc, 32, v0
-; GCN3-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
 ; GCN3-NEXT:    s_mov_b64 s[4:5], src_private_base
+; GCN3-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
 ; GCN3-NEXT:    v_cmp_ne_u32_e32 vcc, s5, v1
 ; GCN3-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; GCN3-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
@@ -17254,8 +17254,8 @@ define i64 @flat_atomic_umax_i64_ret_offset__amdgpu_no_remote_memory(ptr %out, i
 ; GCN3:       ; %bb.0:
 ; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    v_add_co_u32_e32 v4, vcc, 32, v0
-; GCN3-NEXT:    v_addc_co_u32_e32 v5, vcc, 0, v1, vcc
 ; GCN3-NEXT:    s_mov_b64 s[4:5], src_private_base
+; GCN3-NEXT:    v_addc_co_u32_e32 v5, vcc, 0, v1, vcc
 ; GCN3-NEXT:    v_cmp_ne_u32_e32 vcc, s5, v5
 ; GCN3-NEXT:    ; implicit-def: $vgpr0_vgpr1
 ; GCN3-NEXT:    s_and_saveexec_b64 s[4:5], vcc
@@ -17600,8 +17600,8 @@ define void @flat_atomic_umin_i64_noret_offset(ptr %out, i64 %in) {
 ; GCN3:       ; %bb.0:
 ; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    v_add_co_u32_e32 v0, vcc, 32, v0
-; GCN3-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
 ; GCN3-NEXT:    s_mov_b64 s[4:5], src_private_base
+; GCN3-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
 ; GCN3-NEXT:    v_cmp_ne_u32_e32 vcc, s5, v1
 ; GCN3-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; GCN3-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
@@ -17960,8 +17960,8 @@ define i64 @flat_atomic_umin_i64_ret_offset(ptr %out, i64 %in) {
 ; GCN3:       ; %bb.0:
 ; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    v_add_co_u32_e32 v4, vcc, 32, v0
-; GCN3-NEXT:    v_addc_co_u32_e32 v5, vcc, 0, v1, vcc
 ; GCN3-NEXT:    s_mov_b64 s[4:5], src_private_base
+; GCN3-NEXT:    v_addc_co_u32_e32 v5, vcc, 0, v1, vcc
 ; GCN3-NEXT:    v_cmp_ne_u32_e32 vcc, s5, v5
 ; GCN3-NEXT:    ; implicit-def: $vgpr0_vgpr1
 ; GCN3-NEXT:    s_and_saveexec_b64 s[4:5], vcc
@@ -18347,8 +18347,8 @@ define amdgpu_gfx void @flat_atomic_umin_i64_noret_offset_scalar(ptr inreg %out,
 ; GCN3:       ; %bb.0:
 ; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    s_add_u32 s34, s4, 32
-; GCN3-NEXT:    s_addc_u32 s35, s5, 0
 ; GCN3-NEXT:    s_mov_b64 s[36:37], src_private_base
+; GCN3-NEXT:    s_addc_u32 s35, s5, 0
 ; GCN3-NEXT:    s_cmp_eq_u32 s35, s37
 ; GCN3-NEXT:    s_cselect_b64 s[36:37], -1, 0
 ; GCN3-NEXT:    s_andn2_b64 vcc, exec, s[36:37]
@@ -18726,8 +18726,8 @@ define amdgpu_gfx i64 @flat_atomic_umin_i64_ret_offset_scalar(ptr inreg %out, i6
 ; GCN3:       ; %bb.0:
 ; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    s_add_u32 s34, s4, 32
-; GCN3-NEXT:    s_addc_u32 s35, s5, 0
 ; GCN3-NEXT:    s_mov_b64 s[36:37], src_private_base
+; GCN3-NEXT:    s_addc_u32 s35, s5, 0
 ; GCN3-NEXT:    s_cmp_eq_u32 s35, s37
 ; GCN3-NEXT:    s_cselect_b64 s[36:37], -1, 0
 ; GCN3-NEXT:    s_andn2_b64 vcc, exec, s[36:37]
@@ -18871,8 +18871,8 @@ define void @flat_atomic_umin_i64_noret_offset__amdgpu_no_remote_memory(ptr %out
 ; GCN3:       ; %bb.0:
 ; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    v_add_co_u32_e32 v0, vcc, 32, v0
-; GCN3-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
 ; GCN3-NEXT:    s_mov_b64 s[4:5], src_private_base
+; GCN3-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
 ; GCN3-NEXT:    v_cmp_ne_u32_e32 vcc, s5, v1
 ; GCN3-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; GCN3-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
@@ -19001,8 +19001,8 @@ define i64 @flat_atomic_umin_i64_ret_offset__amdgpu_no_remote_memory(ptr %out, i
 ; GCN3:       ; %bb.0:
 ; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    v_add_co_u32_e32 v4, vcc, 32, v0
-; GCN3-NEXT:    v_addc_co_u32_e32 v5, vcc, 0, v1, vcc
 ; GCN3-NEXT:    s_mov_b64 s[4:5], src_private_base
+; GCN3-NEXT:    v_addc_co_u32_e32 v5, vcc, 0, v1, vcc
 ; GCN3-NEXT:    v_cmp_ne_u32_e32 vcc, s5, v5
 ; GCN3-NEXT:    ; implicit-def: $vgpr0_vgpr1
 ; GCN3-NEXT:    s_and_saveexec_b64 s[4:5], vcc
@@ -19347,8 +19347,8 @@ define void @flat_atomic_min_i64_noret_offset(ptr %out, i64 %in) {
 ; GCN3:       ; %bb.0:
 ; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    v_add_co_u32_e32 v0, vcc, 32, v0
-; GCN3-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
 ; GCN3-NEXT:    s_mov_b64 s[4:5], src_private_base
+; GCN3-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
 ; GCN3-NEXT:    v_cmp_ne_u32_e32 vcc, s5, v1
 ; GCN3-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; GCN3-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
@@ -19707,8 +19707,8 @@ define i64 @flat_atomic_min_i64_ret_offset(ptr %out, i64 %in) {
 ; GCN3:       ; %bb.0:
 ; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    v_add_co_u32_e32 v4, vcc, 32, v0
-; GCN3-NEXT:    v_addc_co_u32_e32 v5, vcc, 0, v1, vcc
 ; GCN3-NEXT:    s_mov_b64 s[4:5], src_private_base
+; GCN3-NEXT:    v_addc_co_u32_e32 v5, vcc, 0, v1, vcc
 ; GCN3-NEXT:    v_cmp_ne_u32_e32 vcc, s5, v5
 ; GCN3-NEXT:    ; implicit-def: $vgpr0_vgpr1
 ; GCN3-NEXT:    s_and_saveexec_b64 s[4:5], vcc
@@ -20094,8 +20094,8 @@ define amdgpu_gfx void @flat_atomic_min_i64_noret_offset_scalar(ptr inreg %out,
 ; GCN3:       ; %bb.0:
 ; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    s_add_u32 s34, s4, 32
-; GCN3-NEXT:    s_addc_u32 s35, s5, 0
 ; GCN3-NEXT:    s_mov_b64 s[36:37], src_private_base
+; GCN3-NEXT:    s_addc_u32 s35, s5, 0
 ; GCN3-NEXT:    s_cmp_eq_u32 s35, s37
 ; GCN3-NEXT:    s_cselect_b64 s[36:37], -1, 0
 ; GCN3-NEXT:    s_andn2_b64 vcc, exec, s[36:37]
@@ -20473,8 +20473,8 @@ define amdgpu_gfx i64 @flat_atomic_min_i64_ret_offset_scalar(ptr inreg %out, i64
 ; GCN3:       ; %bb.0:
 ; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    s_add_u32 s34, s4, 32
-; GCN3-NEXT:    s_addc_u32 s35, s5, 0
 ; GCN3-NEXT:    s_mov_b64 s[36:37], src_private_base
+; GCN3-NEXT:    s_addc_u32 s35, s5, 0
 ; GCN3-NEXT:    s_cmp_eq_u32 s35, s37
 ; GCN3-NEXT:    s_cselect_b64 s[36:37], -1, 0
 ; GCN3-NEXT:    s_andn2_b64 vcc, exec, s[36:37]
@@ -20675,20 +20675,20 @@ define amdgpu_kernel void @atomic_min_i64_addr64_offset(ptr %out, i64 %in, i64 %
 ; GCN3:       ; %bb.0: ; %entry
 ; GCN3-NEXT:    s_mov_b32 s12, SCRATCH_RSRC_DWORD0
 ; GCN3-NEXT:    s_mov_b32 s13, SCRATCH_RSRC_DWORD1
-; GCN3-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
+; GCN3-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x34
 ; GCN3-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; GCN3-NEXT:    s_mov_b32 s14, -1
 ; GCN3-NEXT:    s_mov_b32 s15, 0xe00000
 ; GCN3-NEXT:    s_add_u32 s12, s12, s11
 ; GCN3-NEXT:    s_addc_u32 s13, s13, 0
 ; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN3-NEXT:    s_lshl_b64 s[6:7], s[6:7], 3
-; GCN3-NEXT:    s_add_u32 s0, s0, s6
-; GCN3-NEXT:    s_addc_u32 s1, s1, s7
+; GCN3-NEXT:    s_lshl_b64 s[4:5], s[8:9], 3
+; GCN3-NEXT:    s_add_u32 s0, s0, s4
+; GCN3-NEXT:    s_addc_u32 s1, s1, s5
 ; GCN3-NEXT:    s_add_u32 s0, s0, 32
-; GCN3-NEXT:    s_mov_b64 s[4:5], src_private_base
+; GCN3-NEXT:    s_mov_b64 s[6:7], src_private_base
 ; GCN3-NEXT:    s_addc_u32 s1, s1, 0
-; GCN3-NEXT:    s_cmp_eq_u32 s1, s5
+; GCN3-NEXT:    s_cmp_eq_u32 s1, s7
 ; GCN3-NEXT:    s_cselect_b64 s[4:5], -1, 0
 ; GCN3-NEXT:    s_andn2_b64 vcc, exec, s[4:5]
 ; GCN3-NEXT:    s_mov_b64 s[4:5], -1
@@ -21101,9 +21101,9 @@ define amdgpu_kernel void @atomic_min_i64(ptr %out, i64 %in) {
 ; GCN3-NEXT:    s_mov_b32 s15, 0xe00000
 ; GCN3-NEXT:    s_add_u32 s12, s12, s11
 ; GCN3-NEXT:    s_addc_u32 s13, s13, 0
-; GCN3-NEXT:    s_mov_b64 s[4:5], src_private_base
+; GCN3-NEXT:    s_mov_b64 s[6:7], src_private_base
 ; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN3-NEXT:    s_cmp_eq_u32 s1, s5
+; GCN3-NEXT:    s_cmp_eq_u32 s1, s7
 ; GCN3-NEXT:    s_cselect_b64 s[4:5], -1, 0
 ; GCN3-NEXT:    s_andn2_b64 vcc, exec, s[4:5]
 ; GCN3-NEXT:    s_mov_b64 s[4:5], -1
@@ -21457,8 +21457,8 @@ define void @flat_atomic_min_i64_noret_offset__amdgpu_no_remote_memory(ptr %out,
 ; GCN3:       ; %bb.0:
 ; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    v_add_co_u32_e32 v0, vcc, 32, v0
-; GCN3-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
 ; GCN3-NEXT:    s_mov_b64 s[4:5], src_private_base
+; GCN3-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
 ; GCN3-NEXT:    v_cmp_ne_u32_e32 vcc, s5, v1
 ; GCN3-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; GCN3-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
@@ -21587,8 +21587,8 @@ define i64 @flat_atomic_min_i64_ret_offset__amdgpu_no_remote_memory(ptr %out, i6
 ; GCN3:       ; %bb.0:
 ; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    v_add_co_u32_e32 v4, vcc, 32, v0
-; GCN3-NEXT:    v_addc_co_u32_e32 v5, vcc, 0, v1, vcc
 ; GCN3-NEXT:    s_mov_b64 s[4:5], src_private_base
+; GCN3-NEXT:    v_addc_co_u32_e32 v5, vcc, 0, v1, vcc
 ; GCN3-NEXT:    v_cmp_ne_u32_e32 vcc, s5, v5
 ; GCN3-NEXT:    ; implicit-def: $vgpr0_vgpr1
 ; GCN3-NEXT:    s_and_saveexec_b64 s[4:5], vcc
@@ -21958,8 +21958,8 @@ define void @flat_atomic_uinc_wrap_i64_noret_offset(ptr %out, i64 %in) {
 ; GCN3:       ; %bb.0:
 ; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    v_add_co_u32_e32 v0, vcc, 32, v0
-; GCN3-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
 ; GCN3-NEXT:    s_mov_b64 s[4:5], src_private_base
+; GCN3-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
 ; GCN3-NEXT:    v_cmp_ne_u32_e32 vcc, s5, v1
 ; GCN3-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; GCN3-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
@@ -22348,8 +22348,8 @@ define i64 @flat_atomic_uinc_wrap_i64_ret_offset(ptr %out, i64 %in) {
 ; GCN3:       ; %bb.0:
 ; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    v_add_co_u32_e32 v4, vcc, 32, v0
-; GCN3-NEXT:    v_addc_co_u32_e32 v5, vcc, 0, v1, vcc
 ; GCN3-NEXT:    s_mov_b64 s[4:5], src_private_base
+; GCN3-NEXT:    v_addc_co_u32_e32 v5, vcc, 0, v1, vcc
 ; GCN3-NEXT:    v_cmp_ne_u32_e32 vcc, s5, v5
 ; GCN3-NEXT:    ; implicit-def: $vgpr0_vgpr1
 ; GCN3-NEXT:    s_and_saveexec_b64 s[4:5], vcc
@@ -22745,8 +22745,8 @@ define amdgpu_gfx void @flat_atomic_uinc_wrap_i64_noret_offset_scalar(ptr inreg
 ; GCN3:       ; %bb.0:
 ; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    s_add_u32 s34, s4, 32
-; GCN3-NEXT:    s_addc_u32 s35, s5, 0
 ; GCN3-NEXT:    s_mov_b64 s[36:37], src_private_base
+; GCN3-NEXT:    s_addc_u32 s35, s5, 0
 ; GCN3-NEXT:    s_cmp_eq_u32 s35, s37
 ; GCN3-NEXT:    s_cselect_b64 s[36:37], -1, 0
 ; GCN3-NEXT:    s_andn2_b64 vcc, exec, s[36:37]
@@ -23130,8 +23130,8 @@ define amdgpu_gfx i64 @flat_atomic_uinc_wrap_i64_ret_offset_scalar(ptr inreg %ou
 ; GCN3:       ; %bb.0:
 ; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    s_add_u32 s34, s4, 32
-; GCN3-NEXT:    s_addc_u32 s35, s5, 0
 ; GCN3-NEXT:    s_mov_b64 s[36:37], src_private_base
+; GCN3-NEXT:    s_addc_u32 s35, s5, 0
 ; GCN3-NEXT:    s_cmp_eq_u32 s35, s37
 ; GCN3-NEXT:    s_cselect_b64 s[36:37], -1, 0
 ; GCN3-NEXT:    s_andn2_b64 vcc, exec, s[36:37]
@@ -23282,8 +23282,8 @@ define void @flat_atomic_uinc_wrap_i64_noret_offset__amdgpu_no_remote_memory(ptr
 ; GCN3:       ; %bb.0:
 ; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    v_add_co_u32_e32 v0, vcc, 32, v0
-; GCN3-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
 ; GCN3-NEXT:    s_mov_b64 s[4:5], src_private_base
+; GCN3-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
 ; GCN3-NEXT:    v_cmp_ne_u32_e32 vcc, s5, v1
 ; GCN3-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; GCN3-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
@@ -23421,8 +23421,8 @@ define i64 @flat_atomic_uinc_wrap_i64_ret_offset__amdgpu_no_remote_memory(ptr %o
 ; GCN3:       ; %bb.0:
 ; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    v_add_co_u32_e32 v4, vcc, 32, v0
-; GCN3-NEXT:    v_addc_co_u32_e32 v5, vcc, 0, v1, vcc
 ; GCN3-NEXT:    s_mov_b64 s[4:5], src_private_base
+; GCN3-NEXT:    v_addc_co_u32_e32 v5, vcc, 0, v1, vcc
 ; GCN3-NEXT:    v_cmp_ne_u32_e32 vcc, s5, v5
 ; GCN3-NEXT:    ; implicit-def: $vgpr0_vgpr1
 ; GCN3-NEXT:    s_and_saveexec_b64 s[4:5], vcc
@@ -23810,8 +23810,8 @@ define void @flat_atomic_udec_wrap_i64_noret_offset(ptr %out, i64 %in) {
 ; GCN3:       ; %bb.0:
 ; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    v_add_co_u32_e32 v0, vcc, 32, v0
-; GCN3-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
 ; GCN3-NEXT:    s_mov_b64 s[4:5], src_private_base
+; GCN3-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
 ; GCN3-NEXT:    v_cmp_ne_u32_e32 vcc, s5, v1
 ; GCN3-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; GCN3-NEXT:    s_xor_b64 s[8:9], exec, s[4:5]
@@ -24223,8 +24223,8 @@ define i64 @flat_atomic_udec_wrap_i64_ret_offset(ptr %out, i64 %in) {
 ; GCN3:       ; %bb.0:
 ; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    v_add_co_u32_e32 v4, vcc, 32, v0
-; GCN3-NEXT:    v_addc_co_u32_e32 v5, vcc, 0, v1, vcc
 ; GCN3-NEXT:    s_mov_b64 s[4:5], src_private_base
+; GCN3-NEXT:    v_addc_co_u32_e32 v5, vcc, 0, v1, vcc
 ; GCN3-NEXT:    v_cmp_ne_u32_e32 vcc, s5, v5
 ; GCN3-NEXT:    ; implicit-def: $vgpr0_vgpr1
 ; GCN3-NEXT:    s_and_saveexec_b64 s[4:5], vcc
@@ -24659,8 +24659,8 @@ define amdgpu_gfx void @flat_atomic_udec_wrap_i64_noret_offset_scalar(ptr inreg
 ; GCN3:       ; %bb.0:
 ; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    s_add_u32 s38, s4, 32
-; GCN3-NEXT:    s_addc_u32 s39, s5, 0
 ; GCN3-NEXT:    s_mov_b64 s[34:35], src_private_base
+; GCN3-NEXT:    s_addc_u32 s39, s5, 0
 ; GCN3-NEXT:    s_cmp_eq_u32 s39, s35
 ; GCN3-NEXT:    s_cselect_b64 s[34:35], -1, 0
 ; GCN3-NEXT:    s_andn2_b64 vcc, exec, s[34:35]
@@ -25091,8 +25091,8 @@ define amdgpu_gfx i64 @flat_atomic_udec_wrap_i64_ret_offset_scalar(ptr inreg %ou
 ; GCN3:       ; %bb.0:
 ; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    s_add_u32 s38, s4, 32
-; GCN3-NEXT:    s_addc_u32 s39, s5, 0
 ; GCN3-NEXT:    s_mov_b64 s[34:35], src_private_base
+; GCN3-NEXT:    s_addc_u32 s39, s5, 0
 ; GCN3-NEXT:    s_cmp_eq_u32 s39, s35
 ; GCN3-NEXT:    s_cselect_b64 s[34:35], -1, 0
 ; GCN3-NEXT:    s_andn2_b64 vcc, exec, s[34:35]
@@ -25253,8 +25253,8 @@ define void @flat_atomic_udec_wrap_i64_noret_offset__amdgpu_no_remote_memory(ptr
 ; GCN3:       ; %bb.0:
 ; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    v_add_co_u32_e32 v0, vcc, 32, v0
-; GCN3-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
 ; GCN3-NEXT:    s_mov_b64 s[4:5], src_private_base
+; GCN3-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
 ; GCN3-NEXT:    v_cmp_ne_u32_e32 vcc, s5, v1
 ; GCN3-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; GCN3-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
@@ -25397,8 +25397,8 @@ define i64 @flat_atomic_udec_wrap_i64_ret_offset__amdgpu_no_remote_memory(ptr %o
 ; GCN3:       ; %bb.0:
 ; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    v_add_co_u32_e32 v4, vcc, 32, v0
-; GCN3-NEXT:    v_addc_co_u32_e32 v5, vcc, 0, v1, vcc
 ; GCN3-NEXT:    s_mov_b64 s[4:5], src_private_base
+; GCN3-NEXT:    v_addc_co_u32_e32 v5, vcc, 0, v1, vcc
 ; GCN3-NEXT:    v_cmp_ne_u32_e32 vcc, s5, v5
 ; GCN3-NEXT:    ; implicit-def: $vgpr0_vgpr1
 ; GCN3-NEXT:    s_and_saveexec_b64 s[4:5], vcc

diff  --git a/llvm/test/CodeGen/AMDGPU/gep-const-address-space.ll b/llvm/test/CodeGen/AMDGPU/gep-const-address-space.ll
index 9db760077a853..872c2cf569dcc 100644
--- a/llvm/test/CodeGen/AMDGPU/gep-const-address-space.ll
+++ b/llvm/test/CodeGen/AMDGPU/gep-const-address-space.ll
@@ -6,21 +6,21 @@ define protected amdgpu_kernel void @IllegalGEPConst(i32 %a, ptr addrspace(1) %b
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_mov_b32 s12, SCRATCH_RSRC_DWORD0
 ; CHECK-NEXT:    s_mov_b32 s13, SCRATCH_RSRC_DWORD1
-; CHECK-NEXT:    s_load_dword s6, s[4:5], 0x24
+; CHECK-NEXT:    s_load_dword s8, s[4:5], 0x24
 ; CHECK-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x2c
 ; CHECK-NEXT:    s_mov_b32 s14, -1
 ; CHECK-NEXT:    s_mov_b32 s15, 0xe00000
 ; CHECK-NEXT:    s_add_u32 s12, s12, s11
 ; CHECK-NEXT:    s_addc_u32 s13, s13, 0
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
-; CHECK-NEXT:    s_ashr_i32 s7, s6, 31
-; CHECK-NEXT:    s_lshl_b64 s[6:7], s[6:7], 3
-; CHECK-NEXT:    s_add_u32 s0, s0, s6
-; CHECK-NEXT:    s_addc_u32 s1, s1, s7
+; CHECK-NEXT:    s_ashr_i32 s9, s8, 31
+; CHECK-NEXT:    s_lshl_b64 s[4:5], s[8:9], 3
+; CHECK-NEXT:    s_add_u32 s0, s0, s4
+; CHECK-NEXT:    s_addc_u32 s1, s1, s5
 ; CHECK-NEXT:    s_add_u32 s0, s0, -8
-; CHECK-NEXT:    s_mov_b64 s[4:5], src_shared_base
+; CHECK-NEXT:    s_mov_b64 s[6:7], src_shared_base
 ; CHECK-NEXT:    s_addc_u32 s1, s1, -1
-; CHECK-NEXT:    s_cmp_eq_u32 s1, s5
+; CHECK-NEXT:    s_cmp_eq_u32 s1, s7
 ; CHECK-NEXT:    s_cselect_b64 s[4:5], -1, 0
 ; CHECK-NEXT:    s_andn2_b64 vcc, exec, s[4:5]
 ; CHECK-NEXT:    s_mov_b64 s[4:5], -1

diff  --git a/llvm/test/CodeGen/AMDGPU/implicit-kernarg-backend-usage.ll b/llvm/test/CodeGen/AMDGPU/implicit-kernarg-backend-usage.ll
index ec80efc5f0362..2daed9b69384f 100644
--- a/llvm/test/CodeGen/AMDGPU/implicit-kernarg-backend-usage.ll
+++ b/llvm/test/CodeGen/AMDGPU/implicit-kernarg-backend-usage.ll
@@ -56,19 +56,19 @@ define amdgpu_kernel void @addrspacecast(ptr addrspace(5) %ptr.private, ptr addr
 ;
 ; GFX9V4-LABEL: addrspacecast:
 ; GFX9V4:       ; %bb.0:
-; GFX9V4-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
-; GFX9V4-NEXT:    s_mov_b64 s[2:3], src_private_base
-; GFX9V4-NEXT:    s_mov_b64 s[4:5], src_shared_base
+; GFX9V4-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX9V4-NEXT:    s_mov_b64 s[0:1], src_private_base
+; GFX9V4-NEXT:    s_mov_b64 s[2:3], src_shared_base
 ; GFX9V4-NEXT:    v_mov_b32_e32 v4, 1
 ; GFX9V4-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9V4-NEXT:    s_cmp_lg_u32 s0, -1
-; GFX9V4-NEXT:    s_cselect_b32 s2, s3, 0
-; GFX9V4-NEXT:    s_cselect_b32 s0, s0, 0
-; GFX9V4-NEXT:    s_cmp_lg_u32 s1, -1
-; GFX9V4-NEXT:    v_mov_b32_e32 v0, s0
-; GFX9V4-NEXT:    v_mov_b32_e32 v1, s2
-; GFX9V4-NEXT:    s_cselect_b32 s0, s5, 0
-; GFX9V4-NEXT:    s_cselect_b32 s1, s1, 0
+; GFX9V4-NEXT:    s_cmp_lg_u32 s4, -1
+; GFX9V4-NEXT:    s_cselect_b32 s0, s1, 0
+; GFX9V4-NEXT:    s_cselect_b32 s1, s4, 0
+; GFX9V4-NEXT:    s_cmp_lg_u32 s5, -1
+; GFX9V4-NEXT:    v_mov_b32_e32 v0, s1
+; GFX9V4-NEXT:    v_mov_b32_e32 v1, s0
+; GFX9V4-NEXT:    s_cselect_b32 s0, s3, 0
+; GFX9V4-NEXT:    s_cselect_b32 s1, s5, 0
 ; GFX9V4-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX9V4-NEXT:    v_mov_b32_e32 v3, s0
 ; GFX9V4-NEXT:    flat_store_dword v[0:1], v4
@@ -80,19 +80,19 @@ define amdgpu_kernel void @addrspacecast(ptr addrspace(5) %ptr.private, ptr addr
 ;
 ; GFX9V5-LABEL: addrspacecast:
 ; GFX9V5:       ; %bb.0:
-; GFX9V5-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
-; GFX9V5-NEXT:    s_mov_b64 s[2:3], src_private_base
-; GFX9V5-NEXT:    s_mov_b64 s[4:5], src_shared_base
+; GFX9V5-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX9V5-NEXT:    s_mov_b64 s[0:1], src_private_base
+; GFX9V5-NEXT:    s_mov_b64 s[2:3], src_shared_base
 ; GFX9V5-NEXT:    v_mov_b32_e32 v4, 1
 ; GFX9V5-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9V5-NEXT:    s_cmp_lg_u32 s0, -1
-; GFX9V5-NEXT:    s_cselect_b32 s2, s3, 0
-; GFX9V5-NEXT:    s_cselect_b32 s0, s0, 0
-; GFX9V5-NEXT:    s_cmp_lg_u32 s1, -1
-; GFX9V5-NEXT:    v_mov_b32_e32 v0, s0
-; GFX9V5-NEXT:    v_mov_b32_e32 v1, s2
-; GFX9V5-NEXT:    s_cselect_b32 s0, s5, 0
-; GFX9V5-NEXT:    s_cselect_b32 s1, s1, 0
+; GFX9V5-NEXT:    s_cmp_lg_u32 s4, -1
+; GFX9V5-NEXT:    s_cselect_b32 s0, s1, 0
+; GFX9V5-NEXT:    s_cselect_b32 s1, s4, 0
+; GFX9V5-NEXT:    s_cmp_lg_u32 s5, -1
+; GFX9V5-NEXT:    v_mov_b32_e32 v0, s1
+; GFX9V5-NEXT:    v_mov_b32_e32 v1, s0
+; GFX9V5-NEXT:    s_cselect_b32 s0, s3, 0
+; GFX9V5-NEXT:    s_cselect_b32 s1, s5, 0
 ; GFX9V5-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX9V5-NEXT:    v_mov_b32_e32 v3, s0
 ; GFX9V5-NEXT:    flat_store_dword v[0:1], v4
@@ -136,10 +136,10 @@ define amdgpu_kernel void @llvm_amdgcn_is_shared(ptr %ptr) #0 {
 ;
 ; GFX9V4-LABEL: llvm_amdgcn_is_shared:
 ; GFX9V4:       ; %bb.0:
-; GFX9V4-NEXT:    s_load_dword s2, s[8:9], 0x4
 ; GFX9V4-NEXT:    s_mov_b64 s[0:1], src_shared_base
+; GFX9V4-NEXT:    s_load_dword s0, s[8:9], 0x4
 ; GFX9V4-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9V4-NEXT:    s_cmp_eq_u32 s2, s1
+; GFX9V4-NEXT:    s_cmp_eq_u32 s0, s1
 ; GFX9V4-NEXT:    s_cselect_b64 s[0:1], -1, 0
 ; GFX9V4-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[0:1]
 ; GFX9V4-NEXT:    global_store_dword v[0:1], v0, off
@@ -148,10 +148,10 @@ define amdgpu_kernel void @llvm_amdgcn_is_shared(ptr %ptr) #0 {
 ;
 ; GFX9V5-LABEL: llvm_amdgcn_is_shared:
 ; GFX9V5:       ; %bb.0:
-; GFX9V5-NEXT:    s_load_dword s2, s[8:9], 0x4
 ; GFX9V5-NEXT:    s_mov_b64 s[0:1], src_shared_base
+; GFX9V5-NEXT:    s_load_dword s0, s[8:9], 0x4
 ; GFX9V5-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9V5-NEXT:    s_cmp_eq_u32 s2, s1
+; GFX9V5-NEXT:    s_cmp_eq_u32 s0, s1
 ; GFX9V5-NEXT:    s_cselect_b64 s[0:1], -1, 0
 ; GFX9V5-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[0:1]
 ; GFX9V5-NEXT:    global_store_dword v[0:1], v0, off
@@ -190,10 +190,10 @@ define amdgpu_kernel void @llvm_amdgcn_is_private(ptr %ptr) #0 {
 ;
 ; GFX9V4-LABEL: llvm_amdgcn_is_private:
 ; GFX9V4:       ; %bb.0:
-; GFX9V4-NEXT:    s_load_dword s2, s[8:9], 0x4
 ; GFX9V4-NEXT:    s_mov_b64 s[0:1], src_private_base
+; GFX9V4-NEXT:    s_load_dword s0, s[8:9], 0x4
 ; GFX9V4-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9V4-NEXT:    s_cmp_eq_u32 s2, s1
+; GFX9V4-NEXT:    s_cmp_eq_u32 s0, s1
 ; GFX9V4-NEXT:    s_cselect_b64 s[0:1], -1, 0
 ; GFX9V4-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[0:1]
 ; GFX9V4-NEXT:    global_store_dword v[0:1], v0, off
@@ -202,10 +202,10 @@ define amdgpu_kernel void @llvm_amdgcn_is_private(ptr %ptr) #0 {
 ;
 ; GFX9V5-LABEL: llvm_amdgcn_is_private:
 ; GFX9V5:       ; %bb.0:
-; GFX9V5-NEXT:    s_load_dword s2, s[8:9], 0x4
 ; GFX9V5-NEXT:    s_mov_b64 s[0:1], src_private_base
+; GFX9V5-NEXT:    s_load_dword s0, s[8:9], 0x4
 ; GFX9V5-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9V5-NEXT:    s_cmp_eq_u32 s2, s1
+; GFX9V5-NEXT:    s_cmp_eq_u32 s0, s1
 ; GFX9V5-NEXT:    s_cselect_b64 s[0:1], -1, 0
 ; GFX9V5-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[0:1]
 ; GFX9V5-NEXT:    global_store_dword v[0:1], v0, off

diff  --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.is.private.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.is.private.ll
index 9e1815b48abfd..0d6f84edcb487 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.is.private.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.is.private.ll
@@ -46,12 +46,12 @@ define amdgpu_kernel void @is_private_vgpr(ptr addrspace(1) %ptr.ptr) {
 ;
 ; GFX9-LABEL: is_private_vgpr:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
+; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[8:9], 0x0
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
+; GFX9-NEXT:    s_mov_b64 s[0:1], src_private_base
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dwordx2 v[0:1], v0, s[0:1] glc
+; GFX9-NEXT:    global_load_dwordx2 v[0:1], v0, s[2:3] glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    s_mov_b64 s[0:1], src_private_base
 ; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, s1, v1
 ; GFX9-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
 ; GFX9-NEXT:    global_store_dword v[0:1], v0, off
@@ -79,13 +79,12 @@ define amdgpu_kernel void @is_private_vgpr(ptr addrspace(1) %ptr.ptr) {
 ;
 ; GFX10-LABEL: is_private_vgpr:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
+; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[8:9], 0x0
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
+; GFX10-NEXT:    s_mov_b64 s[0:1], src_private_base
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    global_load_dwordx2 v[0:1], v0, s[0:1] glc dlc
+; GFX10-NEXT:    global_load_dwordx2 v[0:1], v0, s[2:3] glc dlc
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX10-NEXT:    s_mov_b64 s[0:1], src_private_base
 ; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, s1, v1
 ; GFX10-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
 ; GFX10-NEXT:    global_store_dword v[0:1], v0, off
@@ -93,14 +92,14 @@ define amdgpu_kernel void @is_private_vgpr(ptr addrspace(1) %ptr.ptr) {
 ;
 ; GFX11-LABEL: is_private_vgpr:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
 ; GFX11-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT:    s_mov_b64 s[0:1], src_private_base
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    global_load_b64 v[0:1], v0, s[0:1] glc dlc
+; GFX11-NEXT:    global_load_b64 v[0:1], v0, s[2:3] glc dlc
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    s_mov_b64 s[0:1], src_private_base
 ; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, s1, v1
 ; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
 ; GFX11-NEXT:    global_store_b32 v[0:1], v0, off
@@ -156,10 +155,10 @@ define amdgpu_kernel void @is_private_sgpr(ptr %ptr) {
 ;
 ; GFX9-SDAG-LABEL: is_private_sgpr:
 ; GFX9-SDAG:       ; %bb.0:
-; GFX9-SDAG-NEXT:    s_load_dword s2, s[8:9], 0x4
 ; GFX9-SDAG-NEXT:    s_mov_b64 s[0:1], src_private_base
+; GFX9-SDAG-NEXT:    s_load_dword s0, s[8:9], 0x4
 ; GFX9-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-SDAG-NEXT:    s_cmp_eq_u32 s2, s1
+; GFX9-SDAG-NEXT:    s_cmp_eq_u32 s0, s1
 ; GFX9-SDAG-NEXT:    s_cselect_b64 s[0:1], -1, 0
 ; GFX9-SDAG-NEXT:    s_andn2_b64 vcc, exec, s[0:1]
 ; GFX9-SDAG-NEXT:    s_cbranch_vccnz .LBB1_2
@@ -190,10 +189,10 @@ define amdgpu_kernel void @is_private_sgpr(ptr %ptr) {
 ;
 ; GFX9-GISEL-LABEL: is_private_sgpr:
 ; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
-; GFX9-GISEL-NEXT:    s_mov_b64 s[2:3], src_private_base
+; GFX9-GISEL-NEXT:    s_load_dwordx2 s[2:3], s[8:9], 0x0
+; GFX9-GISEL-NEXT:    s_mov_b64 s[0:1], src_private_base
 ; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT:    s_cmp_lg_u32 s1, s3
+; GFX9-GISEL-NEXT:    s_cmp_lg_u32 s3, s1
 ; GFX9-GISEL-NEXT:    s_cbranch_scc1 .LBB1_2
 ; GFX9-GISEL-NEXT:  ; %bb.1: ; %bb0
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v0, 0
@@ -204,10 +203,10 @@ define amdgpu_kernel void @is_private_sgpr(ptr %ptr) {
 ;
 ; GFX10-LABEL: is_private_sgpr:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
-; GFX10-NEXT:    s_mov_b64 s[2:3], src_private_base
+; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[8:9], 0x0
+; GFX10-NEXT:    s_mov_b64 s[0:1], src_private_base
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    s_cmp_lg_u32 s1, s3
+; GFX10-NEXT:    s_cmp_lg_u32 s3, s1
 ; GFX10-NEXT:    s_cbranch_scc1 .LBB1_2
 ; GFX10-NEXT:  ; %bb.1: ; %bb0
 ; GFX10-NEXT:    v_mov_b32_e32 v0, 0
@@ -218,10 +217,10 @@ define amdgpu_kernel void @is_private_sgpr(ptr %ptr) {
 ;
 ; GFX11-LABEL: is_private_sgpr:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
-; GFX11-NEXT:    s_mov_b64 s[2:3], src_private_base
+; GFX11-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
+; GFX11-NEXT:    s_mov_b64 s[0:1], src_private_base
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    s_cmp_lg_u32 s1, s3
+; GFX11-NEXT:    s_cmp_lg_u32 s3, s1
 ; GFX11-NEXT:    s_cbranch_scc1 .LBB1_2
 ; GFX11-NEXT:  ; %bb.1: ; %bb0
 ; GFX11-NEXT:    v_mov_b32_e32 v0, 0

diff  --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.is.shared.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.is.shared.ll
index f1dcc93172fb1..63333ed165a32 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.is.shared.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.is.shared.ll
@@ -81,12 +81,12 @@ define amdgpu_kernel void @is_local_vgpr(ptr addrspace(1) %ptr.ptr) {
 ;
 ; GFX9-LABEL: is_local_vgpr:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
+; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[8:9], 0x0
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
+; GFX9-NEXT:    s_mov_b64 s[0:1], src_shared_base
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dwordx2 v[0:1], v0, s[0:1] glc
+; GFX9-NEXT:    global_load_dwordx2 v[0:1], v0, s[2:3] glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    s_mov_b64 s[0:1], src_shared_base
 ; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, s1, v1
 ; GFX9-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
 ; GFX9-NEXT:    global_store_dword v[0:1], v0, off
@@ -94,15 +94,14 @@ define amdgpu_kernel void @is_local_vgpr(ptr addrspace(1) %ptr.ptr) {
 ;
 ; GFX1250-LABEL: is_local_vgpr:
 ; GFX1250:       ; %bb.0:
-; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
 ; GFX1250-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX1250-NEXT:    s_mov_b64 s[0:1], src_shared_base
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    global_load_b64 v[0:1], v0, s[0:1] scale_offset scope:SCOPE_SYS
+; GFX1250-NEXT:    global_load_b64 v[0:1], v0, s[2:3] scale_offset scope:SCOPE_SYS
 ; GFX1250-NEXT:    s_wait_loadcnt 0x0
-; GFX1250-NEXT:    s_wait_xcnt 0x0
-; GFX1250-NEXT:    s_mov_b64 s[0:1], src_shared_base
-; GFX1250-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX1250-NEXT:    v_cmp_eq_u32_e32 vcc_lo, s1, v1
+; GFX1250-NEXT:    s_wait_xcnt 0x0
 ; GFX1250-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
 ; GFX1250-NEXT:    global_store_b32 v[0:1], v0, off
 ; GFX1250-NEXT:    s_endpgm
@@ -129,13 +128,12 @@ define amdgpu_kernel void @is_local_vgpr(ptr addrspace(1) %ptr.ptr) {
 ;
 ; GFX10-LABEL: is_local_vgpr:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
+; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[8:9], 0x0
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
+; GFX10-NEXT:    s_mov_b64 s[0:1], src_shared_base
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    global_load_dwordx2 v[0:1], v0, s[0:1] glc dlc
+; GFX10-NEXT:    global_load_dwordx2 v[0:1], v0, s[2:3] glc dlc
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX10-NEXT:    s_mov_b64 s[0:1], src_shared_base
 ; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, s1, v1
 ; GFX10-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
 ; GFX10-NEXT:    global_store_dword v[0:1], v0, off
@@ -143,14 +141,14 @@ define amdgpu_kernel void @is_local_vgpr(ptr addrspace(1) %ptr.ptr) {
 ;
 ; GFX11-LABEL: is_local_vgpr:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
 ; GFX11-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT:    s_mov_b64 s[0:1], src_shared_base
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    global_load_b64 v[0:1], v0, s[0:1] glc dlc
+; GFX11-NEXT:    global_load_b64 v[0:1], v0, s[2:3] glc dlc
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    s_mov_b64 s[0:1], src_shared_base
 ; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, s1, v1
 ; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
 ; GFX11-NEXT:    global_store_b32 v[0:1], v0, off
@@ -240,10 +238,10 @@ define amdgpu_kernel void @is_local_sgpr(ptr %ptr) {
 ;
 ; GFX9-SDAG-LABEL: is_local_sgpr:
 ; GFX9-SDAG:       ; %bb.0:
-; GFX9-SDAG-NEXT:    s_load_dword s2, s[8:9], 0x4
 ; GFX9-SDAG-NEXT:    s_mov_b64 s[0:1], src_shared_base
+; GFX9-SDAG-NEXT:    s_load_dword s0, s[8:9], 0x4
 ; GFX9-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-SDAG-NEXT:    s_cmp_eq_u32 s2, s1
+; GFX9-SDAG-NEXT:    s_cmp_eq_u32 s0, s1
 ; GFX9-SDAG-NEXT:    s_cselect_b64 s[0:1], -1, 0
 ; GFX9-SDAG-NEXT:    s_andn2_b64 vcc, exec, s[0:1]
 ; GFX9-SDAG-NEXT:    s_cbranch_vccnz .LBB1_2
@@ -256,10 +254,10 @@ define amdgpu_kernel void @is_local_sgpr(ptr %ptr) {
 ;
 ; GFX1250-SDAG-LABEL: is_local_sgpr:
 ; GFX1250-SDAG:       ; %bb.0:
-; GFX1250-SDAG-NEXT:    s_load_b32 s2, s[4:5], 0x4
 ; GFX1250-SDAG-NEXT:    s_mov_b64 s[0:1], src_shared_base
+; GFX1250-SDAG-NEXT:    s_load_b32 s0, s[4:5], 0x4
 ; GFX1250-SDAG-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-SDAG-NEXT:    s_cmp_eq_u32 s2, s1
+; GFX1250-SDAG-NEXT:    s_cmp_eq_u32 s0, s1
 ; GFX1250-SDAG-NEXT:    s_cselect_b32 s0, -1, 0
 ; GFX1250-SDAG-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX1250-SDAG-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s0
@@ -291,10 +289,10 @@ define amdgpu_kernel void @is_local_sgpr(ptr %ptr) {
 ;
 ; GFX9-GISEL-LABEL: is_local_sgpr:
 ; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
-; GFX9-GISEL-NEXT:    s_mov_b64 s[2:3], src_shared_base
+; GFX9-GISEL-NEXT:    s_load_dwordx2 s[2:3], s[8:9], 0x0
+; GFX9-GISEL-NEXT:    s_mov_b64 s[0:1], src_shared_base
 ; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT:    s_cmp_lg_u32 s1, s3
+; GFX9-GISEL-NEXT:    s_cmp_lg_u32 s3, s1
 ; GFX9-GISEL-NEXT:    s_cbranch_scc1 .LBB1_2
 ; GFX9-GISEL-NEXT:  ; %bb.1: ; %bb0
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v0, 0
@@ -305,10 +303,10 @@ define amdgpu_kernel void @is_local_sgpr(ptr %ptr) {
 ;
 ; GFX10-LABEL: is_local_sgpr:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
-; GFX10-NEXT:    s_mov_b64 s[2:3], src_shared_base
+; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[8:9], 0x0
+; GFX10-NEXT:    s_mov_b64 s[0:1], src_shared_base
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    s_cmp_lg_u32 s1, s3
+; GFX10-NEXT:    s_cmp_lg_u32 s3, s1
 ; GFX10-NEXT:    s_cbranch_scc1 .LBB1_2
 ; GFX10-NEXT:  ; %bb.1: ; %bb0
 ; GFX10-NEXT:    v_mov_b32_e32 v0, 0
@@ -319,10 +317,10 @@ define amdgpu_kernel void @is_local_sgpr(ptr %ptr) {
 ;
 ; GFX11-LABEL: is_local_sgpr:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
-; GFX11-NEXT:    s_mov_b64 s[2:3], src_shared_base
+; GFX11-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
+; GFX11-NEXT:    s_mov_b64 s[0:1], src_shared_base
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    s_cmp_lg_u32 s1, s3
+; GFX11-NEXT:    s_cmp_lg_u32 s3, s1
 ; GFX11-NEXT:    s_cbranch_scc1 .LBB1_2
 ; GFX11-NEXT:  ; %bb.1: ; %bb0
 ; GFX11-NEXT:    v_mov_b32_e32 v0, 0
@@ -333,10 +331,10 @@ define amdgpu_kernel void @is_local_sgpr(ptr %ptr) {
 ;
 ; GFX1250-GISEL-LABEL: is_local_sgpr:
 ; GFX1250-GISEL:       ; %bb.0:
-; GFX1250-GISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-GISEL-NEXT:    s_mov_b64 s[2:3], src_shared_base
+; GFX1250-GISEL-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
+; GFX1250-GISEL-NEXT:    s_mov_b64 s[0:1], src_shared_base
 ; GFX1250-GISEL-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-GISEL-NEXT:    s_cmp_lg_u32 s1, s3
+; GFX1250-GISEL-NEXT:    s_cmp_lg_u32 s3, s1
 ; GFX1250-GISEL-NEXT:    s_cbranch_scc1 .LBB1_2
 ; GFX1250-GISEL-NEXT:  ; %bb.1: ; %bb0
 ; GFX1250-GISEL-NEXT:    v_mov_b32_e32 v0, 0

diff  --git a/llvm/test/CodeGen/AMDGPU/undef-handling-crash-in-ra.ll b/llvm/test/CodeGen/AMDGPU/undef-handling-crash-in-ra.ll
index 8867e6102406b..d2008be4fd32a 100644
--- a/llvm/test/CodeGen/AMDGPU/undef-handling-crash-in-ra.ll
+++ b/llvm/test/CodeGen/AMDGPU/undef-handling-crash-in-ra.ll
@@ -10,17 +10,17 @@ define amdgpu_kernel void @foo(ptr addrspace(5) %ptr5, ptr %p0, double %v0, <4 x
 ; CHECK-NEXT:    s_addc_u32 flat_scratch_hi, s13, 0
 ; CHECK-NEXT:    v_pk_mov_b32 v[46:47], 0, 0
 ; CHECK-NEXT:    flat_load_dword v42, v[46:47]
-; CHECK-NEXT:    s_load_dwordx4 s[64:67], s[8:9], 0x8
-; CHECK-NEXT:    s_load_dword s68, s[8:9], 0x0
+; CHECK-NEXT:    s_mov_b64 s[34:35], s[8:9]
+; CHECK-NEXT:    s_load_dwordx4 s[64:67], s[34:35], 0x8
+; CHECK-NEXT:    s_load_dword s68, s[34:35], 0x0
 ; CHECK-NEXT:    s_add_u32 s0, s0, s17
 ; CHECK-NEXT:    s_addc_u32 s1, s1, 0
+; CHECK-NEXT:    s_mov_b64 s[8:9], src_private_base
 ; CHECK-NEXT:    s_mov_b64 s[48:49], s[4:5]
-; CHECK-NEXT:    s_mov_b64 s[4:5], src_private_base
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_cmp_lg_u32 s68, -1
-; CHECK-NEXT:    s_mov_b64 s[34:35], s[8:9]
 ; CHECK-NEXT:    s_mov_b32 s4, 0
-; CHECK-NEXT:    s_cselect_b32 s5, s5, 0
+; CHECK-NEXT:    s_cselect_b32 s5, s9, 0
 ; CHECK-NEXT:    s_mov_b64 s[38:39], s[6:7]
 ; CHECK-NEXT:    s_cselect_b32 s6, s68, 0
 ; CHECK-NEXT:    v_mov_b32_e32 v57, s5

diff  --git a/llvm/test/DebugInfo/AMDGPU/debug-loc-copy.ll b/llvm/test/DebugInfo/AMDGPU/debug-loc-copy.ll
index 8b54f709eec7a..1f13282a1f04c 100644
--- a/llvm/test/DebugInfo/AMDGPU/debug-loc-copy.ll
+++ b/llvm/test/DebugInfo/AMDGPU/debug-loc-copy.ll
@@ -14,17 +14,15 @@ define void @_Z12lane_pc_testj() #0 !dbg !9 {
 ; GCN-NEXT:  ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:  ; %bb.1: ; %lab
+; GCN-NEXT:    s_mov_b64 s[4:5], 0
 ; GCN-NEXT:  .Ltmp0:
 ; GCN-NEXT:    .loc 0 12 1 prologue_end ; t.cpp:12:1
-; GCN-NEXT:    s_mov_b64 s[4:5], src_private_base
-; GCN-NEXT:    s_mov_b32 s6, 32
-; GCN-NEXT:    s_lshr_b64 s[4:5], s[4:5], s6
-; GCN-NEXT:    s_mov_b64 s[6:7], 0
-; GCN-NEXT:    s_mov_b32 s5, -1
+; GCN-NEXT:    s_mov_b64 s[6:7], src_private_base
+; GCN-NEXT:    s_mov_b32 s6, -1
 ; GCN-NEXT:    s_lshr_b32 s8, s32, 5
-; GCN-NEXT:    s_cmp_lg_u32 s8, s5
-; GCN-NEXT:    s_cselect_b32 s5, s4, s7
-; GCN-NEXT:    s_cselect_b32 s4, s8, s6
+; GCN-NEXT:    s_cmp_lg_u32 s8, s6
+; GCN-NEXT:    s_cselect_b32 s5, s7, s5
+; GCN-NEXT:    s_cselect_b32 s4, s8, s4
 ; GCN-NEXT:    v_mov_b32_e32 v2, 0
 ; GCN-NEXT:    .loc 0 13 1 ; t.cpp:13:1
 ; GCN-NEXT:    v_mov_b32_e32 v0, s4
